In [16]:
# notebooks/02_Quantitative_Analysis.ipynb
import pandas as pd
import sys
import os
from datetime import timedelta # Added for plot_start_date calculation

# --- Add src to Python path ---
# This assumes notebooks are in 'project_root/notebooks/' and src is in 'project_root/src/'
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

import data_processing
import financial_analysis
import visualization_tools # This module will provide plot_stock_with_indicators

# --- Configuration ---
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid", palette="pastel") # Using a seaborn theme
plt.rcParams['figure.figsize'] = (14, 7) # Default figure size for this notebook
plt.rcParams['font.size'] = 12            # Default font size

# !!! IMPORTANT: VERIFY THIS PATH AND FILENAME FOR YOUR NEWS DATA !!!
# This is used to get context (tickers, date range) for loading stock data.
# Replace 'raw_analyst_ratings.csv' with your actual news data filename if different.
NEWS_DATA_FILE = '../data/raw_analyst_ratings.csv' # <<< YOUR ACTUAL NEWS FILE PATH
# Example if your news file is in a different location or named differently:
# NEWS_DATA_FILE = '../data/raw/my_actual_news_data.csv'

# !!! IMPORTANT: VERIFY COLUMN NAMES IN YOUR NEWS DATA CSV if used for context !!!
# These should match the columns in your NEWS_DATA_FILE.
DATE_COL_NEWS_CONTEXT = 'date'    # Column with publication datetime in news data (VERIFY)
STOCK_COL_NEWS_CONTEXT = 'stock'  # Column with stock ticker symbols in news data (VERIFY)

# --- Stock Data CSV Configuration ---
# !!! IMPORTANT: VERIFY THESE PATHS AND SETTINGS FOR YOUR STOCK PRICE CSVs !!!
STOCK_CSV_DIRECTORY = '../data/raw/stock_historical_data/' # Directory containing individual stock CSVs
FILENAME_TEMPLATE = "{}_historical_data.csv"  # e.g., AAPL_historical_data.csv (TICKER part will be uppercased)
DATE_COL_IN_STOCK_CSV = 'Date'                 # Column name for date in your stock CSVs (VERIFY THIS)
PRICE_COL_FOR_TA = 'Close'                     # Price column to use for TA Lib (e.g., 'Close', 'Adj Close') (VERIFY THIS)
# Ensure these columns exist in your stock CSVs, or adjust the list.
# The 'load_stock_prices_from_csvs' function has some logic to use 'Adj Close' if 'Close' is missing.
REQUIRED_OHLCV_COLS = ['Open', 'High', 'Low', 'Close', 'Volume'] # (VERIFY THESE)

print(f"Using news data file for context: {NEWS_DATA_FILE}")
print(f"Loading stock CSVs from: {STOCK_CSV_DIRECTORY} using template: {FILENAME_TEMPLATE}")
print(f"Expecting date column in stock CSVs: '{DATE_COL_IN_STOCK_CSV}'")
print(f"Using price column for TA: '{PRICE_COL_FOR_TA}'")

# --- 1. Load News Data to Get Tickers and Date Range for context ---
print("\n--- Loading News Data for Context ---")
news_df_raw = data_processing.load_financial_news_data(NEWS_DATA_FILE)

# Proceed only if news data is loaded successfully
if not news_df_raw.empty:
    # Ensure the date column for context exists before trying to extract features
    if DATE_COL_NEWS_CONTEXT not in news_df_raw.columns:
        print(f"Error: Date column '{DATE_COL_NEWS_CONTEXT}' not found in news data. Cannot establish context.")
        # Handle error: maybe use default tickers/dates or stop
        news_df_processed_context = pd.DataFrame() # Empty df to signal failure
    else:
        news_df_processed_context = data_processing.extract_date_features(news_df_raw, date_col=DATE_COL_NEWS_CONTEXT)

    if news_df_processed_context.empty or STOCK_COL_NEWS_CONTEXT not in news_df_processed_context.columns or news_df_processed_context[STOCK_COL_NEWS_CONTEXT].isnull().all():
        print(f"Warning: '{STOCK_COL_NEWS_CONTEXT}' column missing or empty in processed news data, or news data processing failed. Using default tickers for demo.")
        valid_tickers_for_stock_loading = ['AAPL', 'MSFT', 'GOOG'] # Fallback for demo
        start_date_news_context = '2023-01-01' # Arbitrary context start
        end_date_news_context = '2023-12-31'   # Arbitrary context end
    else:
        # Extract unique tickers, convert to string, uppercase, and filter for typical formats
        unique_tickers_from_news = news_df_processed_context[STOCK_COL_NEWS_CONTEXT].dropna().astype(str).str.upper().unique().tolist()
        # Basic filter: alphanumeric, length 1-5 (common for US equities)
        valid_tickers_for_stock_loading = [t for t in unique_tickers_from_news if t.isalnum() and 0 < len(t) <= 5]
        if not valid_tickers_for_stock_loading:
            print("No valid tickers extracted from news after filtering. Using demo tickers.")
            valid_tickers_for_stock_loading = ['AAPL', 'MSFT', 'GOOG'] # Fallback

        # Dates from news for plotting/display context
        if 'publication_date_only' in news_df_processed_context.columns and not news_df_processed_context['publication_date_only'].isnull().all():
            start_date_news_context = news_df_processed_context['publication_date_only'].min().strftime('%Y-%m-%d')
            end_date_news_context = news_df_processed_context['publication_date_only'].max().strftime('%Y-%m-%d')
        else:
            print("Warning: 'publication_date_only' not found or empty in processed news. Using default date context.")
            start_date_news_context = '2023-01-01'
            end_date_news_context = '2023-12-31'

    print(f"News data context (for display/plotting) spans from {start_date_news_context} to {end_date_news_context}")
    print(f"Attempting to load stock data for {len(valid_tickers_for_stock_loading)} unique valid tickers derived from news (showing first 5): {valid_tickers_for_stock_loading[:5]}...")

    # --- 2. Load Stock Price Data from CSVs ---
    print("\n--- Loading Stock Price Data from CSVs ---")
    # The 'tickers' argument here uses the list derived from the news context.
    all_stock_data = financial_analysis.load_stock_prices_from_csvs(
        tickers=valid_tickers_for_stock_loading, # Use the filtered list from news
        csv_directory=STOCK_CSV_DIRECTORY,
        filename_template=FILENAME_TEMPLATE,
        date_col=DATE_COL_IN_STOCK_CSV,
        required_ohlcv_cols=REQUIRED_OHLCV_COLS
    )

    if not all_stock_data: # Checks if the dictionary is empty
        print("No stock data was loaded from CSVs. Cannot proceed with TA calculation.")
        # You might want to exit or handle this case if no stock data means the rest of the notebook is pointless.
    else:
        print(f"Successfully loaded data for {len(all_stock_data)} tickers from CSVs.")
        # Example: print names of loaded tickers
        print(f"Loaded tickers: {list(all_stock_data.keys())[:10]}") # Show first 10 loaded

        # --- 3. Calculate Technical Indicators ---
        print("\n--- Calculating Technical Indicators ---")
        stock_data_with_indicators = {}
        for ticker_symbol, df_stock_loaded in all_stock_data.items(): # ticker_symbol is already uppercase from load_stock_prices
            if PRICE_COL_FOR_TA not in df_stock_loaded.columns:
                print(f"Price column '{PRICE_COL_FOR_TA}' not found for TA in {ticker_symbol}'s data. Skipping TA for this stock.")
                stock_data_with_indicators[ticker_symbol] = df_stock_loaded # Store original data without TAs
                continue
            
            print(f"Calculating TAs for {ticker_symbol} using price column '{PRICE_COL_FOR_TA}'...")
            stock_data_with_indicators[ticker_symbol] = financial_analysis.calculate_technical_indicators(
                df_stock_loaded, price_col=PRICE_COL_FOR_TA
            )
            # print(f"Finished TAs for {ticker_symbol}.") # Optional: more verbose logging

            # Display sample of calculated indicators (Optional, can be verbose)
            # print(f"Sample indicators for {ticker_symbol} (NaNs possible if data too short or at ends of series):")
            # df_display_indicators = stock_data_with_indicators[ticker_symbol]
            # cols_to_print_indicators = [PRICE_COL_FOR_TA, 'SMA_20', 'SMA_50', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist']
            # actual_cols_to_print_indicators = [col for col in cols_to_print_indicators if col in df_display_indicators.columns]
            # if not df_display_indicators.empty and actual_cols_to_print_indicators:
            #     df_context_period_for_display = df_display_indicators[
            #         (df_display_indicators.index >= pd.to_datetime(start_date_news_context)) &
            #         (df_display_indicators.index <= pd.to_datetime(end_date_news_context))
            #     ]
            #     if not df_context_period_for_display.empty:
            #         print(df_context_period_for_display[actual_cols_to_print_indicators].dropna(how='all', subset=actual_cols_to_print_indicators[1:]).tail(3))
            #     else: # If no data in news context, show last few from overall data
            #         print(f"No data for {ticker_symbol} within news context for display, showing last 3 overall:")
            #         print(df_display_indicators[actual_cols_to_print_indicators].dropna(how='all', subset=actual_cols_to_print_indicators[1:]).tail(3))

        # --- 4. Visualize Data ---
        print("\n--- Visualizing Stock Data with Indicators ---")
        if stock_data_with_indicators:
            tickers_plotted_count = 0
            max_tickers_to_plot = 3 # Control how many stock charts are generated

            for ticker_to_plot, df_plot_data_with_tas in stock_data_with_indicators.items():
                if tickers_plotted_count >= max_tickers_to_plot:
                    print(f"Reached max number of plots ({max_tickers_to_plot}). Stopping visualization here.")
                    break

                if PRICE_COL_FOR_TA not in df_plot_data_with_tas.columns:
                    print(f"Skipping plot for {ticker_to_plot}: Price column '{PRICE_COL_FOR_TA}' not found in its data.")
                    continue
                
                # Ensure there's enough data to plot meaningfully after TA calculations (which introduce NaNs)
                if df_plot_data_with_tas[PRICE_COL_FOR_TA].dropna().empty:
                    print(f"No valid price data to plot for {ticker_to_plot} after TA NaNs. Skipping plot.")
                    continue

                print(f"Preparing plot for {ticker_to_plot}...")
                # Define a relevant plotting window: news context +/- some buffer for TAs to show clearly
                # Using a fixed lookback for TA context rather than start_date_news_context for plotting TAs
                plot_window_end_dt = pd.to_datetime(end_date_news_context) + timedelta(days=10) # Show a bit after news period
                plot_window_start_dt = plot_window_end_dt - timedelta(days=365 + 90) # Approx 1 year + 3 months TA buffer (for TAs to be visible)

                df_plot_filtered = df_plot_data_with_tas[
                    (df_plot_data_with_tas.index >= plot_window_start_dt) &
                    (df_plot_data_with_tas.index <= plot_window_end_dt)
                ]

                if not df_plot_filtered.empty:
                    visualization_tools.plot_stock_with_indicators(
                        df_plot_filtered, ticker_to_plot, price_col=PRICE_COL_FOR_TA
                    )
                    tickers_plotted_count += 1
                else:
                    print(f"No data in the defined plotting window ({plot_window_start_dt.date()} to {plot_window_end_dt.date()}) for {ticker_to_plot}. Skipping plot.")
        else:
            print("No data with indicators available to visualize (all_stock_data might have been empty or TAs failed for all).")
else:
    print("News data for context could not be loaded. Quantitative analysis on stock data cannot effectively proceed without ticker context.")

print("\n--- Quantitative Analysis (Task 2) complete. ---")

Using news data file for context: ../data/raw_analyst_ratings.csv
Loading stock CSVs from: ../data/raw/stock_historical_data/ using template: {}_historical_data.csv
Expecting date column in stock CSVs: 'Date'
Using price column for TA: 'Close'

--- Loading News Data for Context ---
News data context (for display/plotting) spans from 2011-04-27 to 2020-06-11
Attempting to load stock data for 6187 unique valid tickers derived from news (showing first 5): ['A', 'AA', 'AAC', 'AADR', 'AAL']...

--- Loading Stock Price Data from CSVs ---
Successfully loaded data for 5 tickers from CSVs.
Loaded tickers: ['AAPL', 'AMZN', 'GOOG', 'NVDA', 'TSLA']

--- Calculating Technical Indicators ---
Calculating TAs for AAPL using price column 'Close'...
Calculating TAs for AMZN using price column 'Close'...
Calculating TAs for GOOG using price column 'Close'...
Calculating TAs for NVDA using price column 'Close'...
Calculating TAs for TSLA using price column 'Close'...

--- Visualizing Stock Data with Indic

KeyError: ['MACD', 'MACD_signal']