In [None]:
# --- Imports for Task 2 ---
import pandas as pd
import talib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging
import numpy as np  # Added numpy import


In [10]:
# Configure logging.
# The root logger writes to task2_analysis.log. logging.basicConfig() does nothing
# when the root logger already has handlers, so re-running this cell does not
# stack additional file handlers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename='task2_analysis.log', filemode='a')
logger = logging.getLogger(__name__)

# Attach the console handler only once. Adding a new StreamHandler on every
# execution of this cell makes each message print once per previous run.
# FileHandler subclasses StreamHandler, so it is excluded explicitly.
console = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.StreamHandler)
        and not isinstance(handler, logging.FileHandler)
    ),
    None,
)
if console is None:
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logger.addHandler(console)

# Stock symbols
stock_symbols = ['TSLA', 'NVDA', 'META', 'AMZN', 'GOOG', 'AAPL', 'MSFT']

# Directories (resolved relative to the current working directory)
DATA_DIR = os.path.abspath('../data')
PLOTS_DIR = os.path.abspath('../Plots-task2')
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

# Set plotting style
sns.set_theme(style="whitegrid")
logger.info("Starting Task 2: Technical Analysis for %s", ', '.join(stock_symbols))



Starting Task 2: Technical Analysis for TSLA, NVDA, META, AMZN, GOOG, AAPL, MSFT
Starting Task 2: Technical Analysis for TSLA, NVDA, META, AMZN, GOOG, AAPL, MSFT


In [25]:
#Load and prepare stock data
def load_stock_data(symbol, data_dir):
    """Load, clean and cache the historical OHLCV data for one ticker.

    Reads ``<data_dir>/<symbol>_historical_data.csv``, indexes it by its date
    column in chronological order, normalises the price/volume column names to
    ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``, coerces them to
    numbers, fills gaps (forward, then backward) and writes the result to
    ``<data_dir>/<symbol>_processed.pkl`` as a gzip-compressed pickle.

    Args:
        symbol: Ticker symbol used to build the input and output file names.
        data_dir: Directory that holds the CSV and receives the pickle.

    Returns:
        The cleaned DataFrame (date index, exactly the five OHLCV columns), or
        ``None`` when the file is missing or empty, has no recognised date
        column, lacks a required column, has no usable rows after cleaning, or
        any other error occurs. Every failure is logged; nothing is raised.
    """
    csv_path = os.path.join(data_dir, f"{symbol}_historical_data.csv")
    logger.info("Loading %s from %s", symbol, csv_path)
    try:
        if not os.path.exists(csv_path):
            logger.error("File %s does not exist", csv_path)
            return None

        df = pd.read_csv(csv_path)
        if df.empty:
            logger.warning("%s is empty", csv_path)
            return None

        # Log columns and sample data for debugging
        logger.info("Columns in %s: %s", symbol, df.columns.tolist())
        logger.info("First few rows of %s:\n%s", symbol, df.head().to_string())

        # Date processing
        date_cols = ['Date', 'date', 'Datetime', 'datetime', 'Timestamp', 'timestamp']
        date_col = next((col for col in date_cols if col in df.columns), None)
        if not date_col:
            logger.error("No date column in %s", csv_path)
            return None

        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        # Gap filling and every downstream indicator depend on row order, so the
        # rows are put in chronological order here instead of trusting the file.
        # A stable sort keeps rows that share a timestamp in file order.
        df = df.dropna(subset=[date_col]).set_index(date_col).sort_index(kind='stable')

        # OHLCV column mapping. 'Adj Close' normalises to 'adj_close', which is
        # deliberately absent, so the unadjusted 'Close' is the one that is used.
        aliases = {
            'open': 'Open', 'open_price': 'Open',
            'high': 'High', 'high_price': 'High',
            'low': 'Low', 'low_price': 'Low',
            'close': 'Close',
            'volume': 'Volume',
        }
        col_map = {}
        shadowed = []
        for col in df.columns:
            target = aliases.get(col.lower().replace(' ', '_'))
            if target is None:
                continue
            if target in col_map.values():
                # A second column resolving to the same canonical name would
                # create duplicate labels (df['Open'] becomes a DataFrame), so
                # the first match wins and later ones are discarded.
                logger.warning("Ignoring column %r in %s: %s is already mapped", col, symbol, target)
                shadowed.append(col)
                continue
            col_map[col] = target

        df = df.drop(columns=shadowed).rename(columns=col_map)
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            logger.error("Missing columns in %s: %s", symbol, missing_cols)
            return None

        df = df[required_cols].copy()
        # Non-numeric cells become NaN, are filled from neighbouring rows, and
        # any row that is still incomplete is dropped.
        df = df.apply(pd.to_numeric, errors='coerce').ffill().bfill().dropna()
        if df.empty:
            logger.warning("No valid data after cleaning for %s", symbol)
            return None

        # Log Close column details (column labels are unique at this point, so
        # df['Close'] is a Series)
        logger.info("Close column dtype: %s", df['Close'].dtype)
        logger.info("Close column shape: %s", df['Close'].shape)
        logger.info("Close column sample: %s", df['Close'].head().to_list())

        # Save compressed .pkl
        pkl_path = os.path.join(data_dir, f"{symbol}_processed.pkl")
        df.to_pickle(pkl_path, compression='gzip')
        logger.info("Saved processed data to %s", pkl_path)
        return df
    except FileNotFoundError:
        # The file can still disappear between the existence check and the read.
        logger.error("%s not found", csv_path)
        return None
    except Exception as e:
        logger.error("Error loading %s: %s", symbol, e, exc_info=True)
        return None

In [26]:
# Load news data
def load_news_data(data_dir):
    """Read the analyst-ratings news CSV, normalise it and cache it as a pickle.

    The file ``raw_analyst_ratings.csv`` is read from ``data_dir``. The optional
    ``score`` and ``impact`` columns are coerced to numbers when present. The
    first recognised date column is parsed as UTC and converted to US/Eastern,
    and rows whose date cannot be parsed are dropped. The result is written to
    ``processed_news_data.pkl`` (gzip) in the same directory.

    Args:
        data_dir: Directory containing the CSV; also receives the pickle.

    Returns:
        The processed DataFrame, or ``None`` if no date column is found or any
        exception occurs while loading (the error is logged, not raised).
    """
    csv_path = os.path.join(data_dir, 'raw_analyst_ratings.csv')
    logger.info("Loading news from %s", csv_path)
    try:
        frame = pd.read_csv(csv_path)
        logger.info("Columns in news data: %s", frame.columns.tolist())

        # Optional numeric columns: coerce them, and warn if anything turned into NaN.
        for name in ('score', 'impact'):
            if name not in frame.columns:
                continue
            logger.info("Unique values in %s: %s", name, frame[name].unique()[:10])
            frame[name] = pd.to_numeric(frame[name], errors='coerce')
            if frame[name].isna().any():
                logger.warning("Non-numeric values found in %s, converted to NaN", name)

        # Pick the first candidate name that exists as a column.
        stamp_col = None
        for candidate in ('date', 'Date', 'datetime', 'Datetime', 'Timestamp', 'timestamp'):
            if candidate in frame.columns:
                stamp_col = candidate
                break
        if not stamp_col:
            logger.error("No date column found in %s", csv_path)
            return None

        # Parse as UTC first, then express the timestamps in US/Eastern.
        parsed_utc = pd.to_datetime(frame[stamp_col], utc=True, errors='coerce')
        frame[stamp_col] = parsed_utc.dt.tz_convert('US/Eastern')
        frame = frame.dropna(subset=[stamp_col])

        # Persist the processed frame next to the raw file.
        pkl_path = os.path.join(data_dir, 'processed_news_data.pkl')
        frame.to_pickle(pkl_path, compression='gzip')
        logger.info("Saved processed news data to %s", pkl_path)
        return frame
    except Exception as e:
        logger.error("Error loading news data: %s", e, exc_info=True)
        return None

# Load stock data: keep only the symbols whose file loaded and cleaned successfully.
all_stocks_data = {}
for symbol in stock_symbols:
    df = load_stock_data(symbol, DATA_DIR)
    if df is not None:
        all_stocks_data[symbol] = df

if not all_stocks_data:
    logger.error("No stock data loaded. Exiting.")
    # Raise instead of calling exit(1): inside a notebook `exit` goes through
    # IPython's exit hook and ends the session, whereas an exception simply
    # stops this cell (and a "Run All") while keeping the kernel state available
    # for inspection.
    raise RuntimeError(f"No stock data loaded from {DATA_DIR}")

logger.info("Loaded data for %d stocks: %s", len(all_stocks_data), list(all_stocks_data.keys()))

Loading TSLA from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_historical_data.csv
Loading TSLA from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\TSLA_historical_data.csv
Columns in TSLA: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Dividends', 'Stock Splits']
Columns in TSLA: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Dividends', 'Stock Splits']
First few rows of TSLA:
         Date      Open      High       Low     Close  Adj Close     Volume  Dividends  Stock Splits
0  2010-06-29  1.266667  1.666667  1.169333  1.592667   1.592667  281494500        0.0           0.0
1  2010-06-30  1.719333  2.028000  1.553333  1.588667   1.588667  257806500        0.0           0.0
2  2010-07-01  1.666667  1.728000  1.351333  1.464000   1.464000  123282000        0.0           0.0
3  2010-07-02  1.533333  1.540000  1.247333  1.280000   1.280000   77097000        0.0           0.0
4  2010-07-06  

In [27]:
# Load news data
news_df = load_news_data(DATA_DIR)


def _plot_sma(symbol, df):
    """Save the Close vs. 20-period SMA chart for ``symbol`` into PLOTS_DIR."""
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.plot(df.index, df['Close'], label='Close', color='blue')
        ax.plot(df.index, df['SMA_20'], label='SMA 20', color='orange', ls='--')
        ax.set_title(f'{symbol} Price & SMA')
        ax.set_xlabel('Date')
        ax.set_ylabel('Price')
        ax.legend()
        fig.tight_layout()
        fig.savefig(os.path.join(PLOTS_DIR, f'{symbol}_sma.png'))
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)


def _plot_rsi(symbol, df):
    """Save the 14-period RSI chart (with 30/70 guide lines) for ``symbol``."""
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        ax.plot(df.index, df['RSI_14'], label='RSI 14', color='purple')
        ax.axhline(70, color='red', ls='--', alpha=0.5)
        ax.axhline(30, color='green', ls='--', alpha=0.5)
        ax.set_title(f'{symbol} RSI')
        ax.set_xlabel('Date')
        ax.set_ylabel('RSI')
        ax.set_ylim(0, 100)
        ax.legend()
        fig.tight_layout()
        fig.savefig(os.path.join(PLOTS_DIR, f'{symbol}_rsi.png'))
    finally:
        plt.close(fig)


def _plot_macd(symbol, df):
    """Save the two-panel MACD chart (lines on top, histogram below) for ``symbol``."""
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
    try:
        ax1.plot(df.index, df['MACD'], label='MACD', color='blue')
        ax1.plot(df.index, df['MACD_Signal'], label='Signal', color='red', ls='--')
        ax1.legend()
        # NaN compares False against 0, so warm-up rows get 'red' (as before);
        # their bars have no height and are not visible.
        bar_colors = np.where(df['MACD_Hist'] >= 0, 'green', 'red').tolist()
        ax2.bar(df.index, df['MACD_Hist'], color=bar_colors, alpha=0.7)
        ax2.axhline(0, color='grey', ls='--')
        ax2.set_xlabel('Date')
        fig.suptitle(f'{symbol} MACD')
        fig.tight_layout()
        fig.savefig(os.path.join(PLOTS_DIR, f'{symbol}_macd.png'))
    finally:
        plt.close(fig)


# Calculate indicators and plot
for symbol, df in all_stocks_data.items():
    logger.info("Processing %s", symbol)
    try:
        close_series = df['Close']
        if not isinstance(close_series, pd.Series):
            # Duplicate 'Close' labels would make this a DataFrame; TA-Lib needs 1-D input.
            logger.error("Close column is not a Series for %s", symbol)
            continue

        # Keep the index of the rows that survive dropna() so the results can be
        # aligned back onto df even if some Close values were missing.
        close_series = close_series.dropna()
        close_prices = close_series.to_numpy(dtype=np.float64)

        # Log array details for debugging
        logger.info("close_prices shape: %s", close_prices.shape)
        logger.info("close_prices dtype: %s", close_prices.dtype)
        logger.info("close_prices sample: %s", close_prices[:5].tolist())

        # Check if data is sufficient and valid
        if len(close_prices) < 20:  # Minimum for SMA_20
            logger.warning("Insufficient data for %s: only %d data points", symbol, len(close_prices))
            continue
        if not np.all(np.isfinite(close_prices)):
            logger.warning("Invalid data (NaN or inf) in Close prices for %s", symbol)
            continue

        # Calculate indicators. Each raw TA-Lib array is wrapped in a Series
        # indexed like the cleaned closes, so assignment aligns by date instead
        # of failing on a length mismatch when rows were dropped above.
        macd, signal, hist = talib.MACD(close_prices, fastperiod=12, slowperiod=26, signalperiod=9)
        df['SMA_20'] = pd.Series(talib.SMA(close_prices, timeperiod=20), index=close_series.index)
        df['RSI_14'] = pd.Series(talib.RSI(close_prices, timeperiod=14), index=close_series.index)
        df['MACD'] = pd.Series(macd, index=close_series.index)
        df['MACD_Signal'] = pd.Series(signal, index=close_series.index)
        df['MACD_Hist'] = pd.Series(hist, index=close_series.index)

        # Save indicators
        pkl_path = os.path.join(DATA_DIR, f"{symbol}_indicators.pkl")
        df.to_pickle(pkl_path, compression='gzip')
        logger.info("Saved indicators to %s", pkl_path)

        # Plots: RSI and MACD are skipped when the indicator is entirely NaN.
        _plot_sma(symbol, df)
        if not df['RSI_14'].isnull().all():
            _plot_rsi(symbol, df)
        if not df['MACD'].isnull().all():
            _plot_macd(symbol, df)
    except Exception as e:
        # One failing symbol must not stop the remaining ones.
        logger.error("Error processing %s: %s", symbol, e, exc_info=True)
        continue

Loading news from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\raw_analyst_ratings.csv
Loading news from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\raw_analyst_ratings.csv
Columns in news data: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Columns in news data: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Saved processed news data to d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\processed_news_data.pkl
Saved processed news data to d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\processed_news_data.pkl
Processing TSLA
Processing TSLA
close_prices shape: (3545,)
close_prices shape: (3545,)
close_prices dtype: float64
close_prices dtype: float64
close_prices sample: [1.5926669836044312, 1.5886670351028442, 1.4639999866485596, 1.2799999713897705, 1.0740000009536743]
close_prices sample: [1.5926669836044312, 1.5886670351028442, 1.46399998

In [None]:
# Task 3 preparation: add a daily-return column to every loaded stock frame.
# Runs only when the news data loaded and at least one stock is available.
if news_df is not None and all_stocks_data:
    logger.info("Preparing for Task 3 correlation analysis")
    for symbol, df in all_stocks_data.items():
        # Row-over-row percentage change of Close; the first row has no predecessor.
        df['Daily_Return'] = df['Close'].pct_change()
        logger.info("Computed daily returns for %s. Awaiting sentiment analysis.", symbol)

logger.info("Task 2 completed. Plots in %s", PLOTS_DIR)

Loading news from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\raw_analyst_ratings.csv
Loading news from d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\raw_analyst_ratings.csv
Saved processed news data to d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\processed_news_data.pkl
Saved processed news data to d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\data\processed_news_data.pkl
Processing TSLA
Processing TSLA
Error processing TSLA: input array has wrong dimensions
Traceback (most recent call last):
  File "C:\Users\Yoga i7\AppData\Local\Temp\ipykernel_22596\1591067256.py", line 19, in <module>
    df['SMA_20'] = talib.SMA(close_prices, timeperiod=20)
                   ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\venv\Lib\site-packages\talib\__init__.py", line 80, in wrapper
    result = func(*_args, **_kwds

In [7]:
# Task 3 preparation: add a daily-return column to every loaded stock frame.
# Runs only when the news data loaded and at least one stock is available.
if news_df is not None and all_stocks_data:
    logger.info("Preparing for Task 3 correlation analysis")
    for symbol, df in all_stocks_data.items():
        # Row-over-row percentage change of Close; the first row has no predecessor.
        df['Daily_Return'] = df['Close'].pct_change()
        logger.info("Computed daily returns for %s. Awaiting sentiment analysis.", symbol)

logger.info("Task 2 completed. Plots in %s", PLOTS_DIR)

Task 2 completed. Plots in d:\Documents\Projects\10 Academy\Stock Market\financial-news-analysis\Plots-task2
