In [1]:
# notebooks/03_Correlation_Analysis.ipynb

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob # For sentiment analysis

# --- Project Setup: Ensure src module is discoverable ---
# This block is crucial for importing from src.config
def find_project_root(current_path):
    """
    Locate the project root by walking upward from ``current_path``.

    A directory counts as the project root when it contains all three
    sub-directories 'src', 'data', and 'notebooks'.

    Args:
        current_path: Directory to start from. May be absolute or relative;
            a relative path is resolved against the current working
            directory before the upward walk starts.

    Returns:
        The absolute path of the first matching ancestor (the start
        directory itself is checked first). If no ancestor matches,
        ``current_path`` is returned exactly as it was passed in.
    """
    required_dirs = ('src', 'data', 'notebooks')

    # Resolve first: os.path.dirname() on a relative path such as '.' yields
    # '' after one step, which would end the walk before any real ancestor
    # had been inspected.
    path = os.path.abspath(current_path)
    while True:
        if all(os.path.isdir(os.path.join(path, name)) for name in required_dirs):
            return path
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() no longer changes the path: the filesystem root has
            # just been checked, so there is nothing left to visit.
            break
        path = parent
    return current_path  # Fallback if no specific root found

# Start the root search from the directory the kernel is running in.
current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

# Register the root on sys.path only once so re-running the cell does not
# keep appending duplicates.
if project_root in sys.path:
    print(f"'{project_root}' already in sys.path.")
else:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")

# Import configuration variables from your src.config
from src.config import NEWS_RAW_PATH, STOCK_DATA_DIR, STOCK_TICKERS

print("\n--- Starting Task 3: Correlation Analysis ---")

# --- Load News Data ---
# Every failure path below ends the run through sys.exit, so nothing after
# this section executes without a loaded, non-empty news_df.
print("\n--- Loading News Data ---")
try:
    # Ask pandas to parse 'date' as datetimes (assumes the CSV has that column).
    news_df = pd.read_csv(NEWS_RAW_PATH, parse_dates=['date'])
    print("News data loaded successfully.", "News DataFrame Info:", sep="\n")
    news_df.info()
    print("\nFirst 5 rows of News data:")
    print(news_df.head(5))
except FileNotFoundError:
    # Missing file gets its own message that includes the configured path.
    print(f"CRITICAL ERROR: News data file not found at {NEWS_RAW_PATH}.")
    sys.exit("Exiting: News data file not found.")
except Exception as load_error:
    # Any other failure raised while reading or previewing the frame.
    print(f"CRITICAL ERROR: Could not load news data: {load_error}")
    sys.exit("Exiting: News data loading failed.")

# A file that parses but yields no data is treated as fatal as well.
if news_df.empty:
    sys.exit("Exiting: News DataFrame is empty after loading.")


# --- Load Stock Data (for a single ticker to start, e.g., AMZN) ---
# Only one ticker is analysed here; looping over every ticker is a possible
# later extension.
print("\n--- Loading Stock Data (e.g., AMZN) ---")
ticker_to_analyze = 'AMZN'  # Ticker whose price history is correlated with the news
stock_file_name = f"{ticker_to_analyze}_historical_data.csv"
stock_file_path = os.path.join(STOCK_DATA_DIR, stock_file_name)

try:
    # Read with 'Date' as a parsed index, then swap spaces in column names
    # for underscores (e.g. 'Adj Close' -> 'Adj_Close').
    stock_df = pd.read_csv(
        stock_file_path,
        parse_dates=True,
        index_col='Date',
    ).rename(columns=lambda col: col.replace(' ', '_'))

    # When only the adjusted close exists, expose it under 'Close' too.
    column_names = set(stock_df.columns)
    if 'Adj_Close' in column_names and 'Close' not in column_names:
        stock_df = stock_df.assign(Close=stock_df['Adj_Close'])

    # Discard every row that has a missing value in any column.
    stock_df = stock_df.dropna()

    print(f"Stock data for {ticker_to_analyze} loaded successfully.")
    print("Stock DataFrame Info:")
    stock_df.info()
    print("\nFirst 5 rows of Stock data:")
    print(stock_df.head(5))
except FileNotFoundError:
    # Missing file gets its own message that includes the resolved path.
    print(f"CRITICAL ERROR: Stock data file not found at {stock_file_path}.")
    sys.exit("Exiting: Stock data file not found.")
except Exception as load_error:
    # Any other failure raised while reading, cleaning or previewing the frame.
    print(f"CRITICAL ERROR: Could not load stock data: {load_error}")
    sys.exit("Exiting: Stock data loading failed.")

# Nothing left after cleaning is treated as fatal as well.
if stock_df.empty:
    sys.exit("Exiting: Stock DataFrame is empty after loading.")

Added 'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction' to sys.path for module imports.
Project structure setup complete and config.py created/updated.
Base Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction
News Raw Path: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction\data\raw_analyst_ratings.csv
Stock Data Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction\data\yfinance_data
Tickers to process: ['AAPL', 'AMZN', 'GOOG', 'META', 'NVDA', 'TSLA']

--- Starting Task 3: Correlation Analysis ---

--- Loading News Data ---
News data loaded successfully.
News DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
