In [1]:
# notebooks/03_Correlation_Analysis.ipynb

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob # For sentiment analysis

# --- Project Setup: Ensure src module is discoverable ---
# This block is crucial for importing from src.config
def find_project_root(current_path):
    """
    Finds the project root by looking for common project directories.
    Assumes project_root contains 'src', 'data', and 'notebooks'.
    """
    path = current_path
    while path != os.path.dirname(path):
        if (os.path.isdir(os.path.join(path, 'src')) and
            os.path.isdir(os.path.join(path, 'data')) and
            os.path.isdir(os.path.join(path, 'notebooks'))):
            return path
        path = os.path.dirname(path)
    return current_path # Fallback if no specific root found

current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")
else:
    print(f"'{project_root}' already in sys.path.")

# Import configuration variables from your src.config
from src.config import NEWS_RAW_PATH, STOCK_DATA_DIR, STOCK_TICKERS

print("\n--- Starting Task 3: Correlation Analysis ---")

# --- Load News Data ---
print("\n--- Loading News Data ---")
try:
    news_df = pd.read_csv(NEWS_RAW_PATH, parse_dates=['date']) # Assuming 'date' column
    print("News data loaded successfully.")
    print("News DataFrame Info:")
    news_df.info()
    print("\nFirst 5 rows of News data:")
    print(news_df.head())
except FileNotFoundError:
    print(f"CRITICAL ERROR: News data file not found at {NEWS_RAW_PATH}.")
    sys.exit("Exiting: News data file not found.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load news data: {e}")
    sys.exit("Exiting: News data loading failed.")

if news_df.empty:
    sys.exit("Exiting: News DataFrame is empty after loading.")


# --- Load Stock Data (for a single ticker to start, e.g., AMZN) ---
# You can extend this to loop through all tickers later if needed for broader analysis.
print("\n--- Loading Stock Data (e.g., AMZN) ---")
ticker_to_analyze = 'AMZN' # Choose a specific ticker for correlation analysis
stock_file_name = f"{ticker_to_analyze}_historical_data.csv"
stock_file_path = os.path.join(STOCK_DATA_DIR, stock_file_name)

try:
    stock_df = pd.read_csv(stock_file_path, parse_dates=True, index_col='Date')
    stock_df.columns = [col.replace(' ', '_') for col in stock_df.columns]
    if 'Adj_Close' in stock_df.columns and 'Close' not in stock_df.columns:
        stock_df['Close'] = stock_df['Adj_Close']
    stock_df.dropna(inplace=True)
    print(f"Stock data for {ticker_to_analyze} loaded successfully.")
    print("Stock DataFrame Info:")
    stock_df.info()
    print("\nFirst 5 rows of Stock data:")
    print(stock_df.head())
except FileNotFoundError:
    print(f"CRITICAL ERROR: Stock data file not found at {stock_file_path}.")
    sys.exit("Exiting: Stock data file not found.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load stock data: {e}")
    sys.exit("Exiting: Stock data loading failed.")

if stock_df.empty:
    sys.exit("Exiting: Stock DataFrame is empty after loading.")

Added 'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction' to sys.path for module imports.
Project structure setup complete and config.py created/updated.
Base Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction
News Raw Path: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction\data\raw_analyst_ratings.csv
Stock Data Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction\data\yfinance_data
Tickers to process: ['AAPL', 'AMZN', 'GOOG', 'META', 'NVDA', 'TSLA']

--- Starting Task 3: Correlation Analysis ---

--- Loading News Data ---
News data loaded successfully.
News DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object


In [None]:
# --- Date Alignment ---
print("\n--- Aligning News and Stock Data by Date ---")

# 1. Process News DataFrame 'date' column
# Step 1.1: Convert to datetime, coercing errors, and handling mixed formats
# This is the critical step where parsing happens.
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce', format='mixed')

# Step 1.2: IMMEDIATELY drop rows where date conversion failed (became NaT)
# This ensures that all remaining entries in 'date' are valid datetime objects.
initial_rows = len(news_df)
news_df.dropna(subset=['date'], inplace=True)
if len(news_df) < initial_rows:
    print(f"Dropped {initial_rows - len(news_df)} rows from news_df due to unparseable dates.")

# Step 1.3: Handle timezones to ensure consistency (e.g., all UTC then naive)
# Check if the 'date' column is timezone-aware
if news_df['date'].dt.tz is not None:
    # If it's already timezone-aware, convert it to UTC
    news_df['date'] = news_df['date'].dt.tz_convert('UTC')
    print("News dates were timezone-aware, converted to UTC.")
else:
    # If it's timezone-naive, localize it to UTC.
    # We assume naive timestamps in your raw data are implicitly UTC or local times
    # that you want to treat as UTC for global alignment.
    news_df['date'] = news_df['date'].dt.tz_localize('UTC', errors='coerce')
    print("News dates were timezone-naive, localized to UTC.")
    # Drop NaNs again in case localization failed for some entries (less common but safe)
    news_df.dropna(subset=['date'], inplace=True)

# Step 1.4: Convert all dates to timezone-naive (strip timezone) for daily alignment
# This is safe now because we've ensured all are consistently UTC-aware before stripping.
news_df['date'] = news_df['date'].dt.tz_localize(None)
print("News dates converted to timezone-naive UTC representation for daily alignment.")

# Step 1.5: Extract only the date part (YYYY-MM-DD) for daily alignment
news_df['date_only'] = news_df['date'].dt.floor('D') # Floors to the start of the day

# Step 1.6: Set 'date_only' as the index for news_df for merging
news_df.set_index('date_only', inplace=True)
news_df.sort_index(inplace=True)

# 2. Process Stock DataFrame Index
# Ensure stock_df index is also datetime (already done by parse_dates=True, index_col='Date')
# Convert stock_df index to date-only for daily alignment
# Assuming stock data is already in a consistent timezone or timezone-naive and represents market close of that day
stock_df.index = stock_df.index.floor('D') # Floors to the start of the day
stock_df.sort_index(inplace=True)

# 3. Aggregate news headlines by date (if multiple on same day) BEFORE merging
# This step is crucial because we want one sentiment score per day.
# We'll concatenate headlines for now; sentiment analysis will be applied to this concatenated string.
if 'headline' not in news_df.columns:
    print("CRITICAL ERROR: 'headline' column not found in news_df. Please check your news data.")
    sys.exit("Exiting: Missing 'headline' column.")

daily_news_headlines = news_df.groupby(news_df.index)['headline'].apply(lambda x: ' '.join(x)).rename('combined_headline')

print("\nCombined Daily News Headlines (first 5 entries):")
print(daily_news_headlines.head())


# 4. Merge aggregated daily news headlines with daily stock data
# Use an inner merge to keep only dates common to both datasets
merged_df = pd.merge(
    stock_df,
    daily_news_headlines,
    left_index=True,
    right_index=True,
    how='inner'
)

print(f"\nMerged DataFrame shape after date alignment and headline aggregation: {merged_df.shape}")
print("\nFirst 5 rows of Merged data:")
print(merged_df.head())

if merged_df.empty:
    sys.exit("Exiting: Merged DataFrame is empty. No common dates found between news and stock data after alignment.")


--- Aligning News and Stock Data by Date ---


  news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce', format='mixed')


AttributeError: Can only use .dt accessor with datetimelike values