# Import Independencies

In [2]:
# notebooks/03_Correlation_Analysis.ipynb

import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob # For sentiment analysis

*Project set up importing reusable modules*

In [3]:
# --- Project Setup: Ensure src module is discoverable ---
# importing from src.config
def find_project_root(current_path):
    """
    Finds the project root by looking for common project directories.
    Assumes project_root contains 'src', 'data', and 'notebooks'.
    """
    path = current_path
    while path != os.path.dirname(path):
        if (os.path.isdir(os.path.join(path, 'src')) and
            os.path.isdir(os.path.join(path, 'data')) and
            os.path.isdir(os.path.join(path, 'notebooks'))):
            return path
        path = os.path.dirname(path)
    return current_path # Fallback if no specific root found

current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")
else:
    print(f"'{project_root}' already in sys.path.")

# Import configuration variables from your src.config
from src.config import NEWS_RAW_PATH, STOCK_DATA_DIR, STOCK_TICKERS


Added 'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction' to sys.path for module imports.
Project structure setup complete and config.py created/updated.
Base Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction
News Raw Path: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction\data\raw_analyst_ratings.csv
Stock Data Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-1\NewsSentiment-StockPrice-Prediction\data\yfinance_data
Tickers to process: ['AAPL', 'AMZN', 'GOOG', 'META', 'NVDA', 'TSLA']


In [4]:

print("\n--- Starting Task 3: Correlation Analysis ---")

# --- Load News Data ---
print("\n--- Loading News Data ---")
try:
    # Ensure this is the only place news_df is loaded/modified for now
    news_df = pd.read_csv(NEWS_RAW_PATH) # Do NOT parse_dates here, handle it in next step
    print("News data loaded successfully.")
    print(f"News DataFrame Info (initial load): {news_df.info()}")
    print("\nFirst 5 rows of News data (initial load):")
    print(news_df.head())
except FileNotFoundError:
    print(f"CRITICAL ERROR: News data file not found at {NEWS_RAW_PATH}.")
    sys.exit("Exiting: News data file not found.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load news data: {e}")
    sys.exit("Exiting: News data loading failed.")

if news_df.empty:
    sys.exit("Exiting: News DataFrame is empty after loading.")


# --- Load Stock Data (for a single ticker to start, e.g., AMZN) ---
print("\n--- Loading Stock Data (e.g., AMZN) ---")
ticker_to_analyze = 'AMZN' # Choose a specific ticker for correlation analysis
stock_file_name = f"{ticker_to_analyze}_historical_data.csv"
stock_file_path = os.path.join(STOCK_DATA_DIR, stock_file_name)

try:
    stock_df = pd.read_csv(stock_file_path, parse_dates=True, index_col='Date')
    stock_df.columns = [col.replace(' ', '_') for col in stock_df.columns]
    if 'Adj_Close' in stock_df.columns and 'Close' not in stock_df.columns:
        stock_df['Close'] = stock_df['Adj_Close']
    stock_df.dropna(inplace=True)
    print(f"Stock data for {ticker_to_analyze} loaded successfully.")
    print("Stock DataFrame Info:")
    stock_df.info()
    print("\nFirst 5 rows of Stock data:")
    print(stock_df.head())
except FileNotFoundError:
    print(f"CRITICAL ERROR: Stock data file not found at {stock_file_path}.")
    sys.exit("Exiting: Stock data file not found.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load stock data: {e}")
    sys.exit("Exiting: Stock data loading failed.")

if stock_df.empty:
    sys.exit("Exiting: Stock DataFrame is empty after loading.")

# --- Date Alignment ---
print("\n--- Aligning News and Stock Data by Date ---")

# Store original length for comparison
original_total_news_rows = len(news_df)
print(f"Original news_df rows at start of alignment process: {original_total_news_rows}")


# 1. Process News DataFrame 'date' column
# Step 1.1: Convert to datetime, coercing errors, and using 'mixed' format.
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce', format='mixed')
print(f"news_df rows after pd.to_datetime (before initial dropna): {len(news_df)}")


# Step 1.2: IMMEDIATELY drop rows where date conversion failed (became NaT)
# This is CRITICAL. Any NaT values will prevent .dt accessor from working.
rows_before_initial_dropna = len(news_df)
news_df.dropna(subset=['date'], inplace=True)
rows_after_initial_dropna = len(news_df)
if rows_after_initial_dropna < rows_before_initial_dropna:
    print(f"Dropped {rows_before_initial_dropna - rows_after_initial_dropna} rows due to unparseable dates after initial conversion.")
else:
    print("No rows dropped during initial date conversion and dropna.")
print(f"news_df rows after initial dropna: {len(news_df)}")

# --- CRITICAL FIX: Ensure dtype is datetime64[ns] after coercion and dropna ---
# If news_df['date'] is still 'object' dtype, it means some values couldn't be converted
# even after errors='coerce', or it's mixed with non-datetime objects.
# We explicitly cast it. This might introduce new NaNs if there are unconvertible types.
# We also want it to be timezone-naive (datetime64[ns]) before the timezone logic.
try:
    # First, ensure it's a generic datetime type, then convert to UTC and make naive
    # Handle potentially timezone-aware data first before forcing naive.
    if pd.api.types.is_datetime64_any_dtype(news_df['date']):
        # If it's already a datetime type, check for timezone and standardize
        if news_df['date'].dt.tz is not None:
            news_df['date'] = news_df['date'].dt.tz_convert('UTC')
            print("News dates were timezone-aware, converted to UTC.")
        else:
            # If naive, localize to UTC (assuming they are implicitly UTC or local times to be treated as UTC)
            news_df['date'] = news_df['date'].dt.tz_localize('UTC', errors='coerce')
            print("News dates were timezone-naive, localized to UTC.")
        
        # Finally, strip the timezone information to get a naive datetime object representing UTC date
        news_df['date'] = news_df['date'].dt.tz_localize(None)
        print("News dates converted to timezone-naive UTC representation for daily alignment.")
        
    else:
        # If it's not a datetime dtype yet, it must be 'object'. Try to convert again explicitly to datetime64[ns, UTC]
        # and then make naive. This is a failsafe.
        news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce', utc=True)
        news_df['date'] = news_df['date'].dt.tz_localize(None) # Then strip timezone
        print("News dates forced to datetime and converted to timezone-naive UTC representation.")

except Exception as e:
    print(f"Error during explicit datetime type conversion: {e}")
    # If conversion still fails, drop rows that couldn't be converted
    news_df.dropna(subset=['date'], inplace=True)

# Drop NaNs again after timezone standardization/explicit conversion in case new NaTs were introduced
current_rows = len(news_df)
news_df.dropna(subset=['date'], inplace=True)
if len(news_df) < current_rows:
    print(f"Dropped {current_rows - len(news_df)} rows from news_df after full datetime standardization due to new NaTs.")
else:
    print("No additional rows dropped after full datetime standardization.")
print(f"news_df rows after full datetime standardization: {len(news_df)}")

# VERIFY DTYPE AGAIN BEFORE .dt.floor('D')
print(f"news_df['date'] dtype AFTER all timezone standardization: {news_df['date'].dtype}")
if not pd.api.types.is_datetime64_any_dtype(news_df['date']):
    print("CRITICAL: 'date' column is NOT datetime dtype after standardization. Cannot proceed with .dt accessor.")
    sys.exit("Exiting: 'date' column not datetime type.")
print(f"Final news_df rows before floor/set_index: {len(news_df)}") # Final check before setting index


# Step 1.5: Extract only the date part (YYYY-MM-DD) for daily alignment
# This line should now work without AttributeError
news_df['date_only'] = news_df['date'].dt.floor('D') # Floors to the start of the day
print("Successfully extracted date_only column.")

# Step 1.6: Set 'date_only' as the index for news_df for merging
news_df.set_index('date_only', inplace=True)
news_df.sort_index(inplace=True)
print(f"news_df rows after setting index: {len(news_df)}")


# 2. Process Stock DataFrame Index
stock_df.index = stock_df.index.floor('D') # Floors to the start of the day
stock_df.sort_index(inplace=True)
print(f"stock_df rows after floor: {len(stock_df)}")


# 3. Aggregate news headlines by date (if multiple on same day) BEFORE merging
if 'headline' not in news_df.columns:
    print("CRITICAL ERROR: 'headline' column not found in news_df. Please check your news data.")
    sys.exit("Exiting: Missing 'headline' column.")

daily_news_headlines = news_df.groupby(news_df.index)['headline'].apply(lambda x: ' '.join(x)).rename('combined_headline')
print(f"daily_news_headlines series length: {len(daily_news_headlines)}")

print("\nCombined Daily News Headlines (first 5 entries):")
print(daily_news_headlines.head())


# 4. Merge aggregated daily news headlines with daily stock data
merged_df = pd.merge(
    stock_df,
    daily_news_headlines,
    left_index=True,
    right_index=True,
    how='inner'
)

print(f"\nMerged DataFrame shape after date alignment and headline aggregation: {merged_df.shape}")
print("\nFirst 5 rows of Merged data:")
print(merged_df.head())

if merged_df.empty:
    sys.exit("Exiting: Merged DataFrame is empty. No common dates found between news and stock data after alignment.")

# ... (rest of your sentiment analysis, correlation, visualization code remains the same)


--- Starting Task 3: Correlation Analysis ---

--- Loading News Data ---
News data loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
 2   url         1407328 non-null  object
 3   publisher   1407328 non-null  object
 4   date        1407328 non-null  object
 5   stock       1407328 non-null  object
dtypes: int64(1), object(5)
memory usage: 64.4+ MB
News DataFrame Info (initial load): None

First 5 rows of News data (initial load):
   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In 

  news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce', format='mixed')


news_df rows after pd.to_datetime (before initial dropna): 1407328
No rows dropped during initial date conversion and dropna.
news_df rows after initial dropna: 1407328
News dates forced to datetime and converted to timezone-naive UTC representation.
No additional rows dropped after full datetime standardization.
news_df rows after full datetime standardization: 1407328
news_df['date'] dtype AFTER all timezone standardization: datetime64[ns]
Final news_df rows before floor/set_index: 1407328
Successfully extracted date_only column.
news_df rows after setting index: 1407328
stock_df rows after floor: 6846
daily_news_headlines series length: 3955

Combined Daily News Headlines (first 5 entries):
date_only
2009-02-14                         How Treasuries and ETFs Work
2009-04-27    Update on the Luxury Sector: 2nd Quarter 2009 ...
2009-04-29                               Going Against the Herd
2009-05-22    Charles Sizemore Radio Interview Saturday Morning
2009-05-27    JVA perks to 39% 

In [None]:
# notebooks/03_Correlation_Analysis.ipynb - New Cell

# --- Sentiment Analysis ---
print("\n--- Performing Sentiment Analysis on News Headlines ---")

# Ensure 'combined_headline' column exists. It should from the previous step.
if 'combined_headline' not in merged_df.columns:
    print("CRITICAL ERROR: 'combined_headline' column not found in merged_df. Please check previous steps.")
    sys.exit("Exiting: Missing 'combined_headline' column for sentiment analysis.")

# Function to get sentiment polarity using TextBlob
# TextBlob's polarity ranges from -1.0 (negative) to 1.0 (positive)
def get_sentiment_polarity(text):
    if pd.isna(text): # Handle NaN or missing text
        return 0.0 # Assign a neutral score for missing headlines
    return TextBlob(str(text)).sentiment.polarity

# Apply sentiment analysis to each combined headline
merged_df['sentiment_score'] = merged_df['combined_headline'].apply(get_sentiment_polarity)

print("\nMerged DataFrame with Sentiment Scores (last 5 rows):")
print(merged_df.tail())

print(f"Sentiment analysis completed. Added 'sentiment_score' column. Null sentiment scores: {merged_df['sentiment_score'].isnull().sum()}")

# Basic check on sentiment distribution
print("\nBasic sentiment score distribution:")
print(merged_df['sentiment_score'].describe())