In [None]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load files: the stock price CSVs and the analyst-ratings news feed

# Path to the folder containing the stock CSV files
stock_data_folder = '../Data/yfinance_data/'

# Sort the paths: glob returns them in arbitrary, OS-dependent order, and the
# row order of the combined frame follows the file order.
csv_files = sorted(glob.glob(os.path.join(stock_data_folder, "*.csv")))
if not csv_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in {stock_data_folder!r}")

# Stack every CSV file into one DataFrame.
# NOTE(review): nothing records which file a row came from, so rows from
# different files cannot be told apart after this point.
stock_df_list = [pd.read_csv(file) for file in csv_files]
combined_stock_df = pd.concat(stock_df_list, ignore_index=True)

# Parse the price dates; values that cannot be parsed become NaT
combined_stock_df['Date'] = pd.to_datetime(combined_stock_df['Date'], errors='coerce')

# Check that the conversion worked and count any NaT values
print(combined_stock_df['Date'].dtypes)  # Should be datetime64[ns]
print(combined_stock_df['Date'].isnull().sum())  # Check for NaT values


# Load the news data
news_df = pd.read_csv('../Data/raw_analyst_ratings.csv')

# Use the same column name as the price data, then parse it
news_df = news_df.rename(columns={'date': 'Date'})
# NOTE(review): if the raw strings mix UTC offsets, to_datetime may not return
# a datetime64 column and the .dt accessor below would fail -- confirm.
news_df['Date'] = pd.to_datetime(news_df['Date'], errors='coerce')

# If there is timezone information, remove it
news_df['Date'] = news_df['Date'].dt.tz_localize(None)

# Check the types to ensure the date columns match
print(news_df['Date'].dtypes)  # Should be datetime64[ns]
print(news_df['Date'].isnull().sum())  # Check for NaT values

# Drop rows whose date could not be parsed (rebinding instead of inplace=True
# keeps the cell safe to re-run and avoids mutating frames behind the reader)
combined_stock_df = combined_stock_df.dropna(subset=['Date'])
news_df = news_df.dropna(subset=['Date'])

# Merge on the calendar day, not the full timestamp: a news timestamp that
# carries a time of day would never equal a price row's date, leaving the
# inner join (almost) empty. Only the merge keys are normalized here; the
# Date columns of both source frames are left untouched.
merged_df = pd.merge(
    combined_stock_df.assign(Date=combined_stock_df['Date'].dt.normalize()),
    news_df.assign(Date=news_df['Date'].dt.normalize()),
    on='Date',
    how='inner',
)

# Check the merged dataset
print("Merged Data Shape:", merged_df.shape)
print(merged_df.head())

In [None]:
# Reduce both Date columns to calendar days. .dt.normalize() zeroes the time
# component while keeping the datetime64 dtype; .dt.date would instead yield
# an object column of datetime.date values, which is not a datetime column
# and has to be converted back before any .dt operation.
news_df['Date'] = pd.to_datetime(news_df['Date']).dt.normalize()
combined_stock_df['Date'] = pd.to_datetime(combined_stock_df['Date']).dt.normalize()

# Confirm both columns now share the same dtype
print(news_df['Date'].dtype)
print(combined_stock_df['Date'].dtype)

In [None]:
# Round-trip each Date column through text: render as strings, trim
# surrounding whitespace, then parse the trimmed text back into datetimes.
for frame in (news_df, combined_stock_df):
    trimmed_dates = frame['Date'].astype(str).str.strip()
    frame['Date'] = pd.to_datetime(trimmed_dates)

In [None]:
# Distinct dates present in each dataset
news_dates, stock_dates = (
    set(frame['Date'].unique()) for frame in (news_df, combined_stock_df)
)

# Dates that occur in both datasets
common_dates = news_dates & stock_dates
print(f"Number of common dates: {len(common_dates)}")

# Show up to ten of the shared dates
print(list(common_dates)[:10])

In [None]:
# Left-join the stock rows onto the news so that every news row is kept
debug_merge = news_df.merge(combined_stock_df, how='left', on='Date')

# News rows whose 'Open' is null, i.e. the stock data supplied no value
has_no_stock_row = debug_merge['Open'].isna()
missing_matches = debug_merge.loc[has_no_stock_row]
print(missing_matches.loc[:, ['Date', 'headline', 'publisher']].head())

# Count the rows without a match
print("Number of missing matches:", len(missing_matches))

Sentiment Analysis Using TextBlob

In [None]:
from textblob import TextBlob

# Function to calculate sentiment using TextBlob
def get_sentiment(headline):
    """Return the TextBlob polarity score of a headline.

    Args:
        headline: Headline text to score.

    Returns:
        float: ``TextBlob(headline).sentiment.polarity`` for a string input;
        NaN when ``headline`` is not a string (for example a missing value).
    """
    # TextBlob only accepts strings. A missing headline arrives as a
    # non-string (None / float NaN) and would otherwise abort the whole
    # .apply() with a TypeError, so report it as "no score" instead.
    if not isinstance(headline, str):
        return float('nan')
    return TextBlob(headline).sentiment.polarity

# Apply sentiment analysis on the news headlines.
# debug_merge comes from a left join on Date, which can repeat a headline once
# per matching stock row, so score each distinct headline a single time and
# map the result back onto the rows. Missing headlines are skipped here and
# end up with NaN sentiment.
headline_scores = {
    text: get_sentiment(text)
    for text in debug_merge['headline'].dropna().unique()
}
debug_merge['sentiment'] = debug_merge['headline'].map(headline_scores)

# Check sentiment scores
print(debug_merge[['headline', 'sentiment']].head())

Calculate Daily Stock Returns

In [None]:
# Calculate daily returns from the price history itself, not from the merged
# frame: debug_merge holds one row per (news row, stock row) pair, so
# consecutive rows there are not consecutive trading days of one price
# series and pct_change() over them does not measure a day-over-day move.
price_history = combined_stock_df[['Date', 'Close']].copy()

# combined_stock_df is the source files stacked end to end with no stock
# identifier. A date that fails to advance is taken as the start of the next
# file -- assumes every file is sorted by ascending date and that the files'
# date ranges overlap; TODO confirm against the data.
series_id = price_history['Date'].diff().le(pd.Timedelta(0)).cumsum()
price_history['Daily_Return'] = (
    price_history.groupby(series_id)['Close'].pct_change()
)

# Equal-weighted mean return across all price series for each calendar day,
# attached to every merged row of that day
mean_return_by_date = price_history.groupby('Date')['Daily_Return'].mean()
debug_merge['Daily_Return'] = debug_merge['Date'].map(mean_return_by_date)

# Drop only the rows lacking a price or a return (news with no stock match,
# and the first day of each series); a bare dropna() would also discard rows
# for a NaN in any unrelated column.
debug_merge = debug_merge.dropna(subset=['Close', 'Daily_Return'])

# Display the first few rows with daily returns
print(debug_merge[['Date', 'Close', 'Daily_Return']].head())

Perform Correlation Analysis

In [None]:
# Collapse to one observation per calendar day: mean headline sentiment and
# mean daily return. Merging the daily sentiment back onto every row of
# debug_merge would repeat each day once per merged row, so days with more
# rows would carry more weight in the correlation and in the scatter plot.
final_df = (
    debug_merge
    .groupby('Date')[['sentiment', 'Daily_Return']]
    .mean()
    .reset_index()
)

# Per-day average sentiment on its own (same columns as before: Date, sentiment)
daily_sentiment = final_df[['Date', 'sentiment']].copy()

# Display the final dataset for correlation analysis
print("Final Data Shape:", final_df.shape)
print(final_df.head())

In [None]:
# Correlate the sentiment column with the return column of final_df
sentiment_scores = final_df['sentiment']
return_values = final_df['Daily_Return']
correlation = sentiment_scores.corr(return_values)

print(f'Correlation between news sentiment and stock returns: {correlation:.2f}')

In [None]:
# Scatter plot of sentiment against daily return with a fitted regression line
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x='sentiment',
    y='Daily_Return',
    data=final_df,
    scatter_kws={'alpha': 0.5},
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title('Sentiment vs. Stock Returns')
ax.set_xlabel('Average Daily Sentiment')
# Daily_Return comes from pct_change(), which yields fractional changes
# (0.01 means 1%), so the axis must not be labelled as percent.
ax.set_ylabel('Daily Stock Return (fraction)')
plt.show()