In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#for Sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


**Data Preparation**

In [15]:

df_stocks = pd.concat([
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/AAPL_historical_data.csv"),
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/AMZN_historical_data.csv"),
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/GOOG_historical_data.csv"),
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/MSFT_historical_data.csv"),
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/MSFT_historical_data.csv"),
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/NVDA_historical_data.csv"),
    pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/TSLA_historical_data.csv")
], keys=['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA'])

df_news = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/raw_analyst_ratings.csv")

**Check for missing values**

In [16]:
df_stocks.isnull().sum()
df_news.isnull().sum()
df_news.drop(columns=['Unnamed: 0'], inplace=True)

**Normalized Date**

In [17]:
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'], errors='coerce')
df_news['date'] = pd.to_datetime(df_news['date'], errors='coerce')

df_stocks['Date'] = df_stocks['Date'].dt.strftime('%Y-%m-%d')
df_news['date'] = df_news['date'].dt.strftime('%Y-%m-%d')

**Align Date in Both news and stock data**

In [18]:
df_stocks = df_stocks[df_stocks['Date'].isin(df_news['date'])]

**Aggregate News Sentiment**

In [23]:
import nltk
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
df_news['SentimentScore'] = df_news['headline'].apply(lambda x: sia.polarity_scores(x)['compound'])

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [35]:
daily_sentiment = df_news.groupby('date')['SentimentScore'].mean().reset_index()
daily_sentiment.rename(columns={'date': 'Date'}, inplace=True)
daily_sentiment.head()

Unnamed: 0,Date,SentimentScore
0,2011-04-27,0.0
1,2011-04-28,0.125
2,2011-04-29,0.36755
3,2011-04-30,0.2023
4,2011-05-01,0.0


**Merge the aggregated news sentiment with the stock price data on the date column.**

In [36]:
df_stocks.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')

In [37]:
daily_sentiment.columns

Index(['Date', 'SentimentScore'], dtype='object')

In [40]:
df_normalized = pd.merge(df_stocks, daily_sentiment, on='Date', how='inner')
df_normalized.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,SentimentScore
0,2011-04-27,12.58,12.583929,12.396429,12.505357,10.560461,356213200,0.0,0.0,0.0
1,2011-04-28,12.363929,12.491071,12.34,12.383929,10.457921,360959200,0.0,0.0,0.125
2,2011-04-29,12.385,12.641071,12.381071,12.504643,10.559862,1006345200,0.0,0.0,0.36755
3,2011-05-02,12.490714,12.516786,12.339286,12.367143,10.443746,442713600,0.0,0.0,0.136444
4,2011-05-03,12.428214,12.496071,12.343571,12.435714,10.50165,313348000,0.0,0.0,0.0


**Calculate Daily Return**

In [43]:
df_normalized['DailyReturn'] = df_normalized['Close'].pct_change()
df_normalized['DailyReturn'].head(2)

Unnamed: 0,DailyReturn
0,
1,-0.00971


**Drop any NaN values**

In [44]:
df_normalized.dropna(inplace=True)

**Final Normalized Data**

In [45]:
df_normalized.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,SentimentScore,DailyReturn
1,2011-04-28,12.363929,12.491071,12.34,12.383929,10.457921,360959200,0.0,0.0,0.125,-0.00971
2,2011-04-29,12.385,12.641071,12.381071,12.504643,10.559862,1006345200,0.0,0.0,0.36755,0.009748
3,2011-05-02,12.490714,12.516786,12.339286,12.367143,10.443746,442713600,0.0,0.0,0.136444,-0.010996
4,2011-05-03,12.428214,12.496071,12.343571,12.435714,10.50165,313348000,0.0,0.0,0.0,0.005545
5,2011-05-05,12.442857,12.533929,12.358929,12.383929,10.457921,335969200,0.0,0.0,-0.042667,-0.004164


**Calculate Correlation**

In [46]:
correlation = df_normalized[['SentimentScore', 'DailyReturn']].corr(method='pearson').iloc[0, 1]
print(f"Pearson correlation coefficient: {correlation}")

Pearson correlation coefficient: 0.01726183605568182
