In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#for Sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


**Data Preparation**

In [72]:
df_AAPL = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/AAPL_historical_data.csv")
df_AMZN = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/AMZN_historical_data.csv")
df_GOOG = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/GOOG_historical_data.csv")
df_META = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/MSFT_historical_data.csv")
df_MSFT = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/MSFT_historical_data.csv")
df_NVDA = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/NVDA_historical_data.csv")
df_TSLA = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/yfinance_data/TSLA_historical_data.csv")

#news data
df_news = pd.read_csv("/content/drive/MyDrive/10Academy/week 1/raw_analyst_ratings.csv")

In [73]:
#combine data
dfs = [df_AAPL, df_AMZN, df_GOOG, df_META, df_MSFT, df_NVDA, df_TSLA]
combined_df = pd.concat(dfs, ignore_index=True)

In [74]:
df_AAPL['Company'] = 'AAPL'
df_AMZN['Company'] = 'AMZN'
df_GOOG['Company'] = 'GOOG'
df_META['Company'] = 'META'
df_MSFT['Company'] = 'MSFT'
df_NVDA['Company'] = 'NVDA'
df_TSLA['Company'] = 'TSLA'
combined_df = pd.concat(dfs, ignore_index=True)

**Normalize date in the news dataset**

In [75]:
#STock
combined_df['Date'] = pd.to_datetime(combined_df['Date'], errors='coerce', utc=True)
combined_df.set_index('Date', inplace=True)

#News
df_news['date'] = pd.to_datetime(df_news['date'], errors='coerce', utc=True)
df_news.set_index('date', inplace=True)

**Map news with stcok**

In [65]:
symbol_mapping = {
    'AAPL': 'A',
    'AMZN': 'AA',
    'GOOG': 'AAC',
    'META': 'META',
    'MSFT': 'MSFT',
    'NVDA': 'NVDA',
    'TSLA': 'TSLA'
}

df_news['stock_symbol'] = df_news['stock'].map(symbol_mapping)

**Perform Sentiment Analysis**

In [76]:
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
df_news['Sentiment'] = df_news['headline'].apply(lambda x: sia.polarity_scores(x)['compound'])

#print
df_news['Sentiment']

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0_level_0,Sentiment
date,Unnamed: 1_level_1
2020-06-05 14:30:54+00:00,0.0000
2020-06-03 14:45:20+00:00,0.0000
2020-05-26 08:30:07+00:00,0.0000
2020-05-22 16:45:06+00:00,0.0000
2020-05-22 15:38:59+00:00,0.2960
...,...
NaT,0.2023
NaT,-0.3818
NaT,0.0000
NaT,0.0000


**Calculate Daily Returns**

In [77]:
combined_df['Daily Return'] = combined_df['Close'].pct_change()
combined_df['Daily Return']

Unnamed: 0_level_0,Daily Return
Date,Unnamed: 1_level_1
1980-12-12 00:00:00+00:00,
1980-12-15 00:00:00+00:00,-0.052171
1980-12-16 00:00:00+00:00,-0.073398
1980-12-17 00:00:00+00:00,0.024751
1980-12-18 00:00:00+00:00,0.028992
...,...
2024-07-24 00:00:00+00:00,-0.123346
2024-07-25 00:00:00+00:00,0.019723
2024-07-26 00:00:00+00:00,-0.002043
2024-07-29 00:00:00+00:00,0.055960


**Aggregate Sentiments**

In [79]:
daily_sentiment = df_news.groupby([df_news.index, 'stock'])['Sentiment'].mean().reset_index()
daily_sentiment

Unnamed: 0,date,stock,Sentiment
0,2011-04-28 01:01:48+00:00,DGP,0.0000
1,2011-04-28 17:49:29+00:00,ESR,0.0000
2,2011-04-28 19:00:36+00:00,DEJ,0.2500
3,2011-04-29 17:47:06+00:00,AIA,0.0000
4,2011-04-29 20:11:05+00:00,GDL,0.7351
...,...,...,...
55973,2020-06-11 20:49:41+00:00,PVH,0.0000
55974,2020-06-11 20:51:33+00:00,WMT,0.2732
55975,2020-06-11 21:01:39+00:00,TWTR,-0.2500
55976,2020-06-11 21:11:20+00:00,PCG,0.4215


**Merge sentiment data with stock data**

In [81]:
combined_df.index = combined_df.index.tz_localize(None)
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date']).dt.tz_localize(None)

merged_data = pd.merge(
    combined_df.reset_index(),
    daily_sentiment,
    left_on=['Date', 'Company'],
    right_on=['date', 'stock'],
    how='inner'
)

#drop
merged_data.drop(columns=['stock'], inplace=True)
merged_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Company,Daily Return,date,Sentiment


**Correlation Analysis**

In [82]:
correlation = merged_data['Sentiment'].corr(merged_data['Daily Return'])
print(f"Correlation between daily returns and sentiment: {correlation}")

Correlation between daily returns and sentiment: nan
