In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
!pip install demoji
import demoji
import re
import string
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import attr
import nltk
# Fetch the NLTK data files used by the text-cleaning step below.
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('averaged_perceptron_tagger_eng') # tagger resource that was reported missing; pos_tag is used in get_wordnet_pos
nltk.download('wordnet')  # WordNet data; WordNetLemmatizer is used in clean_text



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Returns the WordNet POS tag for each word that the clean_text function lemmatizes
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tagger's tag for ``word``.

    The word is tagged on its own, and only the first letter of the tag is
    used: J -> adjective, N -> noun, V -> verb, R -> adverb. Any other tag
    falls back to noun.
    """
    treebank_tag = pos_tag([word])[0][1]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns
    return wordnet.NOUN

In [None]:
# Cleaning tweets
def clean_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps: lowercase, drop ``#``/``$`` symbols, strip links, replace emojis by
    their textual description, remove digits, tokenize with the tweet
    tokenizer, drop stop words / punctuation / tokens of two characters or
    fewer / tokens containing a dot, then lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. ``NaN`` or ``None`` for a
        missing tweet) are accepted and yield an empty string.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` if nothing survives.
    """
    # A missing tweet has no ``.lower()``; there is nothing to clean.
    if not isinstance(text, str):
        return ""

    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Negations are kept out of the stop-word set because removing them
    # changes the meaning of the tweet.
    negative_verbs = {"shan't", 'shouldn', "shouldn't", 'wasn', 'weren', 'won', 'wouldn', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', "mustn't", 'needn', "needn't", "wouldn't", "won't", "weren't", "wasn't", "couldn", "not", "nor", "no", "mightn't", "isn't", "haven't", "hadn't", "hasn't", "didn't", "doesn't", "aren't", "don't", "couldn't", "never"}
    # A set makes the per-token membership test constant time.
    stop_words = {
        word
        for word in stopwords.words('english') + ["i'll", "i'm", "should", "could"]
        if word not in negative_verbs
    }

    # Lowering tweets
    tweet = text.lower()
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]", " ", tweet)
    # Removing links only: match up to the next whitespace so that words
    # following a link on the same line are not thrown away with it.
    tweet = re.sub(r"https?://\S+", " ", tweet)
    # Translating emojis into their descriptions
    tweet = demoji.replace_with_desc(tweet)
    # Removing numerical values
    tweet = re.sub(r"[0-9]|-->", "", tweet)
    # Tokenize the tweet with the twitter tokenizer
    tokens = tk.tokenize(tweet)
    # Keep tokens that are not stop words or punctuation, are longer than two
    # characters and contain no dot, then lemmatize them.
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens
        if word not in stop_words
        and word not in string.punctuation
        and len(word) > 2
        and "." not in word
    ]
    # Return the tokens as one sentence
    return " ".join(lemmas)

In [None]:
# Read the scraped tweets file
tweets = pd.read_csv("BA_tweets.csv")
# Preview the first rows
tweets.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-26 00:01:01+00:00,$TSLA - Above 273 - Trade Idea 💡 - Sept 30 280...,BA,The Boeing Company
1,2022-09-25 02:00:37+00:00,Boeing $BA Stock Drops as the Company Agrees t...,BA,The Boeing Company
2,2022-09-23 06:55:00+00:00,🐳 news:\n\n- $AAPL x NFL superbowl\n- Stock tr...,BA,The Boeing Company
3,2022-09-22 21:24:13+00:00,Here's the 34 S&amp;P 500 stocks with market c...,BA,The Boeing Company
4,2022-09-22 21:09:08+00:00,JUST IN: The @usairforce has awarded @Raytheon...,BA,The Boeing Company


In [None]:
# Clean every tweet, keep the result in a new 'cleaned' column and save the
# frame as a CSV in the current folder
tweets['cleaned'] = tweets["Tweet"].apply(clean_text)
tweets.to_csv("Cleaned_BA_Tweets.csv", index=False)

In [None]:
# Reload the cleaned tweets. A tweet whose cleaned text is empty is written to
# the CSV as an empty field, which read_csv parses back as NaN; restore those
# values to "" so later text processing always receives a string.
tweets = pd.read_csv("Cleaned_BA_Tweets.csv").fillna({"cleaned": ""})
tweets.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,cleaned
0,2022-09-26 00:01:01+00:00,$TSLA - Above 273 - Trade Idea 💡 - Sept 30 280...,BA,The Boeing Company,tsla trade idea light bulb sept close let's se...
1,2022-09-25 02:00:37+00:00,Boeing $BA Stock Drops as the Company Agrees t...,BA,The Boeing Company,boeing stock drop company agrees pay million r...
2,2022-09-23 06:55:00+00:00,🐳 news:\n\n- $AAPL x NFL superbowl\n- Stock tr...,BA,The Boeing Company,spout whale news aapl nfl superbowl stock trad...
3,2022-09-22 21:24:13+00:00,Here's the 34 S&amp;P 500 stocks with market c...,BA,The Boeing Company,here's stock market cap billion last year tota...
4,2022-09-22 21:09:08+00:00,JUST IN: The @usairforce has awarded @Raytheon...,BA,The Boeing Company,award contract continue development hypersonic...


Sentiment analysis with a pretrained model

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from scipy.special import softmax
import numpy as np

In [None]:
# The tokenizer and the classifier are loaded from the same checkpoint
finbert_checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Label mapping.
# The yiyanghkust/finbert-tone checkpoint orders its classes as
# 0 = Neutral, 1 = Positive, 2 = Negative (see the model card). The ids below
# must follow that order, otherwise every prediction is mislabeled.
config = {
    "id2label": {
        0: "Neutral",
        1: "Positive",
        2: "Negative"
    }
}

# Function to calculate polarity
def polarity(text):
    """Classify ``text`` with the loaded model and return its sentiment.

    Parameters
    ----------
    text : str
        Cleaned tweet text. Missing values (``None``/``NaN``) and blank
        strings are not sent to the model and are reported as neutral.

    Returns
    -------
    tuple
        ``(label, polarity_score)`` where ``label`` is a value of
        ``config['id2label']`` and ``polarity_score`` is -1 for "Negative",
        1 for "Positive" and 0 otherwise.
    """
    # An empty cleaned tweet comes back from a CSV round-trip as NaN, which
    # the tokenizer rejects; there is no text to score, so call it neutral.
    if not isinstance(text, str) or not text.strip():
        return ("Neutral", 0)

    # Tokenize and encode the input text (truncated to the 512-token limit)
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Get the model's output without tracking gradients
    with torch.no_grad():
        output = model(**encoded_input)

    # Only the top class is needed. Softmax preserves ordering, so the index
    # of the largest logit is also the index of the most probable class.
    top_id = int(torch.argmax(output.logits[0]))
    label = config['id2label'][top_id]

    # Determine polarity: -1 for Negative, 0 for Neutral, 1 for Positive
    polarity_score = {"Negative": -1, "Positive": 1}.get(label, 0)

    return (label, polarity_score)

In [None]:
# Score every cleaned tweet, then split the (label, polarity) pairs into the
# two new DataFrame columns
sentiment_pairs = [polarity(cleaned_tweet) for cleaned_tweet in tweets['cleaned']]
labels, polarity_scores = zip(*sentiment_pairs)
tweets['label'] = labels
tweets['Polarity'] = polarity_scores

# Persist the tweets together with their sentiment labels and polarity scores
tweets.to_csv("Polarized_BA_Tweets.csv", index=False)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Preparing Data For Time Series Model

In [None]:
# Reload the tweets with their sentiment label and polarity columns
ptweets = pd.read_csv("Polarized_BA_Tweets.csv")
# Display the frame
ptweets

Unnamed: 0,Date,Tweet,Stock Name,Company Name,cleaned,label,Polarity
0,2022-09-26 00:01:01+00:00,$TSLA - Above 273 - Trade Idea 💡 - Sept 30 280...,BA,The Boeing Company,tsla trade idea light bulb sept close let's se...,Negative,-1
1,2022-09-25 02:00:37+00:00,Boeing $BA Stock Drops as the Company Agrees t...,BA,The Boeing Company,boeing stock drop company agrees pay million r...,Negative,-1
2,2022-09-23 06:55:00+00:00,🐳 news:\n\n- $AAPL x NFL superbowl\n- Stock tr...,BA,The Boeing Company,spout whale news aapl nfl superbowl stock trad...,Positive,1
3,2022-09-22 21:24:13+00:00,Here's the 34 S&amp;P 500 stocks with market c...,BA,The Boeing Company,here's stock market cap billion last year tota...,Negative,-1
4,2022-09-22 21:09:08+00:00,JUST IN: The @usairforce has awarded @Raytheon...,BA,The Boeing Company,award contract continue development hypersonic...,Neutral,0
...,...,...,...,...,...,...,...
394,2021-10-06 15:20:50+00:00,NASA confirms the reassignment of astronauts N...,BA,The Boeing Company,nasa confirms reassignment astronaut nicole ma...,Negative,-1
395,2021-10-06 11:45:24+00:00,$CYBL Acquisition this week\n\nAnother acquisi...,BA,The Boeing Company,cybl acquisition week another acquisition end ...,Negative,-1
396,2021-10-05 20:21:16+00:00,$BA Toying with the idea of taking flight. Ins...,BA,The Boeing Company,toy idea take flight inside day,Negative,-1
397,2021-10-03 23:02:30+00:00,"10/3 Options Watchlist\n\n$ROKU over 317.5, 32...",BA,The Boeing Company,option watchlist roku nflx spot mrna afrm drop...,Negative,-1


In [None]:
# Keep only the date and polarity columns of the previous dataframe
ptweets_df = ptweets[["Date", "Polarity"]].copy()
ptweets_df.head()

Unnamed: 0,Date,Polarity
0,2022-09-26 00:01:01+00:00,-1
1,2022-09-25 02:00:37+00:00,-1
2,2022-09-23 06:55:00+00:00,1
3,2022-09-22 21:24:13+00:00,-1
4,2022-09-22 21:09:08+00:00,0


In [None]:
# Reduce each tweet timestamp to its calendar date (in UTC, without timezone)
# so it matches the date format of the stock CSV loaded next. The timestamps
# are parsed once and truncated directly; the deprecated
# `infer_datetime_format` argument and the round-trip through "%m/%d/%y"
# strings are not needed.
ptweets_df['Date'] = (
    pd.to_datetime(ptweets_df['Date'], utc=True)
    .dt.tz_localize(None)
    .dt.normalize()
)

# Aggregate the tweets' polarity per day: average, sum and number of tweets
Pol_df = ptweets_df.groupby('Date')['Polarity'].agg(
    P_mean='mean',
    P_sum='sum',
    twt_count='count',
)
Pol_df

  ptweets_df['Date'] =pd.to_datetime(ptweets_df['Date'],infer_datetime_format=True)
  ptweets_df['Date'] =pd.to_datetime(ptweets_df['Date'].dt.strftime("%m/%d/%y"))


Unnamed: 0_level_0,P_mean,P_sum,twt_count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-10-03,-1.0,-2,2
2021-10-05,-1.0,-1,1
2021-10-06,-0.8,-4,5
2021-10-07,-0.5,-1,2
2021-10-08,-0.5,-1,2
...,...,...,...
2022-09-15,-1.0,-2,2
2022-09-22,0.0,0,3
2022-09-23,1.0,1,1
2022-09-25,-1.0,-1,1


In [None]:
# Load the BA price history and parse its dates so they can be matched
# against the Date index of Pol_df.
stock_df = pd.read_csv("BA.csv")
stock_df['Date'] = pd.to_datetime(stock_df['Date'])
# 'Date' deliberately stays a regular column: the join below matches it against
# Pol_df's index, and it must survive the later to_csv(index=False).
# Adding the polarization columns to the Boeing stock dataframe. The inner join
# keeps only the dates that appear in both frames.
final_df = stock_df.join(Pol_df, on='Date', how="inner")
final_df.head()

  stock_df['Date'] = pd.to_datetime(stock_df['Date'],infer_datetime_format=True)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,P_mean,P_sum,twt_count
3,2021-10-05,224.270004,226.449997,222.419998,224.419998,224.419998,5687300,BA,-1.0,-1,1
4,2021-10-06,221.080002,225.070007,219.199997,224.990005,224.990005,7157600,BA,-0.8,-4,5
5,2021-10-07,227.339996,229.0,225.899994,226.479996,226.479996,7748500,BA,-0.5,-1,2
6,2021-10-08,226.75,228.710007,225.399994,226.389999,226.389999,4515000,BA,-0.5,-1,2
7,2021-10-11,226.5,232.100006,225.320007,226.449997,226.449997,7287000,BA,0.0,0,1


In [None]:
# Write the joined stock/polarity frame to disk without the row index
final_df.to_csv("Final_BA.csv",index=False)