# Sentiment Analysis of Twitter Data for Predicting Stock Market Movements

#### Data Source: https://www.kaggle.com/code/saadusama/twitter-s-impact-on-stock-market-prices/input

### Why Twitter data?
#### * Twitter like a corpus with valuable data for researchers
#### * Each tweet is of 140 characters long and speaks public opinion on a topic concisely.
#### * Sentiment classiﬁcation is the task of judging opinion in a piece of text as positive, negative or neutral.


## Import important Libraries 

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Read the CSV Files

In [6]:
company = pd.read_csv('Company.csv')
tweet_id = pd.read_csv('Company_Tweet.csv')
tweet = pd.read_csv('Tweet.csv')
company_value = pd.read_csv('CompanyValues.csv')

company.head(10)

Unnamed: 0,ticker_symbol,company_name
0,AAPL,apple
1,GOOG,Google Inc
2,GOOGL,Google Inc
3,AMZN,Amazon.com
4,TSLA,Tesla Inc
5,MSFT,Microsoft


In [7]:
tweet.head()

Unnamed: 0,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num
0,550441509175443456,VisualStockRSRC,1420070457,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1
1,550441672312512512,KeralaGuy77,1420070496,Insanity of today weirdo massive selling. $aap...,0,0,0
2,550441732014223360,DozenStocks,1420070510,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0
3,550442977802207232,ShowDreamCar,1420070807,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1
4,550443807834402816,i_Know_First,1420071005,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1


In [8]:
tweet_id.head()

Unnamed: 0,tweet_id,ticker_symbol
0,550803612197457920,AAPL
1,550803610825928706,AAPL
2,550803225113157632,AAPL
3,550802957370159104,AAPL
4,550802855129382912,AAPL


In [9]:
company_value.head()

Unnamed: 0,ticker_symbol,day_date,close_value,volume,open_value,high_value,low_value
0,AAPL,2020-05-29,317.94,38399530,319.25,321.15,316.47
1,AAPL,2020-05-28,318.25,33449100,316.77,323.44,315.63
2,AAPL,2020-05-27,318.11,28236270,316.14,318.71,313.09
3,AAPL,2020-05-26,316.73,31380450,323.5,324.24,316.5
4,AAPL,2020-05-22,318.89,20450750,315.77,319.23,315.35


In [10]:
company_value['day_date'] = pd.to_datetime(company_value["day_date"]).dt.strftime('%d-%m-%Y')
company_value['day_date'] = pd.to_datetime(company_value["day_date"])
company_value.head()

  company_value['day_date'] = pd.to_datetime(company_value["day_date"])


Unnamed: 0,ticker_symbol,day_date,close_value,volume,open_value,high_value,low_value
0,AAPL,2020-05-29,317.94,38399530,319.25,321.15,316.47
1,AAPL,2020-05-28,318.25,33449100,316.77,323.44,315.63
2,AAPL,2020-05-27,318.11,28236270,316.14,318.71,313.09
3,AAPL,2020-05-26,316.73,31380450,323.5,324.24,316.5
4,AAPL,2020-05-22,318.89,20450750,315.77,319.23,315.35


In [11]:
tweets = pd.merge(tweet_id , tweet , on="tweet_id", how= "inner")
total = tweets["comment_num"] + tweets["retweet_num"] + tweets["like_num"]
tweets["Date"] = pd.to_datetime(tweets['post_date'],unit='s').dt.strftime('%d-%m-%Y')
tweets["total_engangement"] = total
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement
0,550803612197457920,AAPL,SentiQuant,1420156789,#TOPTICKERTWEETS $AAPL $IMRS $BABA $EBAY $AMZN...,0,0,1,01-01-2015,1
1,550803610825928706,AAPL,SentiQuant,1420156788,#SENTISHIFTUP $K $FB $GOOGL $GS $GOLD $T $AAPL...,0,0,1,01-01-2015,1
2,550803225113157632,AAPL,MacHashNews,1420156696,Rumor Roundup: What to expect when you're expe...,0,0,0,01-01-2015,0
3,550802957370159104,AAPL,WaltLightShed,1420156633,"An $AAPL store line in Sapporo Japan for the ""...",2,4,4,01-01-2015,10
4,550802855129382912,AAPL,2waystrading,1420156608,$AAPL - Will $AAPL Give Second entry opportuni...,0,0,0,01-01-2015,0


In [12]:
tweets = tweets.loc[tweets["total_engangement"] > 200]
tweets = tweets.sort_values(["total_engangement"] , ascending = False)
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement
384936,692169663577485315,AAPL,ValaAfshar,1453861082,Apple has $216 billion in cash. It could buy a...,42,984,677,27-01-2016,1703
625471,770310550991605760,AAPL,cnntech,1472491321,Apple's next iPhone will likely be unveiled Se...,11,729,918,29-08-2016,1658
79388,575014851363405824,AAPL,RANsquawk,1425929198,Loving my Apple Watch $AAPL,66,882,654,09-03-2015,1602
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,Sometimes hard to wrap your head around $AMZN,14,646,900,03-01-2017,1560
2443148,854690001866686464,AMZN,philstockworld,1492608950,"Will We Hold It Wednesday - Nasdaq 5,400 Editi...",0,969,520,19-04-2017,1489


In [13]:
def getSentiment(body):
    
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    
    assert body is not None
    vs = analyzer.polarity_scores(body)
    score = vs['compound']
    
    if (score >= 0.05): 
        return "Positive"
    
    elif (score < 0.05 and score > -0.05):
        return "Neutral"
    
    elif (score <= -0.05):    
        return "Negetive"
    
    print(score)

In [14]:
tweets['Sentiment'] = tweets['body'].apply(lambda x: getSentiment(x))
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment
384936,692169663577485315,AAPL,ValaAfshar,1453861082,Apple has $216 billion in cash. It could buy a...,42,984,677,27-01-2016,1703,Neutral
625471,770310550991605760,AAPL,cnntech,1472491321,Apple's next iPhone will likely be unveiled Se...,11,729,918,29-08-2016,1658,Neutral
79388,575014851363405824,AAPL,RANsquawk,1425929198,Loving my Apple Watch $AAPL,66,882,654,09-03-2015,1602,Positive
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,Sometimes hard to wrap your head around $AMZN,14,646,900,03-01-2017,1560,Negetive
2443148,854690001866686464,AMZN,philstockworld,1492608950,"Will We Hold It Wednesday - Nasdaq 5,400 Editi...",0,969,520,19-04-2017,1489,Neutral


In [15]:
tweets = tweets.loc[tweets["Sentiment"] != "Neutral"]
tweets.head(15)

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment
79388,575014851363405824,AAPL,RANsquawk,1425929198,Loving my Apple Watch $AAPL,66,882,654,09-03-2015,1602,Positive
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,Sometimes hard to wrap your head around $AMZN,14,646,900,03-01-2017,1560,Negetive
3744176,1021481848403382272,TSLA,QTRResearch,1532375225,"Guys - I'm beside myself & before you ask, thi...",207,317,899,23-07-2018,1423,Negetive
2465942,875518367003791362,AMZN,SJosephBurns,1497574819,$AMZN has no stores $Uber no cars $FB creates ...,40,509,837,16-06-2017,1386,Negetive
193008,613718497219076096,AAPL,Carl_C_Icahn,1435156866,Sold last of our $NFLX today. Believe $AAPL c...,153,671,533,24-06-2015,1357,Positive
3892691,1054728662786826240,TSLA,CitronResearch,1540301883,$TSLA dropping earnings on top of $F tomorrow ...,148,308,861,23-10-2018,1317,Negetive
3739886,1020077355346169857,TSLA,vincent13031925,1532040368,"Tesla Spokesperson, Regarding The Earlier Need...",38,256,986,19-07-2018,1280,Negetive
1794434,1135604016015060993,GOOG,willchamberlain,1559584070,"FACEBOOK, GOOGLE STOCK TANK: Trump is punishin...",58,389,826,03-06-2019,1273,Negetive
4303338,1199424478536753155,TSLA,AlexSibila,1574800054,~Tesla feature request thread~Feel free to rep...,563,48,662,26-11-2019,1273,Positive
3851099,1045404879341137921,TSLA,Reuters,1538078920,SEC files lawsuit against Tesla CEO Elon Musk ...,56,630,585,27-09-2018,1271,Negetive


In [16]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4090 entries, 79388 to 3853101
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweet_id           4090 non-null   int64 
 1   ticker_symbol      4090 non-null   object
 2   writer             4072 non-null   object
 3   post_date          4090 non-null   int64 
 4   body               4090 non-null   object
 5   comment_num        4090 non-null   int64 
 6   retweet_num        4090 non-null   int64 
 7   like_num           4090 non-null   int64 
 8   Date               4090 non-null   object
 9   total_engangement  4090 non-null   int64 
 10  Sentiment          4090 non-null   object
dtypes: int64(6), object(5)
memory usage: 383.4+ KB


In [17]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/bagiya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bagiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bagiya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
tweets['body'] = tweets['body'].str.lower()
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment
79388,575014851363405824,AAPL,RANsquawk,1425929198,loving my apple watch $aapl,66,882,654,09-03-2015,1602,Positive
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,sometimes hard to wrap your head around $amzn,14,646,900,03-01-2017,1560,Negetive
3744176,1021481848403382272,TSLA,QTRResearch,1532375225,"guys - i'm beside myself & before you ask, thi...",207,317,899,23-07-2018,1423,Negetive
2465942,875518367003791362,AMZN,SJosephBurns,1497574819,$amzn has no stores $uber no cars $fb creates ...,40,509,837,16-06-2017,1386,Negetive
193008,613718497219076096,AAPL,Carl_C_Icahn,1435156866,sold last of our $nflx today. believe $aapl c...,153,671,533,24-06-2015,1357,Positive


## DATA COLLECTION AND PREPROCESSING

#### Tweets consists of many acronyms, emoticons and unnecessary data like pictures and URL’s. So tweets are preprocessed to represent correct emotions of public.

#### - Tokenization
#### Tweets are split into individual words based on the space and irrelevant symbols like emoticons are removed. We form a list of individual words for each tweet.
#### - Stopwords removal 
#### Words that do not express any emotion are called Stopwords. After splitting a tweet, words like a,is, the, with etc. are removed from the list of words.
#### - regex matching for removing special characters.
#### Regex
#### matching in Python is performed to match URLs and are replaced by the term URL. Often tweets consists of hash- tags(#) and @ addressing other users. They are also replaced suitably.

### Removing all characters that are not letters or spaces

In [19]:
import re

# Variable to replace all characters that are not letters or whitespace
regex = re.compile('[^a-z\s]')
# Removes all characters that are not letters or spaces
tweets['body'] = tweets['body'].apply(lambda x: regex.sub('', x))
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment
79388,575014851363405824,AAPL,RANsquawk,1425929198,loving my apple watch aapl,66,882,654,09-03-2015,1602,Positive
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,sometimes hard to wrap your head around amzn,14,646,900,03-01-2017,1560,Negetive
3744176,1021481848403382272,TSLA,QTRResearch,1532375225,guys im beside myself before you ask this is...,207,317,899,23-07-2018,1423,Negetive
2465942,875518367003791362,AMZN,SJosephBurns,1497574819,amzn has no stores uber no cars fb creates no ...,40,509,837,16-06-2017,1386,Negetive
193008,613718497219076096,AAPL,Carl_C_Icahn,1435156866,sold last of our nflx today believe aapl curr...,153,671,533,24-06-2015,1357,Positive


### Remove words with less than 3 characters

In [20]:
tweets['body'] = tweets['body'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment
79388,575014851363405824,AAPL,RANsquawk,1425929198,loving apple watch aapl,66,882,654,09-03-2015,1602,Positive
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,sometimes hard wrap your head around amzn,14,646,900,03-01-2017,1560,Negetive
3744176,1021481848403382272,TSLA,QTRResearch,1532375225,guys beside myself before you ask this not jok...,207,317,899,23-07-2018,1423,Negetive
2465942,875518367003791362,AMZN,SJosephBurns,1497574819,amzn has stores uber cars creates content baba...,40,509,837,16-06-2017,1386,Negetive
193008,613718497219076096,AAPL,Carl_C_Icahn,1435156866,sold last our nflx today believe aapl currentl...,153,671,533,24-06-2015,1357,Positive


### Tokenization


In [21]:
import nltk
from nltk.tokenize import word_tokenize
tweets['text_without_stopwords'] = tweets['body'].apply(word_tokenize)

### Removing Stop Words

In [22]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tweets['text_without_stopwords'] = tweets['text_without_stopwords'].apply(lambda x: [word for word in x if word not in stop_words])
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment,text_without_stopwords
79388,575014851363405824,AAPL,RANsquawk,1425929198,loving apple watch aapl,66,882,654,09-03-2015,1602,Positive,"[loving, apple, watch, aapl]"
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,sometimes hard wrap your head around amzn,14,646,900,03-01-2017,1560,Negetive,"[sometimes, hard, wrap, head, around, amzn]"
3744176,1021481848403382272,TSLA,QTRResearch,1532375225,guys beside myself before you ask this not jok...,207,317,899,23-07-2018,1423,Negetive,"[guys, beside, ask, joke, got, phone, montana,..."
2465942,875518367003791362,AMZN,SJosephBurns,1497574819,amzn has stores uber cars creates content baba...,40,509,837,16-06-2017,1386,Negetive,"[amzn, stores, uber, cars, creates, content, b..."
193008,613718497219076096,AAPL,Carl_C_Icahn,1435156866,sold last our nflx today believe aapl currentl...,153,671,533,24-06-2015,1357,Positive,"[sold, last, nflx, today, believe, aapl, curre..."


### Word Normalization using Lemmatization¶

In [23]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
tweets['normalized_text'] = tweets['text_without_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
tweets.head()

Unnamed: 0,tweet_id,ticker_symbol,writer,post_date,body,comment_num,retweet_num,like_num,Date,total_engangement,Sentiment,text_without_stopwords,normalized_text
79388,575014851363405824,AAPL,RANsquawk,1425929198,loving apple watch aapl,66,882,654,09-03-2015,1602,Positive,"[loving, apple, watch, aapl]","[loving, apple, watch, aapl]"
2400850,816359802733555712,AMZN,DavidSchawel,1483470318,sometimes hard wrap your head around amzn,14,646,900,03-01-2017,1560,Negetive,"[sometimes, hard, wrap, head, around, amzn]","[sometimes, hard, wrap, head, around, amzn]"
3744176,1021481848403382272,TSLA,QTRResearch,1532375225,guys beside myself before you ask this not jok...,207,317,899,23-07-2018,1423,Negetive,"[guys, beside, ask, joke, got, phone, montana,...","[guy, beside, ask, joke, got, phone, montana, ..."
2465942,875518367003791362,AMZN,SJosephBurns,1497574819,amzn has stores uber cars creates content baba...,40,509,837,16-06-2017,1386,Negetive,"[amzn, stores, uber, cars, creates, content, b...","[amzn, store, uber, car, creates, content, bab..."
193008,613718497219076096,AAPL,Carl_C_Icahn,1435156866,sold last our nflx today believe aapl currentl...,153,671,533,24-06-2015,1357,Positive,"[sold, last, nflx, today, believe, aapl, curre...","[sold, last, nflx, today, believe, aapl, curre..."


In [24]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4090 entries, 79388 to 3853101
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   tweet_id                4090 non-null   int64 
 1   ticker_symbol           4090 non-null   object
 2   writer                  4072 non-null   object
 3   post_date               4090 non-null   int64 
 4   body                    4090 non-null   object
 5   comment_num             4090 non-null   int64 
 6   retweet_num             4090 non-null   int64 
 7   like_num                4090 non-null   int64 
 8   Date                    4090 non-null   object
 9   total_engangement       4090 non-null   int64 
 10  Sentiment               4090 non-null   object
 11  text_without_stopwords  4090 non-null   object
 12  normalized_text         4090 non-null   object
dtypes: int64(6), object(7)
memory usage: 447.3+ KB


In [25]:
tweets.to_csv('final_dataset.csv', index=False)


In [26]:
def return_sequence(tokens):
  return " ".join([token for token in tokens])

tweets['pre_processed_text'] = tweets['normalized_text'].apply(return_sequence)

## Generate representations

### Bag of Words

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(tweets['pre_processed_text'].values.tolist())

In [28]:
count_matrix.toarray().shape

(4090, 11129)

In [29]:
count_matrix

<4090x11129 sparse matrix of type '<class 'numpy.int64'>'
	with 66851 stored elements in Compressed Sparse Row format>

In [30]:
# Visualizing the BOW representation

bow_matrix = count_matrix.toarray()

# Get the feature names 
feature_names = cv.get_feature_names_out()

# Create a DataFrame for visualization
# import pandas as pd
# df_bow = pd.DataFrame(bow_matrix, columns=feature_names)
# print(df_bow)

### Drawbacks 
#### - No Semantic Understanding
#### - Difficulty in modelling sparse Representations

### TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tweets['pre_processed_text'].values.tolist())

In [32]:
tfidf_array = tfidf_matrix.toarray()

In [33]:
# Get the feature names 
# feature_names = tfidf.get_feature_names_out()

# Create a DataFrame for visualization
# import pandas as pd
# df_tfidf = pd.DataFrame(tfidf_array, columns=feature_names)
# print(df_tfidf)

### Continuous Bag of Words (CBOW)

In [34]:

import numpy as np
from gensim.models.word2vec import Word2Vec
cbow = Word2Vec(tweets['pre_processed_text'].values.tolist(), vector_size=100, window=5, min_count=2, sg=0)
vocab = cbow.wv.index_to_key

def get_mean_vector(model, sentence):
    words = [word for word in sentence if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    return np.zeros((100,))

cbow_array = []
for sentence in tweets['pre_processed_text'].values.tolist():
    cbow_array.append(get_mean_vector(cbow, sentence))

In [35]:
cbow_array = np.array(cbow_array)
cbow_array.shape

(4090, 100)

In [36]:
word_vectors = cbow.wv
word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x7f05b2f49130>

### Skipgram

In [37]:
sg = Word2Vec(tweets['pre_processed_text'].values.tolist(), vector_size=100, window=5, min_count=2, sg=1)
vocab = sg.wv.index_to_key

def get_mean_vector(model, sentence):
    words = [word for word in sentence if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    return np.zeros((100,))

sg_array = []
for sentence in tweets['pre_processed_text'].values.tolist():
    sg_array.append(get_mean_vector(sg, sentence))

In [38]:
sg_array = np.array(sg_array)
sg_array.shape

(4090, 100)

### BERT

In [43]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [44]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [45]:
def text_to_vector(text):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor([input_ids])
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

In [46]:
import numpy as np

texts = tweets['pre_processed_text']

text_vectors = np.array([text_to_vector(text) for text in texts])

# ratings = ratings.values.reshape(-1, 1)

X = text_vectors
y = tweets['Sentiment']

### Glove

In [56]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

In [57]:
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)


  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [58]:
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)


In [61]:
def document_vector(doc):
    words = [word for word in doc if word in model.key_to_index]  # Use key_to_index to check word existence
    if not words:
        # If no words in the doc are in the model's vocabulary, return a zero vector.
        return np.zeros(model.vector_size)
    return np.mean(model[words], axis=0)


In [62]:
tweets_vector_glove = np.array([document_vector(doc.split()) for doc in tweets['pre_processed_text'].values.tolist()])


### Feature Engineering

In [48]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [49]:
lb = LabelEncoder()
tweets['Sentiment'] = lb.fit_transform(tweets['Sentiment'])

In [50]:
# Split data into training and testing sets
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(X, y, test_size=0.2, random_state=9)

print("Shape of X_train_bert:", X_train_bert.shape)
print("Shape of X_test_bert:", X_test_bert.shape)
print("Shape of y_train_bert:", y_train_bert.shape)
print("Shape of y_test_bert:", y_test_bert.shape)

Shape of X_train_bert: (3272, 768)
Shape of X_test_bert: (818, 768)
Shape of y_train_bert: (3272,)
Shape of y_test_bert: (818,)


In [40]:
y = tweets['Sentiment']

In [41]:
x_train_bow, x_test_bow, y_train_bow, y_test_bow = train_test_split(count_matrix, y, test_size=0.2, random_state=9)

In [42]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_array, y, test_size=0.2, random_state=9)

In [43]:
x_train_cbow, x_test_cbow, y_train_cbow, y_test_cbow = train_test_split(cbow_array, y, test_size=0.2, random_state=9)

In [44]:
x_train_skg, x_test_skg, y_train_skg, y_test_skg = train_test_split(sg_array, y, test_size=0.2, random_state=9)

In [63]:
x_train_glove, x_test_glove, y_train_glove, y_test_glove = train_test_split(tweets_vector_glove, y, test_size=0.2, random_state=9)


## Model Building

In [45]:
def train_and_evaluate_decision_tree(x_train, x_test, y_train, y_test, representation):
    
    dtclassifier = DecisionTreeClassifier(random_state=9,max_depth=5)
    dtclassifier.fit(x_train, y_train)
    y_pred = dtclassifier.predict(x_test)

    print(f"\nMetrics for {representation}:")
    print(f"Model Score: {dtclassifier.score(x_train,y_train)}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

In [46]:
def train_and_evaluate_svm(x_train, x_test, y_train, y_test, representation):
    # Initialize SVM model
    svm = SVC(random_state=9)

    # Train the model
    svm.fit(x_train, y_train)

    # Predict on test set
    y_pred = svm.predict(x_test)

    # Print evaluation metrics
    print(f"\nMetrics for {representation} (SVM):")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

def train_and_evaluate_knn(x_train, x_test, y_train, y_test, representation):
    # Initialize KNN model
    knn = KNeighborsClassifier()

    # Train the model
    knn.fit(x_train, y_train)

    # Predict on test set
    y_pred = knn.predict(x_test)

    # Print evaluation metrics
    print(f"\nMetrics for {representation} (KNN):")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



In [47]:
def train_and_evaluate_random_forest(x_train, x_test, y_train, y_test, representation):
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(x_train, y_train)
    y_pred = rf_classifier.predict(x_test)

    print(f"\nMetrics for {representation}:")
    print(f"Model Score: {rf_classifier.score(x_train, y_train)}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    return rf_classifier

In [70]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

def train_and_evaluate_ensemble(x_train, x_test, y_train, y_test, representation):

    dtclassifier = DecisionTreeClassifier(random_state=9,max_depth=5)
    svm = SVC(random_state=9)
    knn = KNeighborsClassifier()
    final_ensemble_model = VotingClassifier(estimators=[('dt', dtclassifier), ('svm', svm), ('knn', knn)],voting='hard')


    # Train the model
    final_ensemble_model.fit(x_train, y_train)

    # Predict on test set
    y_pred = final_ensemble_model.predict(x_test)

    # Print evaluation metrics
    print(f"\nMetrics for {representation}:")
    print(f"Model Score: {final_ensemble_model.score(x_train, y_train)}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


In [73]:
dtclassifier = DecisionTreeClassifier(random_state=9,max_depth=5)
svm = SVC(random_state=9)
knn = KNeighborsClassifier()
final_ensemble_model = VotingClassifier(estimators=[('dt', dtclassifier), ('svm', svm), ('knn', knn)],voting='hard')

final_ensemble_model.fit(X_train_bert, y_train_bert)

In [64]:
train_and_evaluate_ensemble(x_train_glove, x_test_glove, y_train_glove, y_test_glove, "Glove")


Metrics for Glove:
Model Score: 0.8236552567237164
Accuracy: 0.7273838630806846
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.38      0.48       278
           1       0.74      0.91      0.81       540

    accuracy                           0.73       818
   macro avg       0.71      0.64      0.65       818
weighted avg       0.72      0.73      0.70       818



In [53]:
train_and_evaluate_ensemble(X_train_bert, X_test_bert, y_train_bert, y_test_bert, "BERT")


Metrics for BERT:
Model Score: 0.8242665036674817
Accuracy: 0.7371638141809291
Classification Report:
               precision    recall  f1-score   support

    Negetive       0.70      0.40      0.51       278
    Positive       0.75      0.91      0.82       540

    accuracy                           0.74       818
   macro avg       0.72      0.66      0.67       818
weighted avg       0.73      0.74      0.71       818



In [54]:
train_and_evaluate_ensemble(x_train_bow, x_test_bow, y_train_bow, y_test_bow, "CBOW")


Metrics for CBOW:
Model Score: 0.8685819070904646
Accuracy: 0.7310513447432763
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.29      0.42       278
           1       0.72      0.96      0.82       540

    accuracy                           0.73       818
   macro avg       0.75      0.62      0.62       818
weighted avg       0.74      0.73      0.69       818



In [55]:
train_and_evaluate_ensemble(x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf, "TFIDF")


Metrics for TFIDF:
Model Score: 0.8737775061124694
Accuracy: 0.7322738386308069
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.24      0.38       278
           1       0.72      0.98      0.83       540

    accuracy                           0.73       818
   macro avg       0.80      0.61      0.61       818
weighted avg       0.77      0.73      0.68       818



## -----------------------------------

In [48]:
train_and_evaluate_decision_tree(X_train_bert, X_test_bert, y_train_bert, y_test_bert, "BERT")


Metrics for BERT:
Model Score: 0.7564180929095354
Accuracy: 0.6662591687041565
Classification Report:
               precision    recall  f1-score   support

    Negetive       0.51      0.31      0.39       278
    Positive       0.71      0.85      0.77       540

    accuracy                           0.67       818
   macro avg       0.61      0.58      0.58       818
weighted avg       0.64      0.67      0.64       818



In [67]:
train_and_evaluate_decision_tree(x_train_bow, x_test_bow, y_train_bow, y_test_bow, "BoW")


Metrics for BoW:
Model Score: 0.7145476772616137
Accuracy: 0.7029339853300733
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.25      0.37       278
           1       0.71      0.94      0.81       540

    accuracy                           0.70       818
   macro avg       0.69      0.59      0.59       818
weighted avg       0.69      0.70      0.66       818



In [68]:
train_and_evaluate_decision_tree(x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf, "TF-IDF")


Metrics for TF-IDF:
Model Score: 0.7130195599022005
Accuracy: 0.6931540342298288
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.14      0.24       278
           1       0.69      0.98      0.81       540

    accuracy                           0.69       818
   macro avg       0.73      0.56      0.52       818
weighted avg       0.71      0.69      0.61       818



In [69]:
train_and_evaluate_decision_tree(x_train_cbow, x_test_cbow, y_train_cbow, y_test_cbow, "CBOW")


Metrics for CBOW:
Model Score: 0.7078239608801956
Accuracy: 0.6589242053789731
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.10      0.17       278
           1       0.67      0.94      0.79       540

    accuracy                           0.66       818
   macro avg       0.58      0.52      0.48       818
weighted avg       0.61      0.66      0.58       818



In [70]:
train_and_evaluate_decision_tree(x_train_skg, x_test_skg, y_train_skg, y_test_skg, "Skip-Gram")


Metrics for Skip-Gram:
Model Score: 0.7084352078239609
Accuracy: 0.6601466992665037
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.07      0.13       278
           1       0.67      0.96      0.79       540

    accuracy                           0.66       818
   macro avg       0.58      0.52      0.46       818
weighted avg       0.61      0.66      0.56       818



In [71]:
train_and_evaluate_decision_tree(x_train_skg, x_test_skg, y_train_skg, y_test_skg, "Skip-Gram")


Metrics for Skip-Gram:
Model Score: 0.7084352078239609
Accuracy: 0.6601466992665037
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.07      0.13       278
           1       0.67      0.96      0.79       540

    accuracy                           0.66       818
   macro avg       0.58      0.52      0.46       818
weighted avg       0.61      0.66      0.56       818



In [72]:
nbc_1 = train_and_evaluate_navie_bayes(x_train_bow, x_test_bow, y_train_bow, y_test_bow, "BoW")


Metrics for BoW:
Model Score: 0.9517114914425427
Accuracy: 0.8007334963325183
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.63      0.68       278
           1       0.82      0.89      0.86       540

    accuracy                           0.80       818
   macro avg       0.78      0.76      0.77       818
weighted avg       0.80      0.80      0.80       818



In [73]:
nbc_2 = train_and_evaluate_navie_bayes(x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf, "Tf-IDF")


Metrics for Tf-IDF:
Model Score: 0.7937041564792175
Accuracy: 0.6882640586797066
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.09      0.16       278
           1       0.68      1.00      0.81       540

    accuracy                           0.69       818
   macro avg       0.82      0.54      0.48       818
weighted avg       0.77      0.69      0.59       818



In [74]:
from sklearn.preprocessing import MinMaxScaler #fixed import

scaler = MinMaxScaler()
X_train = scaler.fit_transform(x_train_cbow)
X_test = scaler.transform(x_test_cbow)

In [75]:
nbc_3 = train_and_evaluate_navie_bayes(X_train, X_test, y_train_cbow, y_test_cbow, "CBOW")


Metrics for CBOW:
Model Score: 0.676039119804401
Accuracy: 0.6601466992665037
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       278
           1       0.66      1.00      0.80       540

    accuracy                           0.66       818
   macro avg       0.33      0.50      0.40       818
weighted avg       0.44      0.66      0.53       818



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [76]:
x_train_skg = scaler.fit_transform(x_train_skg)
x_test_skg = scaler.transform(x_test_skg)

In [77]:
nbc_4 = train_and_evaluate_navie_bayes(x_train_skg, x_test_skg, y_train_skg, y_test_skg, "Skip-Gram")


Metrics for Skip-Gram:
Model Score: 0.676039119804401
Accuracy: 0.6601466992665037
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       278
           1       0.66      1.00      0.80       540

    accuracy                           0.66       818
   macro avg       0.33      0.50      0.40       818
weighted avg       0.44      0.66      0.53       818



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [78]:
train_and_evaluate_random_forest(x_train_bow, x_test_bow, y_train_bow, y_test_bow, "BoW")


Metrics for BoW:
Model Score: 1.0
Accuracy: 0.7762836185819071
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.46      0.58       278
           1       0.77      0.94      0.85       540

    accuracy                           0.78       818
   macro avg       0.78      0.70      0.71       818
weighted avg       0.78      0.78      0.76       818



In [50]:
train_and_evaluate_random_forest(X_train_bert, X_test_bert, y_train_bert, y_test_bert, "BERT")


Metrics for BERT:
Model Score: 1.0
Accuracy: 0.7383863080684596
Classification Report:
               precision    recall  f1-score   support

    Negetive       0.77      0.33      0.46       278
    Positive       0.73      0.95      0.83       540

    accuracy                           0.74       818
   macro avg       0.75      0.64      0.64       818
weighted avg       0.75      0.74      0.70       818



In [79]:
train_and_evaluate_random_forest(x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf, "TF-IDF")


Metrics for TF-IDF:
Model Score: 1.0
Accuracy: 0.7701711491442543
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.41      0.55       278
           1       0.76      0.96      0.85       540

    accuracy                           0.77       818
   macro avg       0.79      0.68      0.70       818
weighted avg       0.78      0.77      0.74       818



In [80]:
train_and_evaluate_random_forest(x_train_cbow, x_test_cbow, y_train_cbow, y_test_cbow, "CBOW")


Metrics for CBOW:
Model Score: 1.0
Accuracy: 0.6833740831295844
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.17      0.27       278
           1       0.69      0.95      0.80       540

    accuracy                           0.68       818
   macro avg       0.66      0.56      0.53       818
weighted avg       0.67      0.68      0.62       818



In [81]:
train_and_evaluate_random_forest(x_train_skg, x_test_skg, y_train_skg, y_test_skg, "Skip-Gram")


Metrics for Skip-Gram:
Model Score: 1.0
Accuracy: 0.6858190709046454
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.19      0.30       278
           1       0.69      0.94      0.80       540

    accuracy                           0.69       818
   macro avg       0.66      0.57      0.55       818
weighted avg       0.67      0.69      0.63       818



In [None]:
xg_boost


In [41]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# !pip install xgboost


Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:08[0mm
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [42]:

X = text_vectors
y = tweets['Sentiment']

y = [s.replace('Negetive', 'Negative') for s in y]



NameError: name 'text_vectors' is not defined

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# from sklearn.preprocessing import MinMaxScaler #fixed import

# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(x_train_cbow)
# X_test = scaler.transform(x_test_cbow)