# Tweet Sentiment Analysis

This project uses Machine Learning models to classify tweets into a labelled sentiment using the Twitter Sentiment Analysis dataset from Kaggle (https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis?resource=download) which focuses on Entity-level sentiment analysis on multi-lingual tweets.

### Importing libraries

First we import the libraries we will need in the project and check the dataset.

In [30]:
import pandas as pd
import numpy as np
import nltk
from nltk.collocations import *
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
from collections import defaultdict
import re
import string
import itertools as it
import emoji
import fileinput
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

The NLTK resources will be downloaded (which is why this cell should only be run *once*) and used to tokenize the words in each tweet.

In [None]:
#Download resources
#pre-trained sentence tokenizer
nltk.download('punkt')
#list of stopwords
nltk.download('stopwords')

## Exploratory Data Analysis

We begin by looking at our data so we know what we have to work with and what we will cleanup. We begin our EDA by opening both of our datasets as Pandas Dataframes.

In [None]:
df = pd.read_csv('twitter_training.csv', sep=',', names=["Tweet ID", "Entity", "Sentiment", "Tweet Content"])
df.head()
valid_df = pd.read_csv('twitter_validation.csv', sep=',', names=["Tweet ID", "Entity", "Sentiment", "Tweet Content"])

In [None]:
df["Tweet ID"] = range(1, len(df) + 1)
df["Tweet ID"] = range(1, len(valid_df) + 1)
print(df.head(), valid_df.head())

In [None]:
df['Tweet Content'][0]

### Data cleaning

We begin our cleaning by dropping any duplicate and NAN values present in both dataframes.

In [None]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
valid_df.dropna(inplace=True)
valid_df.drop_duplicates(inplace=True)

In [None]:
df['Sentiment'].replace(to_replace='Irrelevant', value='Neutral', inplace=True)
valid_df['Sentiment'].replace(to_replace='Irrelevant', value='Neutral', inplace=True)
print(set(df['Sentiment']))

In [None]:
non_strings_mask = pd.to_numeric(df['Tweet Content'], errors='coerce').isna()
df.loc[non_strings_mask]
non_strings_mask = pd.to_numeric(valid_df['Tweet Content'], errors='coerce').isna()
valid_df.loc[non_strings_mask]

In [11]:
#remove urls & special characters
def remove_urls(text):
    """Berilgan matndan URL larini o'chiradi"""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_emojis(text):
    """Berilgan matndan emojilarni o'chiradi"""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emojilar
                               u"\U0001F300-\U0001F5FF"  # simvollar va diagrammalar
                               u"\U0001F680-\U0001F6FF"  # transport va turli joylar
                               u"\U0001F1E0-\U0001F1FF"  # davlat bayroqlari
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251"  # alamatchilik belgilari
                               u"\U0001f300-\U0001f650"  
                               u"\u2000-\u3000" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['Tweet Content'] = df['Tweet Content'].apply(lambda x: remove_emojis(x))
df['Tweet Content'] = df['Tweet Content'].apply(lambda x: remove_urls(x))

df['Tweet Content'] = df['Tweet Content'].apply(str.lower)

stopwords_list = stopwords.words('english')
stopwords_list += ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
stopwords_list += list(string.punctuation)
stopwords_list += ['....','...', '..', '’', "''", '``', '-', "'", "([a-zA-Z]+(?:'[a-z]+)?)"]

df.tail()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
74677,74678,Nvidia,Positive,just realized that the windows partition of my...
74678,74679,Nvidia,Positive,just realized that my mac window partition is ...
74679,74680,Nvidia,Positive,just realized the windows partition of my mac ...
74680,74681,Nvidia,Positive,just realized between the windows partition of...
74681,74682,Nvidia,Positive,just like the windows partition of my mac is l...


In [7]:
word_tokenize(df['Tweet Content'][74677])

['Just',
 'realized',
 'that',
 'the',
 'Windows',
 'partition',
 'of',
 'my',
 'Mac',
 'is',
 'like',
 '6',
 'years',
 'behind',
 'Nvidia',
 'drivers',
 'and',
 'I',
 'have',
 'no',
 'idea',
 'how',
 'I',
 'did',
 'not',
 'notice']

In [6]:
#Defining entity dictionary
entity_dict = { 
                "RedDeadRedemption(RDR)" : ['rdr', 'red dead redemption', 'red dead'], 
                "Microsoft": ['microsoft'],
                "Xbox(XSeries)":['xbox', 'series x', 'series s', 'xbox one', 'xseries'], 
                "AssassinsCreed": ['assassinscreed', 'assassins creed'], 
                "CallOfDutyBlackopsColdWar": ['black ops', 'cold war', 'callOfdutyblackopscoldWar'],
                "FIFA": ['fifa'],
                "TomClancysGhostRecon": ['ghost recon', 'ghostrecon'],
                "Google": ['google'],
                "PlayStation(PS)": ['ps5', 'playstation', 'ps4', 'PS'],
                "Facebook": ['facebook'],
                "GrandTheftAuto(GTA)": ['gta', 'grand theft auto'],
                "PlayerUnknownsBattlegrounds(PUBG)": ['pubg', 'player unknowns battlegrounds', 'PlayerUnknownsBattlegrounds'],
                "Hearthstone": ['hearthstone'],
                "MaddenNFL": ['madden'],
                "CallOfDuty": ['modern warfare', 'call of duty', 'cod'],
                "Fortnite": ['fortnitegame', 'fortnite'],
                "Verizon": ['verizon'],
                "Nvidia": ['nvidia'],
                "Amazon": ['amazon'],
                "WorldOfCraft": ['wow', 'world of warcraft'],
                "ApexLegends": ['apex legends', 'apex', 'apexlegends'],
                "CS-GO": ['csgo', 'counter strike'],
                "johnson&johnson": ['johnson&johnson', 'johnson & johnson'],
                "HomeDepot": ['homedepot', 'home depot'],
                "NBA2K": ['nba'],
                "Overwatch": ['overwatch'],
                "LeagueOfLegends": ['lol', 'league of legends'],
                "Borderlands": ['borderlands'],
                "TomClancysRainbowSix": ['rainbow six', 'rainbow six siege', 'rainbowsix'],
                "Dota": ['dota'],
                "Battlefield": ['battlefield'],
                "Cyberpunk2077": ['cyberpunkgame', 'cyberpunk2077', 'cyberpunk'],
                "NintendoSwitch": ['nintendo switch', 'nintendo'],
                "Windows": ['windows', 'window']
              }

In [12]:
def tokenize_text(text):
    return word_tokenize(text)

df['Tokenized Text'] = df['Tweet Content'].apply(tokenize_text)


df['Tokenized Text'] = [word for word in df['Tokenized Text'] if word not in stopwords_list]
df['Tokenized Text'] = [word for word in df['Tokenized Text'] if word not in stopwords_list]
df.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content,Tokenized Text
0,1,Borderlands,Positive,im getting on borderlands and i will murder yo...,"[im, getting, on, borderlands, and, i, will, m..."
1,2,Borderlands,Positive,i am coming to the borders and i will kill you...,"[i, am, coming, to, the, borders, and, i, will..."
2,3,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k..."
3,4,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu..."
4,5,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, on, borderlands, 2, and, i, will..."


In [13]:
df['Tokenized Text'][0]

['im',
 'getting',
 'on',
 'borderlands',
 'and',
 'i',
 'will',
 'murder',
 'you',
 'all',
 ',']

## Creating a Long Short-Term Memory (LTSM) Model

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Tokenized Text'])
sequences = tokenizer.texts_to_sequences(df['Tokenized Text'])
X = pad_sequences(sequences)

In [15]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Sentiment'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = X.shape[1]
embedding_dim = 100
lstm_units = 128

In [18]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

In [19]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
y_train_encoded = np.eye(len(label_encoder.classes_))[y_train]
y_test_encoded = np.eye(len(label_encoder.classes_))[y_test]

In [23]:
batch_size = 64
epochs = 10
model.fit(X_train, y_train_encoded, batch_size=batch_size, epochs=epochs, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2210f63b7f0>

In [24]:
loss, accuracy = model.evaluate(X_test, y_test_encoded)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.4390312433242798, Test Accuracy: 0.8862162232398987


In [27]:
model.save('tweetsentiment_lstm_model.h5')

In [None]:
loaded_model = load_model("tweetsentiment_lstm_model.h5")

In [None]:
new_sequences = tokenizer.texts_to_sequences(new_data)
new_X = pad_sequences(new_sequences, maxlen=max_sequence_length)