# Sentiment of Analysis of Tweets

---

## 1. Pre-Processing the text

### Import required libraries

In [1]:
import re
import string
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import gensim.downloader as api

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM

from sklearn.metrics import classification_report




### Read Dataset

In [2]:
df = pd.read_csv('Datasets/sentiment140.csv')
df.head()

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY


### Check Label Balance

In [3]:
print(df['sentiment'].value_counts())

0    800000
4    800000
Name: sentiment, dtype: int64


### Pre-process text

In [4]:
def preprocess_text(text):
    text = re.sub(r'http\S+', 'URL', text)
    text = re.sub(r'@\w+', 'MENTION', text)
    text = re.sub(r'#\w+', 'HASHTAG', text)

    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['text'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,text,date,user,sentiment,query
0,MENTION URL Awww thats a bummer You shoulda ...,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he cant update his Facebook by t...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,MENTION I dived many times for the ball Manage...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,MENTION no its not behaving at all im mad why ...,Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY


### Tokenization and Lemmatization

In [5]:
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

df['text'] = df['text'].apply(tokenize_and_lemmatize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,date,user,sentiment,query
0,"[MENTION, URL, Awww, thats, a, bummer, You, sh...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,"[is, upset, that, he, cant, update, his, Faceb...",Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,"[MENTION, I, dived, many, time, for, the, ball...",Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,"[my, whole, body, feel, itchy, and, like, it, ...",Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"[MENTION, no, it, not, behaving, at, all, im, ...",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY


### Label Conversion

In [6]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 4 else 0)

print(set(df['sentiment']))

{0, 1}


### Print first 20 tweets

In [7]:
print(df['text'].head(20))

0     [MENTION, URL, Awww, thats, a, bummer, You, sh...
1     [is, upset, that, he, cant, update, his, Faceb...
2     [MENTION, I, dived, many, time, for, the, ball...
3     [my, whole, body, feel, itchy, and, like, it, ...
4     [MENTION, no, it, not, behaving, at, all, im, ...
5                      [MENTION, not, the, whole, crew]
6                                        [Need, a, hug]
7     [MENTION, hey, long, time, no, see, Yes, Rains...
8                [MENTION, nope, they, didnt, have, it]
9                             [MENTION, que, me, muera]
10        [spring, break, in, plain, city, it, snowing]
11                        [I, just, repierced, my, ear]
12    [MENTION, I, couldnt, bear, to, watch, it, And...
13    [MENTION, It, it, count, idk, why, I, did, eit...
14    [MENTION, i, wouldve, been, the, first, but, i...
15    [MENTION, I, wish, I, got, to, watch, it, with...
16    [Hollis, death, scene, will, hurt, me, severel...
17                               [about, to, fil

### Train Test Split

In [8]:
X = df['text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

---

## 2. Vectorization and Padding

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = max([len(seq) for seq in X_train_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

---

## 3. Word Embedding

In [10]:
w2v_model = api.load('word2vec-google-news-300')

embedding_dim = 300
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in w2v_model:
        embedding_matrix[i] = w2v_model[word]

---

## 4. RNN Model

In [11]:
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 58, 300)           113484300 
                                                                 
 lstm (LSTM)                 (None, 128)               219648    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 113704077 (433.75 MB)
Trainable params: 219777 (858.50 KB)
Non-trainable params: 113484300 (432.91 MB)
_________________________________________________________________


### Train the model

In [12]:
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=512, validation_data=(X_test_padded, y_test))

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Reports

In [13]:
y_pred = (model.predict(X_test_padded) > 0.5).astype("int64")

report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'], zero_division=1)
print(report)

              precision    recall  f1-score   support

    Negative       0.82      0.83      0.82    159494
    Positive       0.83      0.83      0.83    160506

    accuracy                           0.83    320000
   macro avg       0.83      0.83      0.83    320000
weighted avg       0.83      0.83      0.83    320000

