#### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Loading the Dataset

In [None]:
# The CSV has no header row: without header=None pandas would take the first tweet
# as the column names and silently drop it from the data.
df = pd.read_csv('twitter_dataset.csv', header=None)
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


#### Performing Basic EDA

In [None]:
# Give the four columns descriptive names (the file itself carries no usable header).
df.columns = ["tweet id", "entity", "sentiment", "content"]

In [None]:
# Preview the first rows to confirm the new column names.
df.head()

Unnamed: 0,tweet id,entity,sentiment,content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [None]:
df['sentiment'].value_counts()  # number of rows for each distinct sentiment label

sentiment
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [None]:
df.isna().sum()  # missing values per column

tweet id       0
entity         0
sentiment      0
content      686
dtype: int64

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-count missing values per column to confirm none remain after dropna().
df.isnull().sum()

tweet id     0
entity       0
sentiment    0
content      0
dtype: int64

In [None]:
# Drop the tweets labelled 'Irrelevant'; only Positive/Negative/Neutral are given a numeric label.
# (The label was previously misspelled, so the filter matched nothing and kept every row.)
# .copy() gives df1 its own data, so adding columns to it later is not a write into a slice of df.
df1 = df[df['sentiment'] != 'Irrelevant'].copy()

In [None]:
# Numeric encoding for the three sentiment classes that are modelled.
mapping_dict = {
    'Positive': 1,
    'Negative': -1,
    'Neutral': 0
}

# Series.map looks every label up in the dict; a label without an entry becomes NaN.
# assign() returns a new frame, so the column is never written into a filtered
# slice of df (which is what triggers pandas' SettingWithCopyWarning).
df1 = df1.assign(label_sentiment=df1['sentiment'].map(mapping_dict))

In [None]:
# Preview the frame with the new numeric label_sentiment column.
df1.head()

Unnamed: 0,tweet id,entity,sentiment,content,label_sentiment
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,1.0
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,1.0
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,1.0
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,1.0
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,1.0


In [None]:
# Class sizes before balancing (value_counts ignores NaN labels by default).
df1['label_sentiment'].value_counts()

label_sentiment
-1.0    22358
 1.0    20654
 0.0    18108
Name: count, dtype: int64

In [None]:
min_samples = 8000  # number of rows drawn from each class

# Down-sample every class to the same size so the three labels are balanced.
# Each draw uses the same seed; the classes are taken in the order negative, neutral, positive.
df_negative, df_neutral, df_positive = (
    df1.loc[df1['label_sentiment'] == class_label].sample(n=min_samples, random_state=9500)
    for class_label in (-1.0, 0.0, 1.0)
)

In [None]:
# Stack the three equally sized samples row-wise (negative, positive, neutral); original indices are kept.
df_balanced = pd.concat((df_negative, df_positive, df_neutral))

In [None]:
# Preview the balanced frame (its first rows come from the negative sample).
df_balanced.head()

Unnamed: 0,tweet id,entity,sentiment,content,label_sentiment
50864,6330,FIFA,Negative,@EAHelp you rats really like the option to rec...,-1.0
46442,11969,Verizon,Negative,@VerizonSupport BEWARE & vulnerable in time of...,-1.0
58817,3283,Facebook,Negative,I’m tired of scrolling on Facebook .,-1.0
27912,406,ApexLegends,Negative,You know that I understand that people have a ...,-1.0
24020,4517,Google,Negative,"pls copy, rt & spread!.. Hi @Google. We apprec...",-1.0


In [None]:
# Confirm that every class now has the same number of rows.
df_balanced['label_sentiment'].value_counts()

label_sentiment
-1.0    8000
 1.0    8000
 0.0    8000
Name: count, dtype: int64

In [None]:
# (rows, columns) of the balanced frame.
df_balanced.shape

(24000, 5)

In [None]:
# One-time setup: download the large English spaCy pipeline that is loaded in the next cell.
# NOTE(review): this runs whichever `spacy` executable is first on PATH - confirm it belongs to the kernel's environment.
!spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# The preprocessing in this notebook only reads token.lemma_, token.is_stop and token.is_punct.
# Those need the tagger/attribute_ruler/lemmatizer but not the dependency parser or the
# named-entity recogniser, so both are disabled to make every nlp() call cheaper.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

#### Preprocessing the Data

In [None]:
import re

# Matches an HTML-style tag: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed (replaced by the empty string)."""
    return TAG_RE.sub('', text)


def preprocess(text):
    """Normalise one tweet into a space-separated string of lemmas.

    Steps: lowercase, strip ``<...>`` tags, turn every non-letter into a space,
    drop stand-alone single letters, collapse whitespace, then run the text
    through the module-level spaCy pipeline ``nlp`` and keep the lemma of each
    token that is not punctuation, a stop word or whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives the filtering.
    """
    text = text.lower()
    text = remove_tags(text)

    # Punctuation, digits and any other non-letter become a space.
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Drop isolated single letters (e.g. the "s" left behind once an apostrophe is gone).
    # \b matches without consuming the neighbouring spaces, so runs such as "a b c" and
    # letters at the very start or end of the text are removed as well.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Collapse whitespace runs and trim the ends; otherwise spaCy emits whitespace
    # tokens whose "lemma" is a blank, which showed up as double spaces in the output.
    text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(text)
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_space)
    )

In [None]:
# Clean every tweet, one row at a time, and store the result next to the raw text.
df_balanced['preprocessed_text'] = [preprocess(tweet) for tweet in df_balanced['content']]

In [None]:
# Display the balanced frame to compare the raw content with the preprocessed text.
df_balanced

Unnamed: 0,tweet id,entity,sentiment,content,label_sentiment,preprocessed_text
50864,6330,FIFA,Negative,@EAHelp you rats really like the option to rec...,-1.0,eahelp rat like option record clip fifa app ...
46442,11969,Verizon,Negative,@VerizonSupport BEWARE & vulnerable in time of...,-1.0,verizonsupport beware vulnerable time crisis...
58817,3283,Facebook,Negative,I’m tired of scrolling on Facebook .,-1.0,tired scroll facebook
27912,406,ApexLegends,Negative,You know that I understand that people have a ...,-1.0,know understand people big goal pc
24020,4517,Google,Negative,"pls copy, rt & spread!.. Hi @Google. We apprec...",-1.0,pls copy rt spread hi google appreciate fix al...
...,...,...,...,...,...,...
51487,10438,RedDeadRedemption(RDR),Neutral,Come and see me swing my horses!,0.0,come swing horse
66159,6929,johnson&johnson,Neutral,Johnson & Johnson refused to stop advertising ...,0.0,johnson johnson refuse stop advertising sell t...
6327,288,Amazon,Neutral,2010 . It's great to have with Amazon fun and...,0.0,great amazon fun chance win win participate ...
196,2433,Borderlands,Neutral,i enter that gunner seat and i fear for a life,0.0,enter gunner seat fear life


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned tweets for testing. Stratifying on the label keeps the
# class proportions identical in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_text'],
    df_balanced['label_sentiment'],
    stratify=df_balanced['label_sentiment'],
    random_state=2022,
    test_size=0.2,
)

In [None]:
# Report how many samples ended up in the training and test parts.
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (19200,)
Shape of X_test:  (4800,)


#### Model Training and Evaluation

##### Random Forest

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Pipeline: trigram counts (ngram_range=(3, 3) keeps 3-word sequences only) -> random forest.
#    A random forest is stochastic; the fixed random_state makes the fitted model and
#    the report below repeatable from run to run.
clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range=(3, 3))),
    ('random_forest', RandomForestClassifier(random_state=2022))
])

# 2. Fit on the training texts and labels.
clf.fit(X_train, y_train)

# 3. Predict the labels of the held-out texts.
y_pred = clf.predict(X_test)

# 4. Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.92      0.54      0.68      1600
         0.0       0.93      0.55      0.69      1600
         1.0       0.52      0.94      0.67      1600

    accuracy                           0.68      4800
   macro avg       0.79      0.68      0.68      4800
weighted avg       0.79      0.68      0.68      4800



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# 1. Pipeline: TF-IDF features -> random forest.
#    The fixed random_state makes the forest, and therefore the report, repeatable.
clf1 = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Random Forest', RandomForestClassifier(random_state=2022))
])

# 2. Fit on the training texts and labels.
clf1.fit(X_train, y_train)

# 3. Predict the labels of the held-out texts.
y_pred = clf1.predict(X_test)

# 4. Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.85      0.86      0.86      1600
         0.0       0.88      0.81      0.85      1600
         1.0       0.81      0.87      0.84      1600

    accuracy                           0.85      4800
   macro avg       0.85      0.85      0.85      4800
weighted avg       0.85      0.85      0.85      4800



##### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Count features over unigrams and bigrams (ngram_range=(1, 2)) feeding a multinomial Naive Bayes.
clf2 = Pipeline(steps=[
    ('vectorizer_bigrams', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then label the held-out texts (fit returns the pipeline itself).
y_pred = clf2.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

        -1.0       0.80      0.87      0.83      1600
         0.0       0.85      0.79      0.82      1600
         1.0       0.85      0.84      0.85      1600

    accuracy                           0.83      4800
   macro avg       0.83      0.83      0.83      4800
weighted avg       0.83      0.83      0.83      4800



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features feeding a multinomial Naive Bayes.
clf3 = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then label the held-out texts (fit returns the pipeline itself).
y_pred = clf3.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

        -1.0       0.70      0.84      0.76      1600
         0.0       0.79      0.65      0.71      1600
         1.0       0.78      0.76      0.77      1600

    accuracy                           0.75      4800
   macro avg       0.76      0.75      0.75      4800
weighted avg       0.76      0.75      0.75      4800



##### Support Vector Classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# TF-IDF features feeding a support vector classifier with a linear kernel.
clf4 = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear')),
])

# Train on the training split, then label the held-out texts (fit returns the pipeline itself).
y_pred = clf4.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

        -1.0       0.78      0.81      0.80      1600
         0.0       0.77      0.77      0.77      1600
         1.0       0.81      0.78      0.79      1600

    accuracy                           0.79      4800
   macro avg       0.79      0.79      0.79      4800
weighted avg       0.79      0.79      0.79      4800



##### K Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# TF-IDF features feeding a k-nearest-neighbours classifier (10 neighbours, Euclidean distance).
clf5 = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
])

# Train on the training split, then label the held-out texts (fit returns the pipeline itself).
y_pred = clf5.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

        -1.0       0.44      0.97      0.61      1600
         0.0       0.89      0.39      0.54      1600
         1.0       0.92      0.33      0.48      1600

    accuracy                           0.56      4800
   macro avg       0.75      0.56      0.54      4800
weighted avg       0.75      0.56      0.54      4800



##### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# 1. Pipeline: TF-IDF features -> gradient boosting classifier.
#    random_state is fixed so repeated runs build the same trees and give the same report.
clf6 = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=2022))
])

# 2. Fit the pipeline with X_train and y_train
clf6.fit(X_train, y_train)

# 3. Get the predictions for X_test and store it in y_pred
y_pred = clf6.predict(X_test)

# 4. Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        -1.0       0.53      0.81      0.64      1600
         0.0       0.69      0.47      0.56      1600
         1.0       0.70      0.56      0.62      1600

    accuracy                           0.61      4800
   macro avg       0.64      0.61      0.61      4800
weighted avg       0.64      0.61      0.61      4800



##### Trial with LSTM (*needs more improvement*)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Activation, Flatten, GlobalMaxPooling1D, Conv1D

In [None]:
# Define parameters
vocab_size = 10000        # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 100
max_length = 100          # every sequence is padded / truncated to this many tokens
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"         # placeholder token for words outside the fitted vocabulary


# Split the cleaned text FIRST, so the tokenizer vocabulary is learned from the training
# texts only and nothing about the test tweets leaks into the features.
# The seed, test size and stratification are unchanged.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the scikit-learn cells
# above use for the raw-text split; re-run that split before re-running those cells.
text_train, text_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_text'],
    df_balanced['label_sentiment'],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced['label_sentiment'],
)

# Fit the tokenizer on the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(text_train)
word_index = tokenizer.word_index

# Convert both parts to fixed-length integer sequences with the same fitted tokenizer
X_train = pad_sequences(tokenizer.texts_to_sequences(text_train),
                        maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_test = pad_sequences(tokenizer.texts_to_sequences(text_test),
                       maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert labels to numpy array
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# Network: embedding -> bidirectional LSTM (64 units per direction) -> dense(24, relu) -> 1 sigmoid unit.
# NOTE(review): the labels built in this notebook take three values (-1, 0, 1), but a single
# sigmoid output trained with binary_crossentropy only models a 0/1 target. The negative,
# steadily decreasing loss in the training log is the symptom. Likely fix, to be applied
# together across the label, model and prediction cells: shift the labels to 0..2, end the
# network with Dense(3, activation='softmax'), compile with 'sparse_categorical_crossentropy',
# and take np.argmax of the predictions.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64)),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 bidirectional_4 (Bidirecti  (None, 128)               84480     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 24)                3096      
                                                                 
 dense_9 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1087601 (4.15 MB)
Trainable params: 1087601 (4.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model for 12 epochs; verbose=2 prints one line per epoch.
# NOTE(review): the test split is also passed as validation_data, so the validation
# metrics are not independent of the final test evaluation - consider a separate validation split.
history = model.fit(X_train, y_train, epochs=12, validation_data=(X_test, y_test), verbose=2)

Epoch 1/12
600/600 - 30s - loss: -3.5819e+01 - accuracy: 0.3628 - val_loss: -1.4517e+02 - val_accuracy: 0.3975 - 30s/epoch - 51ms/step
Epoch 2/12
600/600 - 27s - loss: -4.0220e+02 - accuracy: 0.4205 - val_loss: -7.2634e+02 - val_accuracy: 0.4090 - 27s/epoch - 44ms/step
Epoch 3/12
600/600 - 26s - loss: -1.2538e+03 - accuracy: 0.4332 - val_loss: -1.6112e+03 - val_accuracy: 0.4000 - 26s/epoch - 44ms/step
Epoch 4/12
600/600 - 27s - loss: -2.5109e+03 - accuracy: 0.4499 - val_loss: -2.9122e+03 - val_accuracy: 0.4246 - 27s/epoch - 45ms/step
Epoch 5/12
600/600 - 27s - loss: -4.1736e+03 - accuracy: 0.4761 - val_loss: -4.4262e+03 - val_accuracy: 0.4452 - 27s/epoch - 45ms/step
Epoch 6/12
600/600 - 27s - loss: -6.2219e+03 - accuracy: 0.4873 - val_loss: -6.2966e+03 - val_accuracy: 0.4360 - 27s/epoch - 44ms/step
Epoch 7/12
600/600 - 27s - loss: -8.5897e+03 - accuracy: 0.4996 - val_loss: -8.2136e+03 - val_accuracy: 0.4675 - 27s/epoch - 44ms/step
Epoch 8/12
600/600 - 27s - loss: -1.1183e+04 - accuracy

In [None]:
# Get predictions for the test set
y_pred = model.predict(X_test)
# NOTE(review): thresholding at 0.5 can only yield 0 or 1, so the class -1 is never
# predicted, while y_test still contains -1, 0 and 1.
y_pred_classes = (y_pred > 0.5).astype("int32")

# Print classification report
print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00      1600
         0.0       0.38      0.82      0.52      1600
         1.0       0.74      0.63      0.68      1600

    accuracy                           0.48      4800
   macro avg       0.37      0.48      0.40      4800
weighted avg       0.37      0.48      0.40      4800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Saving the model with better performance

In [None]:
import pickle

# Persist the fitted count-vectorizer + Naive Bayes pipeline (clf2) as a pickle file.
# NOTE(review): only load this file from a trusted source - unpickling can execute arbitrary code.
with open('sentiment_model_twitter.pkl', mode='wb') as model_file:
    pickle.dump(clf2, model_file)

In [None]:
# Export the balanced, preprocessed frame; index=False leaves the row index out of the file.
df_balanced.to_csv('cleaned_twitter_data.csv',index=False)