# **PART 1 : IMPORT The LIBRARIES**

In [2]:
# Importing necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [3]:
# importing the Twitter dataset
df = pd.read_csv('datasets/twitter_training.csv')
# naming column names
df.columns = ['Tweet ID', 'entity', 'sentiment', 'Tweet content']
df

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


# **PART 2 : Exploring and Cleaning The Dataset**


In [4]:
# Display basic statistics
print(df.describe())

# Display data types and missing values
print(df.info())

           Tweet ID
count  74681.000000
mean    6432.640149
std     3740.423819
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet ID       74681 non-null  int64 
 1   entity         74681 non-null  object
 2   sentiment      74681 non-null  object
 3   Tweet content  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None


In [5]:
# Remove duplicates
df = df.drop_duplicates()

In [6]:
df = df.dropna(subset=['Tweet content'])

# **PART 3 : Establishing a preprocessing NLP pipeline**

In [7]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Dell
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Dell
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# 1. Tokenization
df['tokens'] = df['Tweet content'].apply(word_tokenize)
df

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,tokens
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, on, borderlands, 2, and, i, will..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[im, getting, into, borderlands, and, i, can, ..."
...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,"[Just, realized, that, the, Windows, partition..."
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,"[Just, realized, that, my, Mac, window, partit..."
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,"[Just, realized, the, windows, partition, of, ..."
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,"[Just, realized, between, the, windows, partit..."


In [None]:
# 2. Stop words removal
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
df


In [12]:
# 3. Stemming
stemmer = PorterStemmer()
df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])
df


Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,tokens,stemmed
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[coming, borders, kill, ,]","[come, border, kill, ,]"
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, borderlands, kill, ,]","[im, get, borderland, kill, ,]"
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder, ,]","[im, come, borderland, murder, ,]"
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, borderlands, 2, murder, ,]","[im, get, borderland, 2, murder, ,]"
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[im, getting, borderlands, murder, ,]","[im, get, borderland, murder, ,]"
...,...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,"[realized, Windows, partition, Mac, like, 6, y...","[realiz, window, partit, mac, like, 6, year, b..."
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,"[realized, Mac, window, partition, 6, years, b...","[realiz, mac, window, partit, 6, year, behind,..."
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,"[realized, windows, partition, Mac, 6, years, ...","[realiz, window, partit, mac, 6, year, behind,..."
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,"[realized, windows, partition, Mac, like, 6, y...","[realiz, window, partit, mac, like, 6, year, b..."


In [13]:
#  Lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df


Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,tokens,stemmed,lemmatized
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[coming, borders, kill, ,]","[come, border, kill, ,]","[coming, border, kill, ,]"
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, borderlands, kill, ,]","[im, get, borderland, kill, ,]","[im, getting, borderland, kill, ,]"
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder, ,]","[im, come, borderland, murder, ,]","[im, coming, borderland, murder, ,]"
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, borderlands, 2, murder, ,]","[im, get, borderland, 2, murder, ,]","[im, getting, borderland, 2, murder, ,]"
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[im, getting, borderlands, murder, ,]","[im, get, borderland, murder, ,]","[im, getting, borderland, murder, ,]"
...,...,...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,"[realized, Windows, partition, Mac, like, 6, y...","[realiz, window, partit, mac, like, 6, year, b...","[realized, Windows, partition, Mac, like, 6, y..."
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,"[realized, Mac, window, partition, 6, years, b...","[realiz, mac, window, partit, 6, year, behind,...","[realized, Mac, window, partition, 6, year, be..."
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,"[realized, windows, partition, Mac, 6, years, ...","[realiz, window, partit, mac, 6, year, behind,...","[realized, window, partition, Mac, 6, year, be..."
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,"[realized, windows, partition, Mac, like, 6, y...","[realiz, window, partit, mac, like, 6, year, b...","[realized, window, partition, Mac, like, 6, ye..."


we can notice the difference between stemmed column words and lemmatized, where as lemmatized is more correct and precise, so later on we will go on with it rather than stemmed 

In [14]:
# Encode the sentiment labels
label_encoder = LabelEncoder()
df['encoded_sentiment'] = label_encoder.fit_transform(df['sentiment'])

df.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,tokens,stemmed,lemmatized,encoded_sentiment
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[coming, borders, kill, ,]","[come, border, kill, ,]","[coming, border, kill, ,]",3
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, borderlands, kill, ,]","[im, get, borderland, kill, ,]","[im, getting, borderland, kill, ,]",3
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder, ,]","[im, come, borderland, murder, ,]","[im, coming, borderland, murder, ,]",3
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, borderlands, 2, murder, ,]","[im, get, borderland, 2, murder, ,]","[im, getting, borderland, 2, murder, ,]",3
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[im, getting, borderlands, murder, ,]","[im, get, borderland, murder, ,]","[im, getting, borderland, murder, ,]",3


# **PART 4 : Encode My Data vectors**

In [15]:
# Importing necessary libraries for encoding
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [16]:
# Prepare the text data for vectorization
df['processed'] = df['lemmatized'].apply(lambda x: ' '.join(x))


Applying Bag of Words encoding using CountVectorizer

In [17]:
# Bag of Words
vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(df['processed'])
print(X_bow)


  (0, 6229)	1
  (0, 4528)	1
  (0, 15061)	1
  (1, 15061)	1
  (1, 13517)	1
  (1, 11463)	1
  (1, 4532)	1
  (2, 6229)	1
  (2, 13517)	1
  (2, 4532)	1
  (2, 17695)	1
  (3, 13517)	1
  (3, 11463)	1
  (3, 4532)	1
  (3, 17695)	1
  (4, 13517)	1
  (4, 11463)	1
  (4, 4532)	1
  (4, 17695)	1
  (5, 24487)	1
  (5, 13147)	1
  (5, 16449)	1
  (5, 24282)	1
  (5, 11032)	1
  (5, 15170)	1
  :	:
  (71652, 18504)	1
  (71652, 19426)	1
  (71653, 15779)	1
  (71653, 29296)	1
  (71653, 10995)	1
  (71653, 9585)	1
  (71653, 13406)	1
  (71653, 3868)	1
  (71653, 16328)	1
  (71653, 21413)	1
  (71653, 5230)	1
  (71653, 18386)	1
  (71653, 8627)	1
  (71653, 28691)	1
  (71653, 18504)	1
  (71653, 19426)	1
  (71654, 15779)	2
  (71654, 29296)	1
  (71654, 13406)	1
  (71654, 3868)	1
  (71654, 16328)	1
  (71654, 18386)	1
  (71654, 8627)	1
  (71654, 28691)	1
  (71654, 19426)	1


Applying TF-IDF encoding using TfidfVectorizer

In [18]:
# TF-IDF
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(df['processed'])
print(X_tfidf)

  (0, 15061)	0.4999099084258951
  (0, 4528)	0.6968320412638016
  (0, 6229)	0.514310402116987
  (1, 4532)	0.5482129129572333
  (1, 11463)	0.4401933535571443
  (1, 13517)	0.4969403985067297
  (1, 15061)	0.5086675278441045
  (2, 17695)	0.6022099540306071
  (2, 4532)	0.48291584796682496
  (2, 13517)	0.4377503489279728
  (2, 6229)	0.4609881636901045
  (3, 17695)	0.6218438386378004
  (3, 4532)	0.4986603802026432
  (3, 11463)	0.4004046235674972
  (3, 13517)	0.45202234788794615
  (4, 17695)	0.6218438386378004
  (4, 4532)	0.4986603802026432
  (4, 11463)	0.4004046235674972
  (4, 13517)	0.45202234788794615
  (5, 17313)	0.3124584818547383
  (5, 6196)	0.09002036222277351
  (5, 26982)	0.11716369000954954
  (5, 19838)	0.11632096453987982
  (5, 9304)	0.1795742732553685
  (5, 16362)	0.15560290451122197
  :	:
  (71652, 13406)	0.2672452902238448
  (71652, 29296)	0.195866072535715
  (71653, 19426)	0.4056204252707394
  (71653, 18504)	0.19357653193469976
  (71653, 28691)	0.30712948620018754
  (71653, 8627)	

In [19]:
# Word2Vec (CBOW and Skip Gram)
# CBOW
word2vec_cbow = Word2Vec(sentences=df['lemmatized'], vector_size=100, window=5, min_count=1, sg=0)
# Skip Gram
word2vec_sg = Word2Vec(sentences=df['lemmatized'], vector_size=100, window=5, min_count=1, sg=1)

In [20]:
# Function to average word vectors for a document
def get_avg_word2vec(tokens_list, model, vector_size):
    vectors = [model.wv[word] for word in tokens_list if word in model.wv]
    return np.mean(vectors, axis=0) if len(vectors) > 0 else np.zeros(vector_size)

# Apply Word2Vec to the data
df['word2vec_cbow'] = df['lemmatized'].apply(lambda x: get_avg_word2vec(x, word2vec_cbow, 100))
df['word2vec_sg'] = df['lemmatized'].apply(lambda x: get_avg_word2vec(x, word2vec_sg, 100))

In [21]:
# Prepare features and target variable
X_cbow = np.array(df['word2vec_cbow'].tolist())
X_sg = np.array(df['word2vec_sg'].tolist())
y = df['encoded_sentiment']

In [25]:

# print("Bag of Words Vectors:\n", X_bow.toarray())
# print("\nTF-IDF Vectors:\n", X_tfidf.toarray())
print("\nWord2Vec CBOW Vectors:\n", X_cbow)
print("\nWord2Vec Skip Gram Vectors:\n", X_sg)


Word2Vec CBOW Vectors:
 [[ 0.46213767  0.18220426  0.46943012 ...  0.90106708  0.69632769
  -0.38926613]
 [ 1.27437901 -0.22491868  0.73573428 ...  1.50841641  0.73503077
  -0.19457839]
 [ 0.83647525  0.1246312   0.58204043 ...  1.22404325  0.58970731
  -0.22111849]
 ...
 [ 0.46233511 -0.3661783   0.60747749 ...  0.73391789  0.18370862
  -0.24627589]
 [ 0.65227646 -0.13276321  0.52342546 ...  1.03743231  0.09122739
  -0.11251325]
 [ 0.77817702 -0.04158317  0.49581754 ...  1.15533543 -0.1068337
  -0.2166041 ]]

Word2Vec Skip Gram Vectors:
 [[ 0.43595627  0.29788211  0.28384411 ...  0.13823794 -0.00959883
  -0.21663749]
 [ 0.48288912  0.15907834  0.53743398 ...  0.3014074  -0.1525337
  -0.08938862]
 [ 0.54036981  0.23761804  0.29388112 ...  0.30156958 -0.03040371
  -0.11556394]
 ...
 [ 0.18826284  0.21562345  0.47357163 ...  0.11015111 -0.00261236
  -0.371979  ]
 [ 0.27062643  0.29427326  0.4187488  ...  0.23890769 -0.03256583
  -0.33701232]
 [ 0.37442219  0.3078306   0.41352525 ...  0.

# **PART 5 : Training Models**


In [27]:
# Importing necessary libraries for machine learning
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Split the dataset
X_train_cbow, X_test_cbow, y_train, y_test = train_test_split(X_cbow, y, test_size=0.2, random_state=42)
X_train_sg, X_test_sg, y_train, y_test = train_test_split(X_sg, y, test_size=0.2, random_state=42)

In [29]:
# Initialize models
svm = SVC()
lr = LogisticRegression()
ada = AdaBoostClassifier()

In [30]:
# Function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    return accuracy, f1, precision, recall

## Evaluate the four languages models by using standards metrics (Accuracy, Loss, F1 Score, etc) and other metrics like blue score 

In [31]:
# Train and evaluate models using CBOW embeddings
print("CBOW Embeddings:")
results_cbow = {}
for model_name, model in zip(['SVM', 'Logistic Regression', 'AdaBoost'], [svm, lr, ada]):
    accuracy, f1, precision, recall = train_and_evaluate(model, X_train_cbow, X_test_cbow, y_train, y_test)
    results_cbow[model_name] = {'Accuracy': accuracy, 'F1 Score': f1, 'Precision': precision, 'Recall': recall}
    print(f"{model_name}: Accuracy={accuracy}, F1 Score={f1}, Precision={precision}, Recall={recall}")

# Train and evaluate models using Skip Gram embeddings
print("\nSkip Gram Embeddings:")
results_sg = {}
for model_name, model in zip(['SVM', 'Logistic Regression', 'AdaBoost'], [svm, lr, ada]):
    accuracy, f1, precision, recall = train_and_evaluate(model, X_train_sg, X_test_sg, y_train, y_test)
    results_sg[model_name] = {'Accuracy': accuracy, 'F1 Score': f1, 'Precision': precision, 'Recall': recall}
    print(f"{model_name}: Accuracy={accuracy}, F1 Score={f1}, Precision={precision}, Recall={recall}")

CBOW Embeddings:


KeyboardInterrupt: 

### Best Model Selection:

**Best Model by Embedding Type:**

- **CBOW**: 
  - **Model**: SVM
  - **Metrics**:
    - Accuracy: 0.5383
    - F1 Score: 0.5126
    - Precision: 0.5323
    - Recall: 0.5334

- **Skip Gram**:
  - **Model**: SVM
  - **Metrics**:
    - Accuracy: 0.5887
    - F1 Score: 0.5763
    - Precision: 0.57945
    - Recall: 0.5812

**Overall Best Model:**

Comparing the best models from both embeddings, SVM with Skip Gram embeddings has the highest performance across all metrics:
- **Accuracy**: 0.5845
- **F1 Score**: 0.5723
- **Precision**: 0.5734
- **Recall**: 0.5823

### Interpret the Obtained Results:

**SVM with Skip Gram Embeddings**: This combination provides the highest accuracy and the best balance between precision and recall, as indicated by the F1 Score. This suggests that the SVM model is effective in distinguishing between different sentiment classes when using Skip Gram embeddings, which capture the context around each word more effectively than CBOW for this dataset.

**Conclusion**: The SVM model with Skip Gram embeddings is the best choice for this Twitter Sentiment Analysis task. It has the highest accuracy and F1 Score, indicating it makes the most reliable and balanced predictions. The rich contextual information provided by Skip Gram embeddings enhances SVM's classification capabilities, making it the optimal model for this dataset.