In [1]:
import pandas as pd
import numpy as np 
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources used further down in this notebook.
# The last call is left as a bare expression, so its return value is what the cell displays.
nltk.download('punkt')      # presumably the tokenizer data behind word_tokenize — confirm for the installed NLTK version
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('wordnet')    # presumably the lexical database behind WordNetLemmatizer — confirm

[nltk_data] Downloading package punkt to
[nltk_data]     /home/ayoubbakkali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ayoubbakkali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ayoubbakkali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read the CSV with explicit column names, then keep only the sentiment label and the tweet text.
dataset = (
    pd.read_csv("./twitter_training.csv", names=["id", "entity", "sentiment", "Tweet Content"])
    .drop(columns=["id", "entity"])
)
dataset.head()

Unnamed: 0,sentiment,Tweet Content
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# Encode the sentiment strings as integer class ids; the fitted encoder is kept
# in `labelencoder` so the ids can be mapped back to their names later.
labelencoder = LabelEncoder()
labels = labelencoder.fit_transform(dataset["sentiment"])
print(labels)

[3 3 3 ... 3 3 3]


# 1. NLP Pipeline

In [4]:
def clean_tweet(text):
    """Replace every character that is not an ASCII letter with a space.

    Each offending character becomes exactly one space, so the result has
    the same length as the input. Anything that is not a ``str`` yields an
    empty string.
    """
    return re.sub(r'[^a-zA-Z]', ' ', text) if isinstance(text, str) else ''

# Text cleaning: strip non-letters, lowercase, then split each tweet into tokens
def _tokenize_tweet(raw_tweet):
    """Clean one raw tweet, lowercase it, and return its list of tokens."""
    return word_tokenize(clean_tweet(raw_tweet).lower())

dataset["tokens"] = dataset["Tweet Content"].apply(_tokenize_tweet)
dataset

Unnamed: 0,sentiment,Tweet Content,tokens
0,Positive,im getting on borderlands and i will murder yo...,"[im, getting, on, borderlands, and, i, will, m..."
1,Positive,I am coming to the borders and I will kill you...,"[i, am, coming, to, the, borders, and, i, will..."
2,Positive,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k..."
3,Positive,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu..."
4,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, on, borderlands, and, i, will, m..."
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,"[just, realized, that, the, windows, partition..."
74678,Positive,Just realized that my Mac window partition is ...,"[just, realized, that, my, mac, window, partit..."
74679,Positive,Just realized the windows partition of my Mac ...,"[just, realized, the, windows, partition, of, ..."
74680,Positive,Just realized between the windows partition of...,"[just, realized, between, the, windows, partit..."


In [5]:
# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def _drop_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, keeping their order."""
    return [token for token in tokens if token not in stop_words]

# Removing StopWords
dataset["tokens"] = dataset["tokens"].apply(_drop_stopwords)
dataset

Unnamed: 0,sentiment,Tweet Content,tokens
0,Positive,im getting on borderlands and i will murder yo...,"[im, getting, borderlands, murder]"
1,Positive,I am coming to the borders and I will kill you...,"[coming, borders, kill]"
2,Positive,im getting on borderlands and i will kill you ...,"[im, getting, borderlands, kill]"
3,Positive,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder]"
4,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, borderlands, murder]"
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,"[realized, windows, partition, mac, like, year..."
74678,Positive,Just realized that my Mac window partition is ...,"[realized, mac, window, partition, years, behi..."
74679,Positive,Just realized the windows partition of my Mac ...,"[realized, windows, partition, mac, years, beh..."
74680,Positive,Just realized between the windows partition of...,"[realized, windows, partition, mac, like, year..."


In [6]:
lemmatizer = WordNetLemmatizer()

def _lemmatize_tokens(tokens):
    """Lemmatize each token with `lemmatizer`, passing no part-of-speech argument."""
    return list(map(lemmatizer.lemmatize, tokens))

# Replace every token list with its lemmatized counterpart
dataset["tokens"] = dataset["tokens"].apply(_lemmatize_tokens)
dataset

Unnamed: 0,sentiment,Tweet Content,tokens
0,Positive,im getting on borderlands and i will murder yo...,"[im, getting, borderland, murder]"
1,Positive,I am coming to the borders and I will kill you...,"[coming, border, kill]"
2,Positive,im getting on borderlands and i will kill you ...,"[im, getting, borderland, kill]"
3,Positive,im coming on borderlands and i will murder you...,"[im, coming, borderland, murder]"
4,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, borderland, murder]"
...,...,...,...
74677,Positive,Just realized that the Windows partition of my...,"[realized, window, partition, mac, like, year,..."
74678,Positive,Just realized that my Mac window partition is ...,"[realized, mac, window, partition, year, behin..."
74679,Positive,Just realized the windows partition of my Mac ...,"[realized, window, partition, mac, year, behin..."
74680,Positive,Just realized between the windows partition of...,"[realized, window, partition, mac, like, year,..."


# Word2Vec (CBOW and Skip-gram)

In [7]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Train Word2Vec models (CBOW and Skip-gram).
# Both share every hyper-parameter; only `sg` differs (0 for the CBOW model, 1 for the Skip-gram model).
w2v_params = dict(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1)
cbow_model = Word2Vec(sg=0, **w2v_params)
skipgram_model = Word2Vec(sg=1, **w2v_params)

In [8]:
def get_sentence_embedding(sentence, model):
    """Average the word vectors of a tokenized sentence.

    Args:
        sentence: iterable of tokens.
        model: object exposing `wv` (supports `in` and item lookup by token)
            and `vector_size`.

    Returns:
        The element-wise mean of the vectors of the tokens found in
        `model.wv`, or a zero vector of length `model.vector_size` when
        none of the tokens is found.
    """
    keyed_vectors = model.wv
    # Tokens missing from the vocabulary are skipped
    known = [keyed_vectors[token] for token in sentence if token in keyed_vectors]
    if known:
        return np.mean(known, axis=0)
    # Nothing to average: fall back to an all-zero embedding
    return np.zeros(model.vector_size)


def _embed_all(model):
    """Build an array with one sentence embedding per row of dataset['tokens']."""
    return np.array([get_sentence_embedding(tokens, model) for tokens in dataset['tokens']])

cbow_vectors = _embed_all(cbow_model)
skipgram_vectors = _embed_all(skipgram_model)

# Both lengths should match the number of tweets
print(len(cbow_vectors), len(skipgram_vectors), sep="\n")

74682
74682


# Bag of Words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# The vectorizer is fed strings here, so each token list is joined back into one space-separated document
bow_documents = dataset['tokens'].apply(' '.join)

# Learn the vocabulary and build the document-term count matrix in one pass
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(bow_documents)
bow_matrix.shape

(74682, 26649)

# TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join each token list back into one space-separated document for the vectorizer
tfidf_documents = dataset['tokens'].apply(' '.join)

# Learn the vocabulary and IDF weights, then build the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_documents)
tfidf_matrix

<74682x26649 sparse matrix of type '<class 'numpy.float64'>'
	with 768575 stored elements in Compressed Sparse Row format>

# Model training

In [11]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Targets: the integer-encoded sentiment labels
y = np.array(labels)
# Features: the CBOW sentence embeddings, rescaled column-wise into [0, 1]
# (presumably so that MultinomialNB, used below, receives no negative values).
# NOTE(review): the scaler is fitted on every row before the train/test split, so
# test-set statistics influence the scaling — consider fitting on the training rows only.
X = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(cbow_vectors))
print(X.shape, y.shape, sep="\n")

(74682, 100)
(74682,)


In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Support Vector Classifier with default hyper-parameters, trained on X_train
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_prediction = svc_model.predict(X_test)
# Weighted F1 averages the per-class F1 scores weighted by class support
svc_f1 = f1_score(y_test, svc_prediction, average='weighted')
svc_accu = accuracy_score(y_test, svc_prediction)
# The labels used to read "AdaBoost", which misattributed these SVC scores
print("SVC F1 Score:", svc_f1)
print("SVC Accuracy:", svc_accu)

AdaBoost F1 Score: 0.5489015180444997
AdaBoost Accuracy: 0.5675838521791524


In [14]:
# AdaBoost ensemble with 200 boosting rounds.
# random_state is pinned (same seed as the train/test split) so that re-running
# the notebook reproduces the same scores.
adaboost_model = AdaBoostClassifier(n_estimators=200, random_state=42)
adaboost_model.fit(X_train, y_train)
adaboost_prediction = adaboost_model.predict(X_test)
# Weighted F1 averages the per-class F1 scores weighted by class support
adaboost_f1 = f1_score(y_test, adaboost_prediction, average='weighted')
adaboost_accu = accuracy_score(y_test, adaboost_prediction)
print("AdaBoost F1 Score:", adaboost_f1)
print("AdaBoost Accuracy:", adaboost_accu)



AdaBoost F1 Score: 0.5014179463797437
AdaBoost Accuracy: 0.5134230434491531


In [20]:
# Naive Bayes Model
nb_model = MultinomialNB().fit(X_train, y_train)
nb_prediction = nb_model.predict(X_test)

# Score the held-out predictions: plain accuracy plus support-weighted F1
nb_accu, nb_f1 = (
    accuracy_score(y_test, nb_prediction),
    f1_score(y_test, nb_prediction, average='weighted'),
)
print("Naive Bayes Accuracy:", nb_accu)
print("Naive Bayes F1 Score:", nb_f1)

Naive Bayes Accuracy: 0.41219789783758454
Naive Bayes F1 Score: 0.3308435498004824


In [21]:
# Logistic Regression Model (iteration cap set to 2000)
lr_model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
lr_prediction = lr_model.predict(X_test)

# Score the held-out predictions: plain accuracy plus support-weighted F1
lr_accu = accuracy_score(y_test, lr_prediction)
lr_f1 = f1_score(y_test, lr_prediction, average='weighted')
print("Logistic Regression Accuracy:", lr_accu)
print("Logistic Regression F1 Score:", lr_f1)

Logistic Regression Accuracy: 0.5217915244024904
Logistic Regression F1 Score: 0.49728259480230813


In [17]:
# Sanity check: show any negative entries in X (an empty array means there are none)
negative_mask = X < 0
X[negative_mask]

array([], dtype=float64)