<a href="https://colab.research.google.com/github/AbdNasir24/Atelier-2-NLP/blob/main/Part2_Twitter_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Section 1: Loading the Dataset and Preprocessing**

**A_ Import Libraries**

In [5]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**B_Load the Dataset**

In [13]:
# Load the dataset
df = pd.read_csv("/twitter_training.csv")

# Display the first few rows of the dataframe to verify the data
print(df.head())

   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     


In [14]:
print(df.columns)

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')


**C_Preprocessing Function**

In [19]:
# Preprocessing
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]  # Stemming
    return " ".join(tokens)

df['processed_text'] = df['Positive'].apply(preprocess_text)

**Section 2: Splitting the Data into Train and Test Sets**

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['Positive'], test_size=0.2, random_state=42)


**Section 3: Encoding Data Vectors**

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

# Using Word2Vec
sentences = [word_tokenize(text) for text in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
X_train_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in sentences])

# Using Bag of Words (BoW)
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(X_train)

# Using TF-IDF
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)


**Section 4: Training the Models**

In [23]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# Train the models
svm_model = SVC()
svm_model.fit(X_train_word2vec, y_train)

nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)

lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train_word2vec, y_train)


**Section 5: Predictions and Evaluationt**

In [24]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Predictions for SVM
X_test_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in word_tokenize(text) if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for text in X_test])
svm_pred = svm_model.predict(X_test_word2vec)

# Predictions for Naive Bayes
X_test_bow = vectorizer_bow.transform(X_test)
nb_pred = nb_model.predict(X_test_bow)

# Predictions for Logistic Regression
X_test_tfidf = vectorizer_tfidf.transform(X_test)
lr_pred = lr_model.predict(X_test_tfidf)

# Predictions for AdaBoost
adaboost_pred = adaboost_model.predict(X_test_word2vec)

# Evaluate the models
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')
svm_report = classification_report(y_test, svm_pred)

nb_accuracy = accuracy_score(y_test, nb_pred)
nb_f1 = f1_score(y_test, nb_pred, average='weighted')
nb_report = classification_report(y_test, nb_pred)

lr_accuracy = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred, average='weighted')
lr_report = classification_report(y_test, lr_pred)

adaboost_accuracy = accuracy_score(y_test, adaboost_pred)
adaboost_f1 = f1_score(y_test, adaboost_pred, average='weighted')
adaboost_report = classification_report(y_test, adaboost_pred)


**Section 6: Printing the Evaluation Metrics**

In [25]:
print("SVM Accuracy:", svm_accuracy)
print("SVM F1 Score:", svm_f1)
print("SVM Classification Report:")
print(svm_report)

print("\nNaive Bayes Accuracy:", nb_accuracy)
print


SVM Accuracy: 1.0
SVM F1 Score: 1.0
SVM Classification Report:
              precision    recall  f1-score   support

  Irrelevant       1.00      1.00      1.00      2661
    Negative       1.00      1.00      1.00      4471
     Neutral       1.00      1.00      1.00      3551
    Positive       1.00      1.00      1.00      4254

    accuracy                           1.00     14937
   macro avg       1.00      1.00      1.00     14937
weighted avg       1.00      1.00      1.00     14937


Naive Bayes Accuracy: 1.0


<function print>