**Abhina Premachandran Bindu**

**April 30 2024**
# Comparing the performance of gensim vs nltk libraries
<p>In this analysis, the nltk and gensim NLP libraries are compared using the accuracy of the same classifier trained on the text processed with each library. Processing the text with nltk and vectorizing it with a TF-IDF vectorizer gave better accuracy than training the classifier on vectors from gensim's Word2Vec model: with the gradient boosting classifier, the nltk approach reaches an accuracy of 0.99, while the gensim approach reaches only 0.96.</p>

## Loading and initial cleaning

In [None]:
# importing the necessary libraries
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# importing the libraries for nltk
import nltk
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# BoW
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# load the dataset from a CSV file whose path is typed in by the user
csv_path = input('Enter the file path for the csv file:')
data = pd.read_csv(csv_path)
# show (rows, columns) of the loaded frame
data.shape

In [None]:
# preview the first rows of the freshly loaded data
data.head()

In [None]:
# print the frame summary (columns, dtypes, non-null counts) to spot missing values
data.info()

In [None]:
# remove, in place, every row that has at least one missing value
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# dropping the redundant 'Unnamed: 0' column.
# errors='ignore' keeps this cell idempotent: re-running it (the column is
# already gone) or loading a CSV without that column no longer raises KeyError.
data.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
# count the rows per class in 'target' to check for data imbalance
data['target'].value_counts()

Since the numbers of Fake and True samples are almost the same, there is no class imbalance.

In [None]:
# count the rows per news subject
data['subject'].value_counts()

## Data Preprocessing

In [None]:
# encode the class labels as integers; np.unique returns the labels sorted,
# so they are numbered 0, 1, ... in sorted order
target_labels = np.unique(data['target'])
class_mapping = dict(zip(target_labels, range(len(target_labels))))
data['target'] = data['target'].map(class_mapping)
# preview the frame with the encoded target
data.head()

## using nltk for cleaning and preparing for classification

In [None]:
# Text normalisation with nltk: tokenize, drop stop words, lemmatize and stem
stop_words = set(stopwords.words('english'))

# Helpers are created once here instead of on every clean_text() call, which
# matters because the function is applied to every row of the DataFrame.
_lemmatizer = WordNetLemmatizer()
_stemmer = PorterStemmer()
_non_alpha = re.compile(r'[^a-zA-Z ]')


def clean_text(text):
    """Return *text* as a space-separated string of normalised word stems.

    Steps, in order:
      1. tokenize with ``word_tokenize``;
      2. drop tokens whose lower-cased form is an English stop word;
      3. strip every character that is not an ASCII letter or a space and
         lower-case the remainder;
      4. lemmatize each token with WordNet, then stem it with Porter.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The processed tokens joined by single spaces (empty string when no
        token survives the cleaning).
    """
    tokens = word_tokenize(text)
    # NOTE(review): the stop-word test runs on the raw token, i.e. before
    # non-letters are stripped.
    cleaned_tokens = [
        _non_alpha.sub('', token).lower()
        for token in tokens
        if token.lower() not in stop_words
    ]
    lemmas = [_lemmatizer.lemmatize(token) for token in cleaned_tokens]
    # Tokens without any letter (numbers, punctuation) are now empty strings;
    # joining and re-splitting on whitespace discards them.
    words = ' '.join(lemmas).split()
    return ' '.join(_stemmer.stem(word) for word in words)

# run the nltk cleaning over every article; the result goes into a copy so
# that `data` itself stays untouched
data_nltk = data.assign(cleaned_text=data['text'].apply(clean_text))

In [None]:
# preview the nltk-processed frame, including the new 'cleaned_text' column
data_nltk.head()

## using gensim to clean and build the vectors for the text

In [None]:
# run gensim's simple_preprocess over every article; the result goes into a
# copy so that `data` itself stays untouched
data_gensim = data.assign(
    cleaned_text=data['text'].apply(gensim.utils.simple_preprocess)
)


In [None]:
# preview the gensim-processed frame: the 'cleaned_text' column was added to
# data_gensim, not to data, so that is the frame to inspect here
data_gensim.head()

## Building, training and using the gensim Word2Vec model to get the word vectors

In [None]:
# Word2Vec settings: context window of 6, keep every word (min_count=1),
# and 4 worker threads
w2v_params = {'window': 6, 'min_count': 1, 'workers': 4}
model = gensim.models.Word2Vec(**w2v_params)
# build the vocabulary from the tokenized articles before training
model.build_vocab(data_gensim['cleaned_text'])

In [None]:
# training the model on the tokenized articles for 5 epochs
model.train(data_gensim['cleaned_text'], total_examples=model.corpus_count, epochs=5)

# saving the model; the target folder is created first because save() does
# not create missing directories and would fail with FileNotFoundError
import os
os.makedirs("word2vec", exist_ok=True)
model.save("word2vec/word2vec_model")

In [None]:
# the first five words in the model's vocabulary list
model.wv.index_to_key[:5]

In [None]:
# vocabulary size: number of words the model holds a vector for
len(model.wv.index_to_key)

In [None]:
# document embedding = mean of the vectors of the words the model knows
def get_average_word2vec_vector(text, model, word_dim):
    """Average the word vectors of the tokens in *text*.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    model : object
        Anything exposing ``model.wv`` with ``in`` and ``[]`` lookups.
    word_dim : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(word_dim,)``; all zeros when no token of *text*
        is found in ``model.wv``.
    """
    keyed_vectors = model.wv
    # tokens missing from the vocabulary are left out of the average
    known_tokens = [token for token in text if token in keyed_vectors]
    total = np.zeros((word_dim,))
    for token in known_tokens:
        total += keyed_vectors[token]
    if known_tokens:
        total /= len(known_tokens)
    return total

# length of the vectors learned by the Word2Vec model
word_dim = model.vector_size

# one averaged vector per article, in the row order of data_gensim
word_vectors = [
    get_average_word2vec_vector(tokens, model, word_dim)
    for tokens in data_gensim['cleaned_text']
]


In [None]:
# adding the word vectors to the data: one averaged vector per row, stored in
# a new 'word_vectors' column of the original frame
data['word_vectors'] = word_vectors

In [None]:
# preview the frame with the new 'word_vectors' column
data.head()

## Classifying the data

In [None]:
# importing the model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
# importing necessary libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# defining the model: gradient boosting with the library's default settings;
# both the TF-IDF and the Word2Vec experiments below start from this estimator
clf = GradientBoostingClassifier()

### using nltk and tfidf

In [None]:
# features are the nltk-cleaned texts, labels the encoded target (as arrays)
X1 = data_nltk['cleaned_text'].to_numpy()
y1 = data_nltk['target'].to_numpy()

In [None]:
# sanity check: features and labels must have the same number of rows
X1.shape,y1.shape

In [None]:
# splitting data to train-test split: 33% held out for testing; the fixed
# random_state (44, the same value as for the Word2Vec split) makes it repeatable
X_train1,X_test1,y_train1,y_test1 = train_test_split(X1,y1,test_size=0.33,random_state=44)

In [None]:
# defining the tfidf vectorizer; lowercase=False because clean_text already
# lower-cases the tokens, and no accent stripping or custom preprocessor is
# applied on top of the nltk cleaning
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

In [None]:
# defining the pipeline to fit the training data
from sklearn.base import clone

# The pipeline gets its own unfitted copy of the classifier. Pipeline fits its
# steps in place, so passing `clf` itself would let the later `clf.fit(...)` on
# the Word2Vec features overwrite the classifier inside this pipeline and make
# `gb_tfidf.predict` unusable afterwards.
gb_tfidf = Pipeline([
    ('vect',tfidf),
    ('gb clf',clone(clf))
])
# learn the TF-IDF vocabulary and train the classifier on the training texts
gb_tfidf.fit(X_train1,y_train1)

In [None]:
# predict the labels of the held-out test texts with the fitted pipeline
y_pred1 = gb_tfidf.predict(X_test1)
# printing the classification report for validation of the model
print(classification_report(y_test1, y_pred1))

### using gensim - Word2Vec

In [None]:
# features are the averaged Word2Vec vectors (a list with one vector per row),
# labels the encoded target as an array
X2 = word_vectors
y2 = data['target'].to_numpy()

In [None]:
# Create training and test sets with the same test_size and random_state as
# the nltk/TF-IDF experiment
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.33, random_state=44)

In [None]:
# reshaping the input values for classifying: X2 is a Python list of 1-D
# vectors, so each split is stacked into a single 2-D array (one row per sample)
X_train_2d = np.stack(X_train2)
X_test_2d =  np.stack(X_test2)
# display both shapes as a sanity check
X_train_2d.shape , X_test_2d.shape

In [None]:
# fitting the train data: the gradient boosting classifier is trained directly
# on the averaged Word2Vec vectors
clf.fit(X_train_2d, y_train2)
# predicting the test values
y_pred2 = clf.predict(X_test_2d)
# printing the classification report for validation of the model
print(classification_report(y_test2, y_pred2))