# Import Modules

In [56]:
import pandas as pd
import numpy as np
import nltk
nltk.download('omw-1.4')
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Dataset

In [57]:

# Load the data
data = pd.read_csv("IMDB Dataset.csv")


# Preprocess Data

In [58]:
# Preprocess the data
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Remove emails
    email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
    text = re.sub(email_pattern, '', text)
    
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text().strip()
    
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(" ")])
    
    return text

data['review'] = data['review'].apply(preprocess_text)




# Tokenization

In [59]:
# Tokenize the preprocessed text
tokenized_text = [review.split() for review in data['review']]

# Train Word2Vec Model - CBOW

In [60]:
# Train a Word2Vec model on the tokenized text
model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=0)

# Convert the tokenized text to vectors using the trained Word2Vec model
X = np.array([np.mean([model.wv[word] for word in review], axis=0) for review in tokenized_text])

# Split the data into training and testing sets
X_train_cbow, X_test_cbow, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2)

# Train Word2Vec Model - Skip-gram

In [61]:
# Train a Word2Vec model on the tokenized text
model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=1)

# Convert the tokenized text to vectors using the trained Word2Vec model
X = np.array([np.mean([model.wv[word] for word in review], axis=0) for review in tokenized_text])

# Split the data into training and testing sets
X_train_skip_gram, X_test_skip_gram, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2)

# Train and run Logistic Regression Model for Sentiment Analysis

In [62]:
# Train a logistic regression model on the training data
def logreg(X_train, X_test):
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    y_test_pred = classifier.predict(X_test)
    return y_test_pred

# Train and run Random Forrest Model for Sentiment Analysis

In [63]:
def ranforr(X_train, X_test):
    # Train a random forest classifier on the training data
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(X_train, y_train)
    y_test_pred = classifier.predict(X_test)
    return y_test_pred

# Train and run Support Vector Machine

In [64]:
def supvecmach(X_train, X_test):
    # Train a support vector machine classifier on the training set
    classifier = SVC(random_state=42)
    classifier.fit(X_train, y_train)
    y_test_pred = classifier.predict(X_test)
    return y_test_pred

# Evaluate Performance

In [65]:
# Evaluate the logistic regression model on the testing data
def evaluate(y_test_cbow, y_test_skip_gram):
    print("Accuracy-CBOW:", accuracy_score(y_test, y_test_cbow))
    print("Accuracy-Skip-gram:", accuracy_score(y_test, y_test_skip_gram))
    print("Precision-CBOW:", precision_score(y_test, y_test_cbow, pos_label='positive'))
    print("Precision-Skip-gram:", precision_score(y_test, y_test_skip_gram, pos_label='positive'))
    print("Recall-CBOW:", recall_score(y_test, y_test_cbow, pos_label='positive'))
    print("Recall-Skip-gram:", recall_score(y_test, y_test_skip_gram, pos_label='positive'))
    print("F1 Score-CBOW:", f1_score(y_test, y_test_cbow, pos_label='positive'))
    print("F1 Score-Skip-gram:", f1_score(y_test, y_test_skip_gram, pos_label='positive'))
    # Create a confusion matrix
    print("Confusion Matrix-CBOW", confusion_matrix(y_test, y_test_cbow))
    print("Confusion Matrix-Skip-gram", confusion_matrix(y_test, y_test_skip_gram))
    

# Logreg Performance

In [66]:
y_test_cbow = logreg(X_train_cbow, X_test_cbow)
y_test_skip_gram = logreg(X_train_skip_gram, X_test_skip_gram)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
evaluate(y_test_cbow, y_test_skip_gram)

Accuracy-CBOW: 0.503
Accuracy-Skip-gram: 0.8765
Precision-CBOW: 0.5069354139575206
Precision-Skip-gram: 0.8765113974231913
Recall-CBOW: 0.4646404449741756
Recall-Skip-gram: 0.8784266984505363
F1 Score-CBOW: 0.4848673300165837
F1 Score-Skip-gram: 0.8774680027780533
Confusion Matrix-CBOW [[2691 2275]
 [2695 2339]]
Confusion Matrix-Skip-gram [[4343  623]
 [ 612 4422]]


### Evaulation CBOW vs skip-gram using logistic regression:
As we can see, the performance of skip-gram is significantly better than the performance of CBOW. Almost 3700 more texts are correctly assigned to their respective sentiment category

# Random Forrest Performance

In [68]:
y_test_cbow = ranforr(X_train_cbow, X_test_cbow)
y_test_skip_gram = ranforr(X_train_skip_gram, X_test_skip_gram)


In [69]:
evaluate(y_test_cbow, y_test_skip_gram)

Accuracy-CBOW: 0.4975
Accuracy-Skip-gram: 0.8439
Precision-CBOW: 0.5009857612267251
Precision-Skip-gram: 0.842167487684729
Recall-CBOW: 0.45431068732618196
Recall-Skip-gram: 0.8490266189908622
F1 Score-CBOW: 0.4765079695801646
F1 Score-Skip-gram: 0.845583143733307
Confusion Matrix-CBOW [[2688 2278]
 [2747 2287]]
Confusion Matrix-Skip-gram [[4165  801]
 [ 760 4274]]


### Evaulation CBOW vs skip-gram using Random Forrest Performance:
As we can see, the performance of skip-gram is significantly better than the performance of CBOW. Almost 3500 more texts are correctly assigned to their respective sentiment category

# Support Vector Machine Perfomance

In [70]:
y_test_cbow = supvecmach(X_train_cbow, X_test_cbow)
y_test_skip_gram = supvecmach(X_train_skip_gram, X_test_skip_gram)

In [71]:
evaluate(y_test_cbow, y_test_skip_gram)

Accuracy-CBOW: 0.5045
Accuracy-Skip-gram: 0.8767
Precision-CBOW: 0.5112953960537604
Precision-Skip-gram: 0.8744827586206897
Recall-CBOW: 0.35518474374255066
Recall-Skip-gram: 0.8816050854191498
F1 Score-CBOW: 0.4191771187434064
F1 Score-Skip-gram: 0.8780294786823623
Confusion Matrix-CBOW [[3257 1709]
 [3246 1788]]
Confusion Matrix-Skip-gram [[4329  637]
 [ 596 4438]]


### Evaulation CBOW vs skip-gram using Support Vector Machine:
As we can see, the performance of skip-gram is significantly better than the performance of CBOW. Almost 3700 more texts are correctly assigned to their respective sentiment category