# Import Modules

In [29]:
import pandas as pd
import numpy as np
import nltk
nltk.download('omw-1.4')
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Dataset

In [30]:

# Load the data
data = pd.read_csv("IMDB Dataset.csv")


# Preprocess Data

In [31]:
# Preprocess the data
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Remove emails
    email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
    text = re.sub(email_pattern, '', text)
    
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text().strip()
    
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(" ")])
    
    return text

data['review'] = data['review'].apply(preprocess_text)




# Tokenization

In [32]:
# Tokenize the preprocessed text
tokenized_text = [review.split() for review in data['review']]

# Train Word2Vec Model - CBOW

In [33]:
# Train a Word2Vec model on the tokenized text
model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=0)

# Convert the tokenized text to vectors using the trained Word2Vec model
X = np.array([np.mean([model.wv[word] for word in review], axis=0) for review in tokenized_text])

# Split the data into training and testing sets
X_train_cbow, X_test_cbow, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2)

# Train Word2Vec Model - Skip-gram

In [34]:
# Train a Word2Vec model on the tokenized text
model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=1)

# Convert the tokenized text to vectors using the trained Word2Vec model
X = np.array([np.mean([model.wv[word] for word in review], axis=0) for review in tokenized_text])

# Split the data into training and testing sets
X_train_skip_gram, X_test_skip_gram, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2)

# Train and run Logistic Regression Model for Sentiment Analysis

In [35]:
# Train a logistic regression model on the training data
def logreg(X_train, X_test):
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    y_test_pred = classifier.predict(X_test)
    return y_test_pred

# Train and run Random Forrest Model for Sentiment Analysis

In [36]:
def ranforr(X_train, X_test):
    # Train a random forest classifier on the training data
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(X_train, y_train)
    y_test_pred = classifier.predict(X_test)
    return y_test_pred

# Evaluate Performance

In [37]:
# Evaluate the logistic regression model on the testing data
def evaluate(y_test_cbow, y_test_skip_gram):
    print("Accuracy-CBOW:", accuracy_score(y_test, y_test_cbow))
    print("Accuracy-Skip-gram:", accuracy_score(y_test, y_test_skip_gram))
    print("Precision-CBOW:", precision_score(y_test, y_test_cbow, pos_label='positive'))
    print("Precision-Skip-gram:", precision_score(y_test, y_test_skip_gram, pos_label='positive'))
    print("Recall-CBOW:", recall_score(y_test, y_test_cbow, pos_label='positive'))
    print("Recall-Skip-gram:", recall_score(y_test, y_test_skip_gram, pos_label='positive'))
    print("F1 Score-CBOW:", f1_score(y_test, y_test_cbow, pos_label='positive'))
    print("F1 Score-Skip-gram:", f1_score(y_test, y_test_skip_gram, pos_label='positive'))
    # Create a confusion matrix
    print("Confusion Matrix-CBOW", confusion_matrix(y_test, y_test_cbow))
    print("Confusion Matrix-Skip-gram", confusion_matrix(y_test, y_test_skip_gram))
    

# Logreg Performance

In [38]:
y_test_cbow = logreg(X_train_cbow, X_test_cbow)
y_test_skip_gram = logreg(X_train_skip_gram, X_test_skip_gram)

In [39]:
evaluate(y_test_cbow, y_test_skip_gram)

Accuracy-CBOW: 0.4984
Accuracy-Skip-gram: 0.8678
Precision-CBOW: 0.49635467980295567
Precision-Skip-gram: 0.8595870206489675
Recall-CBOW: 0.5059248845149629
Recall-Skip-gram: 0.8778871259289014
F1 Score-CBOW: 0.5010940919037199
F1 Score-Skip-gram: 0.8686406995230526
Confusion Matrix-CBOW [[2465 2556]
 [2460 2519]]
Confusion Matrix-Skip-gram [[4307  714]
 [ 608 4371]]


# Random Forrest Performance

In [40]:
y_test_cbow = ranforr(X_train_cbow, X_test_cbow)
y_test_skip_gram = ranforr(X_train_skip_gram, X_test_skip_gram)


In [41]:
evaluate(y_test_cbow, y_test_skip_gram)

Accuracy-CBOW: 0.4977
Accuracy-Skip-gram: 0.8356
Precision-CBOW: 0.4953429297205758
Precision-Skip-gram: 0.827667518176459
Recall-CBOW: 0.4699738903394256
Recall-Skip-gram: 0.8459530026109661
F1 Score-CBOW: 0.48232505410697724
F1 Score-Skip-gram: 0.836710369487485
Confusion Matrix-CBOW [[2637 2384]
 [2639 2340]]
Confusion Matrix-Skip-gram [[4144  877]
 [ 767 4212]]
