In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Load the dataset
df = pd.read_csv("IMDB Dataset Processed Lemma test.csv")

# Step 1: Preprocess and tag the documents
documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(df['cleaned_review'])]

# Step 2: Initialize and train the Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=40)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Step 3: Generate document embeddings (vectors) for each review
X_doc2vec = [doc2vec_model.infer_vector(doc.words) for doc in documents]

# Step 4: Split the data into training and test sets
y = df['sentiment']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_doc2vec, y_encoded, test_size=0.2, random_state=42, stratify=y)

In [2]:
### Initialize and tune Logistic Regression classifier

# Define the parameter grid for regularization strength C
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Initialize the Logistic Regression classifier
log_reg_classifier = LogisticRegression(max_iter=1000, solver='liblinear')

# Define the scoring metric with macro F1 score
scoring = make_scorer(f1_score, average='micro')
grid_search = GridSearchCV(estimator=log_reg_classifier, param_grid=param_grid, scoring=scoring, cv=5) # Cross validation with 5 folds

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_C = grid_search.best_params_['C']
best_score = grid_search.best_score_

print(f"Best C: {best_C}")
print(f"Best F1 score: {best_score}")



Best C: 1
Best F1 score: 0.8666078406655741


In [3]:
# Train the Logistic Regression classifier with the best C
best_log_reg = LogisticRegression(C=best_C, max_iter=1000, solver='liblinear')
best_log_reg.fit(X_train, y_train)

### Model Evaluation

# Predict the classes and probabilities using the best model
predicted_class = best_log_reg.predict(X_test)
predicted_class_train = best_log_reg.predict(X_train)
test_probs = best_log_reg.predict_proba(X_test)
train_probs = best_log_reg.predict_proba(X_train)

# Calculate and print the performance metrics
print('Train confusion matrix is:')
print(confusion_matrix(y_train, predicted_class_train))
print('Test confusion matrix is:')
print(confusion_matrix(y_test, predicted_class))
print(classification_report(y_test, predicted_class))

# Calculate train and test accuracy
train_accuracy = accuracy_score(y_train, predicted_class_train)
test_accuracy = accuracy_score(y_test, predicted_class)
print("Train accuracy score: ", train_accuracy)
print("Test accuracy score: ", test_accuracy)

# Calculate and print the AUC-ROC score
train_auc = roc_auc_score(y_train, train_probs[:, 1], multi_class='ovr')
test_auc = roc_auc_score(y_test, test_probs[:, 1], multi_class='ovr')
print("Train ROC-AUC score:", train_auc)
print("Test ROC-AUC score:", test_auc)

Train confusion matrix is:
[[17132  2626]
 [ 2603 17304]]
Test confusion matrix is:
[[4286  654]
 [ 686 4291]]
              precision    recall  f1-score   support

           0       0.86      0.87      0.86      4940
           1       0.87      0.86      0.86      4977

    accuracy                           0.86      9917
   macro avg       0.86      0.86      0.86      9917
weighted avg       0.86      0.86      0.86      9917

Train accuracy score:  0.8681709315517459
Test accuracy score:  0.864878491479278
Train ROC-AUC score: 0.9389920240160373
Test ROC-AUC score: 0.9365346586199352
