In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Load the dataset
df = pd.read_csv("IMDB Dataset Processed Lemma test.csv")

### Representing the textual data in a suitable model (i.e., Bag of Words, TF-IDF Vectors)

# Represent the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_review'])

### Splitting the data into training and test sets, stratifying by sentiment labels

# Labels (i.e., Sentiment)
y = df['sentiment']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Get the mapping of the numeric labels to the original labels
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label encoding mapping:")
print(label_mapping)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42, stratify=y)

Label encoding mapping:
{'negative': 0, 'positive': 1}


In [2]:
### Initialize and tune Logistic Regression classifier

# Define the parameter grid for regularization strength C
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Initialize the Logistic Regression classifier
log_reg_classifier = LogisticRegression(max_iter=1000, solver='liblinear')

# Define the scoring metric with macro F1 score
scoring = make_scorer(f1_score, average='micro')
grid_search = GridSearchCV(estimator=log_reg_classifier, param_grid=param_grid, scoring=scoring, cv=5) # Cross validation with 5 folds

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_C = grid_search.best_params_['C']
best_score = grid_search.best_score_

print(f"Best C: {best_C}")
print(f"Best F1 score: {best_score}")



Best C: 1
Best F1 score: 0.8891213916551116


In [3]:
# Train the Logistic Regression classifier with the best C
best_log_reg = LogisticRegression(C=best_C, max_iter=1000, solver='liblinear')
best_log_reg.fit(X_train, y_train)

### Model Evaluation

# Predict the classes and probabilities using the best model
predicted_class = best_log_reg.predict(X_test)
predicted_class_train = best_log_reg.predict(X_train)
test_probs = best_log_reg.predict_proba(X_test)
train_probs = best_log_reg.predict_proba(X_train)

# Calculate and print the performance metrics
print('Train confusion matrix is:')
print(confusion_matrix(y_train, predicted_class_train))
print('Test confusion matrix is:')
print(confusion_matrix(y_test, predicted_class))
print(classification_report(y_test, predicted_class))

# Calculate train and test accuracy
train_accuracy = accuracy_score(y_train, predicted_class_train)
test_accuracy = accuracy_score(y_test, predicted_class)
print("Train accuracy score: ", train_accuracy)
print("Test accuracy score: ", test_accuracy)

# Calculate and print the AUC-ROC score
train_auc = roc_auc_score(y_train, train_probs[:, 1], multi_class='ovr')
test_auc = roc_auc_score(y_test, test_probs[:, 1], multi_class='ovr')
print("Train ROC-AUC score:", train_auc)
print("Test ROC-AUC score:", test_auc)

Train confusion matrix is:
[[18161  1597]
 [ 1246 18661]]
Test confusion matrix is:
[[4338  602]
 [ 502 4475]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4940
           1       0.88      0.90      0.89      4977

    accuracy                           0.89      9917
   macro avg       0.89      0.89      0.89      9917
weighted avg       0.89      0.89      0.89      9917

Train accuracy score:  0.9283247195260305
Test accuracy score:  0.8886760108903903
Train ROC-AUC score: 0.9782797224423256
Test ROC-AUC score: 0.9578802166077316


NameError: name 'y_test' is not defined