In [21]:
# Importing modules
import numpy as np
import matplotlib.pyplot as plt
import scipy
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datasets import load_dataset
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from typing import Tuple
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

# Fix the random seed for reproducibility
# !! Important !! : do not change this
seed = 1234
np.random.seed(seed) 

In [22]:
# Load dataset once
imdb_dataset = load_dataset("imdb")

In [None]:
# Preprocess the data
def preprocess(text):
    return text.lower().split()

train_sentences = [preprocess(text) for text in imdb_dataset['train']['text']]
test_sentences = [preprocess(text) for text in imdb_dataset['test']['text']]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Vectorize the sentences
def vectorize_sentences(sentences, model, vector_size):
    vectors = []
    for sentence in sentences:
        sentence_vec = np.zeros(vector_size)
        for word in sentence:
            if word in model.wv:
                sentence_vec += model.wv[word]
        vectors.append(sentence_vec / len(sentence))
    return np.array(vectors)

X_train = vectorize_sentences(train_sentences, w2v_model, 100)
X_test = vectorize_sentences(test_sentences, w2v_model, 100)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Inspect the shape of the transformed data
print(f"Train data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

In [None]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Extract labels
y_train = imdb_dataset['train']['label']
y_test = imdb_dataset['test']['label']

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

# Initialize the Logistic Regression model
classifier = LogisticRegression()

grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)  # X_train and y_train need to be predefined with your data

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))


In [15]:
# Fit the model on the training data
classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7720
