In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the IMDB reviews CSV with the Python parser engine.
# on_bad_lines='skip' drops malformed rows without raising an error.
df = pd.read_csv('IMDB Dataset.csv', engine='python', on_bad_lines='skip')
# Preview the first rows of the loaded frame.
display(df.head())

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'sentiment' column as integer class codes, overwriting the column.
# NOTE(review): later cells treat sentiment==1 as the "good" class — confirm the
# mapping via label_encoder.classes_.
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])
display(df.head())

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Remove the duplicate rows in place; the index is not reset afterwards.
df.drop_duplicates(inplace=True)

In [None]:
import nltk

In [None]:
# Fetch the 'punkt_tab' NLTK resource.
# Presumably required by nltk.word_tokenize in transform_text — TODO confirm.
nltk.download('punkt_tab')

In [None]:
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; transform_text calls ps.stem on each kept token.
ps = PorterStemmer()

In [None]:
# Fetch the NLTK stop-word corpus read by stopwords.words('english').
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
import string

In [None]:
import re
# Built once: the original re-read the stop-word list from disk and scanned it
# as a list for every single token, which dominates runtime on the full corpus.
_STOP_WORDS = frozenset(stopwords.words('english'))

# Matches the HTML line-break variants left in the reviews (<br>, <br/>, <br />).
_BR_TAG = re.compile(r'<br\s*/?>')


def transform_text(text):
    """Normalise one review into a space-joined string of stemmed tokens.

    Steps: lowercase, replace HTML ``<br>`` tags with a space, tokenise with
    ``nltk.word_tokenize``, keep only alphanumeric tokens that are not English
    stop words, and Porter-stem what remains.

    Args:
        text: Raw review text.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # Substitute a space rather than an empty string: removing the tag outright
    # glues the neighbouring words together ("great.<br />the" -> "great.the"),
    # and that fused token is then discarded by the isalnum() filter.
    text = _BR_TAG.sub(' ', text.lower())

    # isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation check is needed.
    return " ".join(
        ps.stem(token)
        for token in nltk.word_tokenize(text)
        if token.isalnum() and token not in _STOP_WORDS
    )

In [None]:
# Clean every review with transform_text and store the result in a new column.
df["transformed_text"] = df["review"].apply(transform_text)

In [None]:
# Persist the cleaned frame (without the index column) so the cleaning step
# does not have to be repeated.
df.to_csv('imdb_dataset_cleaned.csv', index=False)
print('DataFrame exported to imdb_dataset_cleaned.csv')

In [None]:
# Spot-check one row, including the new transformed_text column.
df.head(1)

In [None]:
from wordcloud import WordCloud
# One WordCloud renderer (500x500, white background) reused for both sentiment classes.
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Word cloud over all sentiment==1 reviews, concatenated into one string.
# NOTE(review): wc.generate presumably returns the same `wc` object, so this name
# may alias the renderer that is regenerated later — verify before reusing it.
good_review=wc.generate(df[df['sentiment']==1]['transformed_text'].str.cat(sep=" "))

In [None]:
# Render the sentiment==1 word cloud.
plt.imshow(good_review)

In [None]:
# Word cloud over all sentiment==0 reviews, generated with the same `wc` renderer.
bad_review=wc.generate(df[df['sentiment']==0]['transformed_text'].str.cat(sep=" "))

In [None]:
# Render the sentiment==0 word cloud.
plt.imshow(bad_review)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Baseline features: TF-IDF over the cleaned text, capped at 5000 terms.
tfidf = TfidfVectorizer(max_features= 5000)
# .toarray() converts the vectorizer output to a dense array.
# NOTE(review): X is reassigned by the embedding cells further down.
X = tfidf.fit_transform(df['transformed_text']).toarray()
display(X.shape)

In [None]:
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Embed every cleaned review with the sentence-transformer model; this
# overwrites the TF-IDF matrix previously stored in X.
# NOTE(review): the input is the stemmed, stop-word-stripped text — confirm that
# is intended rather than the raw 'review' column.
X = model.encode(
    df['transformed_text'].tolist(),
    show_progress_bar=True
)

print(X.shape)

In [None]:
pip install gensim

In [None]:
from gensim.models import Word2Vec
import numpy as np

In [None]:
# Whitespace-split each cleaned review into a token list (Word2Vec training input).
sentences = df['transformed_text'].apply(lambda x: x.split())

In [None]:
# Train Word2Vec embeddings on the tokenised reviews.
# NOTE(review): no seed is passed, so the vectors presumably differ between
# runs — confirm whether reproducibility matters here.
w2v = Word2Vec(
    sentences=sentences,
    vector_size=100,  # embedding dimension; document_vector returns vectors of this size
    window=5,
    min_count=2,
    workers=4
)


In [None]:
def document_vector(doc):
    """Return the mean Word2Vec embedding of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Whitespace-separated token string.

    Returns:
        The element-wise mean of the embeddings of every token found in
        ``w2v.wv``; a zero vector of length ``w2v.vector_size`` when no token
        is in the vocabulary (including an empty ``doc``).
    """
    known_tokens = [token for token in doc.split() if token in w2v.wv]
    if not known_tokens:
        return np.zeros(w2v.vector_size)
    embeddings = [w2v.wv[token] for token in known_tokens]
    return np.mean(embeddings, axis=0)


In [None]:
# Stack one averaged Word2Vec vector per review into a 2-D array; this
# overwrites the sentence-transformer embeddings previously stored in X.
X = np.vstack(df['transformed_text'].apply(document_vector))
print(X.shape)

In [None]:
# Target vector: the integer-encoded sentiment column as a NumPy array.
y = df['sentiment'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=2 fixes the shuffle.
# The split captures whatever X holds when this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Naive Bayes baseline.
# At this point X holds averaged Word2Vec embeddings, which contain negative
# components. MultinomialNB rejects negative feature values (it raises
# ValueError on fit), so use GaussianNB, which models real-valued features.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report the held-out metrics; y_pred is reused by the confusion-matrix cell.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the most recent `y_pred` against the held-out labels.
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes instead of relying on the pyplot current-figure state.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['0', '1'],
    yticklabels=['0', '1'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
!pip install optuna

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, precision_score

def objective(trial):
    """Optuna objective: tune TF-IDF settings and return a validation F1-score.

    Each trial samples ``ngram_range``, ``max_df``, ``min_df`` and
    ``max_features``, fits a TF-IDF vectorizer plus a liblinear logistic
    regression on a fitting subset of the training rows, and scores the
    model on a validation subset carved out of the training rows.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        F1-score of the fitted model on the validation subset (to be maximised).
    """
    # Imported locally: the notebook only imports LogisticRegression in a later
    # cell, so on a fresh kernel the first trial would fail with NameError.
    from sklearn.linear_model import LogisticRegression

    # Search space for the vectorizer.
    ngram_range = trial.suggest_categorical('ngram_range', [(1, 1), (1, 2)])
    max_df = trial.suggest_float('max_df', 0.5, 0.8, step=0.05)
    min_df = trial.suggest_int('min_df', 1, 5)
    max_features = trial.suggest_int('max_features', 4000, 6000, step=200)

    texts = df['transformed_text']
    labels = df['sentiment'].values

    # Reproduce the notebook's outer 80/20 split (same test_size/random_state) and
    # discard the test part, so tuning never sees the rows used for final evaluation.
    texts_train, _, labels_train, _ = train_test_split(
        texts, labels, test_size=0.2, random_state=2
    )
    # Inner split: fit on one part of the training rows, validate on the other.
    texts_fit, texts_val, labels_fit, labels_val = train_test_split(
        texts_train, labels_train, test_size=0.2, random_state=2
    )

    # Fit the vocabulary/IDF weights on the fitting rows only; fitting on the
    # whole corpus would leak validation statistics into the features.
    vectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
    )
    # Kept sparse: LogisticRegression accepts sparse input, and densifying the
    # matrix on every trial only costs memory.
    X_fit = vectorizer.fit_transform(texts_fit)
    X_val = vectorizer.transform(texts_val)

    # 'liblinear' solver for binary classification.
    clf = LogisticRegression(solver='liblinear')
    clf.fit(X_fit, labels_fit)

    return f1_score(labels_val, clf.predict(X_val))

print("Objective function 'objective' defined: maximizes validation F1-score.")

In [None]:
import optuna

# Create an Optuna study object.
# `objective` returns an F1-score, so the direction is 'maximize'.
# NOTE(review): `objective` applies no precision constraint; the "precision=1"
# wording in the printed messages below does not reflect what is optimised.
study = optuna.create_study(direction='maximize')

# Run the optimization: `objective` is called once per trial, 100 trials in total.
study.optimize(objective, n_trials=100)

# Print the best trial's value (highest F1-score returned by `objective`).
print(f"Best trial's F1-score : {study.best_value:.4f}")

# Print the best trial's parameters.
print("Best trial's parameters (for precision=1):")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

In [None]:
# Copy the best TF-IDF settings found by `study` into individual variables.
# NOTE(review): despite the `_precision1` suffix, these come from a study whose
# objective returns an F1-score.
best_params_precision1 = study.best_params

optimal_ngram_range_precision1 = best_params_precision1['ngram_range']
optimal_max_df_precision1 = best_params_precision1['max_df']
optimal_min_df_precision1 = best_params_precision1['min_df']
optimal_max_features_precision1 = best_params_precision1['max_features']

# Echo the stored values for the record.
print("Stored optimal TF-IDF parameters for precision=1:")
print(f"  ngram_range: {optimal_ngram_range_precision1}")
print(f"  max_df: {optimal_max_df_precision1}")
print(f"  min_df: {optimal_min_df_precision1}")
print(f"  max_features: {optimal_max_features_precision1}")

In [None]:
# Rebuild the feature matrix with the chosen TF-IDF settings (hard-coded here,
# not read from `study.best_params`).
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=0.55, min_df=2, max_features=5200)
X = tfidf.fit_transform(df['transformed_text']).toarray()

# Re-split right away: X_train/X_test were created from the earlier Word2Vec
# matrix, so without this the models trained in the following cells would
# silently keep using the old features instead of the TF-IDF ones.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
import pickle

# Filename for the exported TF-IDF vectorizer.
tfidf_filename = 'tfidf_vectorizer.pkl'

# Serialise the fitted `tfidf` object in binary mode; the `with` block closes
# the file even if pickling fails.
with open(tfidf_filename, 'wb') as file:
    pickle.dump(tfidf, file)

print(f"TF-IDF vectorizer exported to {tfidf_filename}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Logistic regression with fixed hyperparameters.
# n_jobs is intentionally not passed: it has no effect with the 'liblinear'
# solver and only triggers a warning.
lr = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    C=2.909456422800902,
    max_iter=195,
    class_weight="balanced",
    random_state=42,
)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Held-out metrics; y_pred_lr is reused by the confusion-matrix cell.
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Logistic Regression Precision: {precision_score(y_test, y_pred_lr):.4f}")
print(f"Logistic Regression Recall: {recall_score(y_test, y_pred_lr):.4f}")
print(f"Logistic Regression F1 Score: {f1_score(y_test, y_pred_lr):.4f}")

In [None]:
# Confusion matrix of the logistic-regression predictions on the held-out rows.
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Draw on an explicit Axes instead of relying on the pyplot current-figure state.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['0', '1'],
    yticklabels=['0', '1'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Logistic Regression Confusion Matrix')
plt.show()

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def objective_lr(trial):
    """Optuna objective: tune LogisticRegression and return a cross-validated F1.

    Samples ``C``, ``tol``, ``class_weight`` and ``max_iter``, then scores the
    model with 3-fold cross-validation on the notebook-level training split
    (``X_train`` / ``y_train``).

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        Mean F1-score across the cross-validation folds (to be maximised).
    """
    # Local import keeps this cell self-contained.
    from sklearn.model_selection import cross_val_score

    # Regularization strength (log scale).
    C = trial.suggest_float("C", 1e-3, 10.0, log=True)

    # Solver stopping tolerance (log scale).
    tol = trial.suggest_float("tol", 1e-6, 1e-3, log=True)

    # Optional class re-weighting.
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # Iteration cap.
    max_iter = trial.suggest_int("max_iter", 200, 1000)

    # n_jobs is not passed: it has no effect with the 'liblinear' solver and
    # only triggers a warning on every trial.
    model = LogisticRegression(
        solver="liblinear",
        penalty="l2",
        C=C,
        tol=tol,
        class_weight=class_weight,
        max_iter=max_iter,
        random_state=42,
    )

    # Score on folds of the training data only. Selecting hyperparameters by
    # their score on X_test would tune them to the test set and make the final
    # test metrics optimistic.
    fold_scores = cross_val_score(model, X_train, y_train, scoring="f1", cv=3)
    return float(fold_scores.mean())


print("Objective function 'objective_lr' redefined successfully with comprehensive hyperparameter tuning for Logistic Regression.")

In [None]:
import optuna

# Create an Optuna study object to maximize the F1-score returned by objective_lr.
study_lr = optuna.create_study(direction='maximize')

# Run the optimization process with the objective_lr function for 50 trials.
study_lr.optimize(objective_lr, n_trials=50)

# Print the best trial's value (maximum F1-score).
print(f"Best trial's F1-score for Logistic Regression: {study_lr.best_value:.4f}")

# Print the best trial's parameters.
print("Best trial's parameters for Logistic Regression:")
for key, value in study_lr.best_params.items():
    print(f"  {key}: {value}")

In [None]:
from xgboost import XGBClassifier

# Instantiate an XGBClassifier object.
# use_label_encoder is not passed: the option is deprecated/removed in current
# XGBoost releases and only produces a warning; the labels are already integers.
xgb = XGBClassifier(random_state=2, eval_metric='logloss')

# Train the XGBoost model on the current training split.
xgb.fit(X_train, y_train)

# Make predictions on X_test.
y_pred_xgb = xgb.predict(X_test)

# Calculate and print the evaluation metrics.
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost Precision: {precision_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost Recall: {recall_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost F1 Score: {f1_score(y_test, y_pred_xgb):.4f}")

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Re-split the data so the train/test arrays reflect the current contents of X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Instantiate an XGBClassifier object.
# use_label_encoder is not passed: the option is deprecated/removed in current
# XGBoost releases and only produces a warning; the labels are already integers.
xgb = XGBClassifier(random_state=2, eval_metric='logloss')

# Train the XGBoost model.
xgb.fit(X_train, y_train)

# Make predictions on X_test.
y_pred_xgb = xgb.predict(X_test)

# Calculate and print the evaluation metrics.
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost Precision: {precision_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost Recall: {recall_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost F1 Score: {f1_score(y_test, y_pred_xgb):.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline with default hyperparameters and a fixed seed;
# fit() returns the estimator, so construction and training are chained.
rf = RandomForestClassifier(random_state=2).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rf = rf.predict(X_test)

# Print each evaluation metric on its own line, to four decimals.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"Random Forest {metric_name}: {metric_fn(y_test, y_pred_rf):.4f}")

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel and a fixed seed;
# fit() returns the estimator, so construction and training are chained.
# NOTE(review): X_train is a dense array here; training time on the full
# corpus may be substantial — consider timing this cell.
svm = SVC(kernel='linear', random_state=2).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_svm = svm.predict(X_test)

# Print each evaluation metric on its own line, to four decimals.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"SVM {metric_name}: {metric_fn(y_test, y_pred_svm):.4f}")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# k-nearest-neighbours baseline with k=5 as a starting point;
# fit() returns the estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_knn = knn.predict(X_test)

# Print each evaluation metric on its own line, to four decimals.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"KNN {metric_name}: {metric_fn(y_test, y_pred_knn):.4f}")