In [9]:
import pandas as pd
import string
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np

In [10]:
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)

print(f"NLTK data directory: {nltk_data_dir}")
print(f"punkt exists: {os.path.exists(os.path.join(nltk_data_dir, 'tokenizers', 'punkt'))}")
print(f"stopwords exists: {os.path.exists(os.path.join(nltk_data_dir, 'corpora', 'stopwords'))}")
print(f"wordnet exists: {os.path.exists(os.path.join(nltk_data_dir, 'corpora', 'wordnet'))}")
print(f"punkt_tab exists: {os.path.exists(os.path.join(nltk_data_dir, 'tokenizers', 'punkt_tab'))}")

NLTK data directory: C:\Users\Ziad\nltk_data
punkt exists: True
stopwords exists: True
wordnet exists: False
punkt_tab exists: True


[nltk_data] Downloading package punkt to C:\Users\Ziad\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ziad\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ziad\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
file_path = r"D:\Courses\GP-Cinemate\ML\machine-learning-dev\preprocessing\reviews.txt"
if os.path.exists(file_path):
    print("File found!")
    data = pd.read_csv(file_path, sep='\t', header=None, names=['sentiment', 'review'])
    print(data.head())
else:
    print("File not found at:", file_path)

File found!
   sentiment                                             review
0          1            The Da Vinci Code book is just awesome.
1          1  this was the first clive cussler i've ever rea...
2          1                   i liked the Da Vinci Code a lot.
3          1                   i liked the Da Vinci Code a lot.
4          1  I liked the Da Vinci Code but it ultimatly did...


In [12]:
data = data.drop_duplicates(subset='review')
data = data.dropna(subset=['review'])

In [13]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [14]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [15]:
data['cleaned_review'] = data['review'].apply(preprocess_text)

In [16]:
X = data['cleaned_review']
y = data['sentiment']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [19]:
models = {
    'SVM': SVC(probability=True),
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

In [20]:
param_grids = {
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'Naive Bayes': {
        'alpha': [0.1, 0.5, 1.0, 2.0]
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear'],
        'penalty': ['l2']
    }
}

In [None]:
best_models = {}
print("\nGrid Search Results:")
for name in models.keys():
    print(f"\nTuning {name}...")
    grid_search = GridSearchCV(
        models[name],
        param_grids[name],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train_tfidf, y_train)
    
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")


Grid Search Results:

Tuning SVM...
Best parameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation score: 0.9690

Tuning Naive Bayes...
Best parameters for Naive Bayes: {'alpha': 0.5}
Best cross-validation score: 0.9467

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validation score: 0.9690


In [None]:
print("\nTest Set Evaluation:")
best_model_name = None
best_test_score = 0
for name, model in best_models.items():
    y_pred = model.predict(X_test_tfidf)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Test Results:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))
    
    if test_accuracy > best_test_score:
        best_test_score = test_accuracy
        best_model_name = name


Test Set Evaluation:

SVM Test Results:
Accuracy: 0.9768
Classification Report:
              precision    recall  f1-score   support

    negative       0.96      0.98      0.97       106
    positive       0.99      0.97      0.98       153

    accuracy                           0.98       259
   macro avg       0.97      0.98      0.98       259
weighted avg       0.98      0.98      0.98       259


Naive Bayes Test Results:
Accuracy: 0.9382
Classification Report:
              precision    recall  f1-score   support

    negative       0.96      0.89      0.92       106
    positive       0.93      0.97      0.95       153

    accuracy                           0.94       259
   macro avg       0.94      0.93      0.94       259
weighted avg       0.94      0.94      0.94       259


Logistic Regression Test Results:
Accuracy: 0.9807
Classification Report:
              precision    recall  f1-score   support

    negative       0.98      0.97      0.98       106
    positive  

In [23]:
print(f"\nBest model overall: {best_model_name}")
joblib.dump(best_models[best_model_name], 'sentiment_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
print("Best model and vectorBraizer saved successfully.")


Best model overall: Logistic Regression
Best model and vectorBraizer saved successfully.
