In [1]:
# Model Training and Evaluation
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, classification_report, confusion_matrix)
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the three pre-split CSV files from data/processed/.
print("="*50, "LOADING DATA", "="*50, sep="\n")

train_df = pd.read_csv('data/processed/train.csv')
val_df = pd.read_csv('data/processed/val.csv')
test_df = pd.read_csv('data/processed/test.csv')

# For every split, the `text` column is the model input and `label` is the target.
X_train, y_train = train_df['text'], train_df['label']
X_val, y_val = val_df['text'], val_df['label']
X_test, y_test = test_df['text'], test_df['label']

# Report split sizes as a quick sanity check on the load.
print(
    f"Training samples: {len(X_train)}",
    f"Validation samples: {len(X_val)}",
    f"Test samples: {len(X_test)}",
    sep="\n",
)

LOADING DATA
Training samples: 34707
Validation samples: 7437
Test samples: 7438


In [3]:
# Feature extraction: turn the text of each split into sparse TF-IDF vectors.
print("\n" + "="*50, "FEATURE EXTRACTION", "="*50, sep="\n")

print("Creating TF-IDF features...")

tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),    # unigrams and bigrams
    max_features=20000,    # keep only the 20,000 most frequent terms
    min_df=3,              # ignore terms that occur in fewer than 3 documents
    max_df=0.9,            # ignore terms that occur in more than 90% of documents
    sublinear_tf=True,     # use 1 + log(tf) instead of the raw term count
    norm='l2',
)

# Vocabulary and IDF weights are learned from the training split only; the
# validation and test text is then projected onto that fixed vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split) for split in (X_val, X_test)
)

print(f"TF-IDF feature shape: {X_train_tfidf.shape}")

# Persist the fitted vectorizer so the same vocabulary can be reloaded later.
with open('models/tfidf_vectorizer_2.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
print("✓ TF-IDF vectorizer saved")


FEATURE EXTRACTION
Creating TF-IDF features...
TF-IDF feature shape: (34707, 20000)
✓ TF-IDF vectorizer saved


In [4]:
# LogisticRegression is already imported in the notebook's first cell; only
# GridSearchCV is new here.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for an L2-regularised logistic regression on the
# TF-IDF features: 4 values of C x 5 folds = 20 fits, ranked by F1.
# NOTE(review): X_train_tfidf was vectorized on the whole training split before
# cross-validation, so each fold's held-out part already influenced the
# vocabulary / IDF weights and the CV score is slightly optimistic.
# NOTE(review): the recorded run selected C=3, the largest value in the grid;
# consider extending the grid upwards.
param_grid = {
    'C': [0.5, 1, 2, 3],
    'penalty': ['l2'],
    'solver': ['liblinear'],
}

log_reg = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=42)

# `fit` returns the search object itself, so construction and fitting can be chained.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=1,
).fit(X_train_tfidf, y_train)

# With the default refit=True this estimator has been refit on the full
# training matrix using the winning parameters.
best_model = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation F1 score:", grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'C': 3, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation F1 score: 0.8957083477615478


In [6]:
# Validation-set performance of the tuned logistic regression.
y_val_pred = best_model.predict(X_val_tfidf)

# precision / recall / F1 use scikit-learn's default binary averaging,
# i.e. they are reported for the positive class (label 1).
accuracy, precision, recall, f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"  Accuracy:  {accuracy:.4f}",
    f"  Precision: {precision:.4f}",
    f"  Recall:    {recall:.4f}",
    f"  F1 Score:  {f1:.4f}",
    sep="\n",
)

  Accuracy:  0.8981
  Precision: 0.8909
  Recall:    0.9081
  F1 Score:  0.8994


In [9]:
from sklearn.base import clone

# Final fit: merge the training and validation splits and retrain the selected
# configuration on all labelled data that is not part of the test split.
# ignore_index=True gives the merged Series a clean 0..n-1 index instead of
# repeating the labels of both source frames.
X_train_full = pd.concat([X_train, X_val], ignore_index=True)
y_train_full = pd.concat([y_train, y_val], ignore_index=True)

# Re-learn vocabulary and IDF weights on the merged text, then re-encode the
# test split with the new vocabulary.
# NOTE(review): this refits `tfidf_vectorizer` in place. X_train_tfidf /
# X_val_tfidf from the feature-extraction cell no longer match it, and the copy
# pickled to 'models/tfidf_vectorizer_2.pkl' still holds the train-only
# vocabulary. Do not re-run the validation cell after this one, and re-save the
# vectorizer together with the model before using either for inference.
X_train_full_tfidf = tfidf_vectorizer.fit_transform(X_train_full)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# `best_model` is the same object as `grid_search.best_estimator_`; fitting it
# in place would silently overwrite the cross-validated estimator. Refit an
# unfitted clone (same hyper-parameters) and rebind the name instead.
best_model = clone(best_model).fit(X_train_full_tfidf, y_train_full)
best_model


In [10]:
# Held-out test evaluation of the model that was refit on train + validation data.
y_test_pred = best_model.predict(X_test_tfidf)

# scikit-learn layout: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_test_pred)

print("FINAL TEST RESULTS", classification_report(y_test, y_test_pred), sep="\n")

FINAL TEST RESULTS
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3705
           1       0.89      0.91      0.90      3733

    accuracy                           0.90      7438
   macro avg       0.90      0.90      0.90      7438
weighted avg       0.90      0.90      0.90      7438



In [11]:
# Raw test-set confusion counts (scikit-learn layout: rows = true label,
# columns = predicted label).
print(conf_matrix)

[[3291  414]
 [ 318 3415]]


In [20]:
from src.predict import SentimentPredictor

# Load a previously saved model / vectorizer pair for inference.
# NOTE(review): no cell shown in this notebook writes 'models/best_model.pkl',
# and the vectorizer saved above goes to 'models/tfidf_vectorizer_2.pkl', not
# 'models/tfidf_vectorizer.pkl'. Confirm these two artifacts were produced
# together and are the ones intended here.
predictor = SentimentPredictor(
    vectorizer_path='models/tfidf_vectorizer.pkl',
    model_path='models/best_model.pkl',
)

# Single prediction: the result is a dict with the keys
# 'text', 'sentiment', 'label' and 'confidence'.
result = predictor.predict("This movie was fantastic!")
print(result)

# Batch prediction: one result dict per input text, in input order.
texts = ["Great movie!", "Terrible film."]
results = predictor.predict_batch(texts)
print(results)

{'text': 'This movie was fantastic!', 'sentiment': 'positive', 'label': 1, 'confidence': 0.9511326401485647}
[{'text': 'Great movie!', 'sentiment': 'positive', 'label': 1, 'confidence': 0.9683028804711064}, {'text': 'Terrible film.', 'sentiment': 'negative', 'label': 0, 'confidence': 0.9925853752559761}]


In [7]:
# Quick look at the training text; pandas truncates the Series repr to its
# first and last rows, so the output stays short.
print(X_train)

0        work library expected like movie came year ago...
1        eagle wing pleasant surprise movie keep viewer...
2        new york love collective work eleven short fil...
3        saw movie yesterday night one best made tv fil...
4        playwright sidney bruhl wonderfully overthetop...
                               ...                        
34702    love movie tv programed record come nov nd rea...
34703    big jim carey fan took seat cinema optimism fu...
34704    even buck cast parttime actor christopher nola...
34705    one best movie ive ever seen good acting hank ...
34706    growing voyage space favorite movie remember t...
Name: text, Length: 34707, dtype: object


# Trying a deep learning model (bidirectional LSTM) with TensorFlow

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

MAX_WORDS = 20000   # vocabulary cap passed to the tokenizer (and reused as the embedding input size)
MAX_LEN = 300       # every sequence is padded / truncated to this many token ids

# The word index is learned from the training text only; words outside it are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer token ids, one list per document.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val, X_test)
)

# Lists -> fixed-width integer matrices. padding='post' appends zeros to short
# sequences; `truncating` is left at its Keras default.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seq, maxlen=MAX_LEN, padding='post')
    for seq in (X_train_seq, X_val_seq, X_test_seq)
)


  if not hasattr(np, "object"):


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout

# Binary sentiment classifier: token ids -> 128-d embeddings -> bidirectional
# LSTM (64 units per direction, final state only) -> dense head with dropout
# -> sigmoid probability.
#
# The input shape is declared with an explicit Input layer instead of
# Embedding(input_length=...): Keras 3 deprecates and ignores `input_length`,
# which leaves the model unbuilt, so `model.summary()` cannot report output
# shapes or parameter counts. An Input layer works on both Keras 2 and Keras 3.
#
# NOTE(review): no TensorFlow random seed is set anywhere in the notebook, so
# weight initialisation and dropout differ from run to run.
model = Sequential([
    Input(shape=(MAX_LEN,)),
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()




In [11]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has gone 2 epochs without improving,
# and roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train for at most 10 epochs, checking the validation split after each one.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stop],
)


Epoch 1/10
[1m394/543[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m35s[0m 239ms/step - accuracy: 0.6879 - loss: 0.5660

KeyboardInterrupt: 

In [None]:
# Score the LSTM on the padded test split. `evaluate` returns the loss followed
# by the metrics given to `compile`, which here is accuracy only.
# NOTE(review): this rebinds `accuracy`, the name that also holds the
# logistic-regression validation accuracy computed earlier in the notebook.
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test)
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
# Persist the trained network together with the fitted tokenizer; inference
# needs both, because the token ids are only meaningful with this tokenizer.
#
# - Keras 3 rejects `model.save` paths without a `.keras` (or `.h5`) extension,
#   so the native `.keras` format is requested explicitly.
# - Paths are relative to the working directory ('models/...'), matching every
#   other cell in this notebook, instead of '../models/...'.
# - `pickle` is already imported in the notebook's first cell.
model.save("models/sentiment_lstm_model.keras")

with open("models/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
