In [1]:
import pandas as pd

df = pd.read_csv('../data/cleaned_reviews.csv')
print(df.shape)
df.head()

(162819, 4)


Unnamed: 0,reviewText,summary,sentiment,cleaned_text
0,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1,awesome listen musthave slayer fanssadly neede...
1,bien,Five Stars,1,bien
2,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1,great hear old stuff like new stuff recommend ...
3,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1,well best ofs bit poison normally bad pretty g...
4,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1,say casting crownsthis good blessing filled cd


In [2]:
from sklearn.model_selection import train_test_split

X = df['cleaned_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training size:", X_train.shape)
print("Testing size:", X_test.shape)

Training size: (130255,)
Testing size: (32564,)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Training matrix shape:", X_train_tfidf.shape)
print("Testing matrix shape:", X_test_tfidf.shape)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [4]:
df = df.dropna(subset=['cleaned_text'])

X = df['cleaned_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Training matrix shape:", X_train_tfidf.shape)
print("Testing matrix shape:", X_test_tfidf.shape)

Training matrix shape: (129363, 10000)
Testing matrix shape: (32341, 10000)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.25      0.39       743
           1       0.98      1.00      0.99     31598

    accuracy                           0.98     32341
   macro avg       0.92      0.62      0.69     32341
weighted avg       0.98      0.98      0.98     32341



In [6]:
model_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
model_balanced.fit(X_train_tfidf, y_train)

y_pred_balanced = model_balanced.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_balanced))

              precision    recall  f1-score   support

           0       0.22      0.83      0.35       743
           1       1.00      0.93      0.96     31598

    accuracy                           0.93     32341
   macro avg       0.61      0.88      0.65     32341
weighted avg       0.98      0.93      0.95     32341



In [7]:
from sklearn.utils import resample

# Separate classes
df_positive = df[df['sentiment'] == 1]
df_negative = df[df['sentiment'] == 0]

# Undersample positive to match negative count
df_positive_undersampled = resample(df_positive, 
                                     n_samples=len(df_negative),
                                     random_state=42)

# Combine
df_balanced = pd.concat([df_positive_undersampled, df_negative])
print(df_balanced['sentiment'].value_counts())

sentiment
1    3986
0    3986
Name: count, dtype: int64


In [8]:
X_balanced = df_balanced['cleaned_text']
y_balanced = df_balanced['sentiment']

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

vectorizer_b = TfidfVectorizer(max_features=10000)
X_train_b_tfidf = vectorizer_b.fit_transform(X_train_b)
X_test_b_tfidf = vectorizer_b.transform(X_test_b)

model_b = LogisticRegression(max_iter=1000)
model_b.fit(X_train_b_tfidf, y_train_b)

y_pred_b = model_b.predict(X_test_b_tfidf)
print(classification_report(y_test_b, y_pred_b))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87       783
           1       0.90      0.83      0.87       812

    accuracy                           0.87      1595
   macro avg       0.87      0.87      0.87      1595
weighted avg       0.87      0.87      0.87      1595



In [9]:
import joblib

joblib.dump(model_b, '../model/sentiment_model.pkl')
joblib.dump(vectorizer_b, '../model/vectorizer.pkl')

print("Model and vectorizer saved!")

Model and vectorizer saved!
