In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score
from imblearn.over_sampling import SMOTE
import pickle

In [6]:
df = pd.read_csv("data/parsed_reviews.zip", compression="zip")

In [7]:
df.columns

Index(['reviewText', 'labels', 'cleaned_reviews', 'parsed_reviews'], dtype='object')

In [59]:
df[df["labels"] == 0].iloc[3]["parsed_reviews"]

'use month phone decid quit mount lost pictur pet die shortli got replac card send old one give chanc recov content new one work fine fail month inexcus'

In [8]:
df.shape

(4772, 4)

In [9]:
df["labels"].value_counts()

labels
1.0    4448
0.0     324
Name: count, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df["parsed_reviews"], df["labels"], test_size=0.2)

In [11]:
y_train.value_counts()

labels
1.0    3553
0.0     264
Name: count, dtype: int64

In [12]:
y_test.value_counts()

labels
1.0    895
0.0     60
Name: count, dtype: int64

# Vectorize text using TF-IDF

In [15]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [19]:
type(X_train_tfidf), X_train_tfidf[0]

(scipy.sparse._csr.csr_matrix,
 <1x6413 sparse matrix of type '<class 'numpy.float64'>'
 	with 11 stored elements in Compressed Sparse Row format>)

# Handle imbalance

In [17]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [24]:
y_train_resampled.value_counts()

labels
1.0    3553
0.0    3553
Name: count, dtype: int64

# Logistic Regression

In [26]:
lr = LogisticRegression(C=1.0, penalty='l2')
lr.fit(X_train_resampled, y_train_resampled)

In [27]:
y_pred_lr = lr.predict(X_test_tfidf)

In [32]:
f1_score(y_test, y_pred_lr)

0.9758019133370849

In [31]:
accuracy_score(y_test, y_pred_lr)

0.9549738219895288

# Naive Bayes

Without SMOTE

In [37]:
naive_bayes = MultinomialNB(alpha=1.0)
naive_bayes.fit(X_train_tfidf, y_train)

In [38]:
y_pred_nb = naive_bayes.predict(X_test_tfidf)

In [39]:
f1_score(y_test, y_pred_nb)

0.9675675675675676

In [40]:
accuracy_score(y_test, y_pred_nb)

0.93717277486911

With SMOTE

In [41]:
naive_bayes_smote = MultinomialNB(alpha=1.0)
naive_bayes_smote.fit(X_train_resampled, y_train_resampled)

In [42]:
y_pred_nb_smote = naive_bayes_smote.predict(X_test_tfidf)

In [43]:
f1_score(y_test, y_pred_nb_smote)

0.9523255813953488

In [44]:
accuracy_score(y_test, y_pred_nb_smote)

0.9141361256544502

Conclusion: It's worse with oversampling

# Support Vector Machine

In [46]:
svm = SVC(C=1.0, max_iter=-1, tol=1e-3)
svm.fit(X_train_resampled, y_train_resampled)

In [47]:
y_pred_svm = svm.predict(X_test_tfidf)

In [48]:
f1_score(y_test, y_pred_svm)

0.9775342465753424

In [49]:
accuracy_score(y_test, y_pred_svm)

0.9570680628272251

# Saving models

In [51]:
pickle.dump(lr, open("models/logistic_regression_model.sav", "wb"))
pickle.dump(naive_bayes, open("models/naive_bayes_model.sav", "wb"))
pickle.dump(svm, open("models/svm_model.sav", "wb"))

In [52]:
pickle.dump(vectorizer, open("models/tfidf_vectorizer.sav", "wb"))