In [1]:
%load_ext lab_black
import pandas as pd
import numpy as np
import time
import ast
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics

In [2]:
# Load the review dataset from the project-relative data directory.
reviews_csv = "../data/w2v_yelp.csv"
df_reviews = pd.read_csv(reviews_csv)

In [3]:
# Preview the first five rows to check which columns were loaded.
df_reviews.head(n=5)

Unnamed: 0,stars,Review_Labels,cleaned,w2v
0,2.0,0,someon ha work mani museum wa eager visit thi ...,"[-0.7833308577537537, -0.6459603905677795, 1.5..."
1,1.0,0,actual horrifi thi place still busi year old s...,"[2.3488664627075195, 0.18374943733215332, 1.44..."
2,5.0,2,love deagan realli atmospher cozi festiv shrim...,"[0.10850790143013, 0.16999416053295135, 0.9217..."
3,1.0,0,dismal lukewarm defrostedtast texmex glop mumb...,"[-0.7600313425064087, -0.5083361864089966, 0.9..."
4,4.0,2,oh happi day final cane near casa ye gripe dri...,"[0.5988718867301941, 0.927848756313324, 0.2136..."


In [4]:
# Hold out 30% of the rows for testing. The split is stratified on
# Review_Labels and seeded so it is repeatable.
review_labels = df_reviews["Review_Labels"]
train, test = train_test_split(
    df_reviews,
    test_size=0.3,
    random_state=42,
    stratify=review_labels,
)

In [5]:
# Pull the label column and the (still string-encoded) w2v column out of each
# partition as arrays, training partition first.
y_train, y_test = (part["Review_Labels"].values for part in (train, test))
x_train, x_test = (part["w2v"].values for part in (train, test))

In [6]:
# Every w2v entry is a Python-literal string; parse it and coerce the values to
# float, producing one list of floats per training row.
x_train_cleaned = [
    list(np.asarray(ast.literal_eval(raw_vector), dtype=float))
    for raw_vector in x_train
]

In [7]:
# Parse the string-encoded vectors of the test partition into lists of floats.
x_test_cleaned = []
for raw_vector in x_test:
    parsed_vector = np.array(ast.literal_eval(raw_vector), dtype=float)
    x_test_cleaned.append(list(parsed_vector))

### Logistic Regression: Word2Vec

In [8]:
# Logistic regression on the parsed Word2Vec vectors, scored on the test split.
# The timer spans both fitting and prediction.
start_time = time.time()
lr = LogisticRegression(max_iter=1000, random_state=42, solver="liblinear").fit(
    x_train_cleaned, y_train
)
y_pred = lr.predict(x_test_cleaned)
elapsed_mins = round((time.time() - start_time) / 60, 2)
print("Time takes to train model and make predictions: ", elapsed_mins, " mins")

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Test Accuracy: ", round(test_accuracy, 3))

# NOTE(review): micro-averaged F1 on single-label predictions coincides with
# accuracy, so the two printed numbers match.
test_f1 = metrics.f1_score(y_test, y_pred, average="micro")
print("Test F1: ", round(test_f1, 3))

Time takes to train model and make predictions:  0.11  mins
Test Accuracy:  0.738
Test F1:  0.738


In [10]:
# Refit the same logistic-regression configuration on the Word2Vec vectors and
# score it on the data it was trained on.
start_time = time.time()
lr = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
lr.fit(x_train_cleaned, y_train)
y_pred = lr.predict(x_train_cleaned)
elapsed_seconds = time.time() - start_time
print(
    "Time takes to train model and make predictions: ",
    round(elapsed_seconds / 60, 2),
    " mins",
)
train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred)
print("Train Accuracy: ", round(train_accuracy, 3))
train_f1 = metrics.f1_score(y_true=y_train, y_pred=y_pred, average="micro")
print("Train F1: ", round(train_f1, 3))

Time takes to train model and make predictions:  0.12  mins
Train Accuracy:  0.738
Train F1:  0.738


### Logistic Regression: BOW

In [11]:
# Binary bag-of-words features (binary=True records term presence, not counts).
# The vocabulary is learned from the training split only. fit_transform already
# returns the training matrix, so keep it rather than discarding it and
# vectorizing the training text a second time with transform().
cv = CountVectorizer(binary=True)
train_feature_set = cv.fit_transform(train["cleaned"].values)
# The test split is encoded with the vocabulary fitted on the training split.
test_feature_set = cv.transform(test["cleaned"].values)

In [13]:
# Logistic regression on the bag-of-words matrix, scored on the test split.
start_time = time.time()
lr = LogisticRegression(random_state=42, solver="liblinear", max_iter=1000)
y_pred = lr.fit(train_feature_set, y_train).predict(test_feature_set)
minutes_taken = round((time.time() - start_time) / 60, 2)
print("Time takes to train model and make predictions: ", minutes_taken, " mins")

accuracy_rounded = round(metrics.accuracy_score(y_test, y_pred), 3)
print("Test Accuracy: ", accuracy_rounded)
f1_rounded = round(metrics.f1_score(y_test, y_pred, average="micro"), 3)
print("Test F1: ", f1_rounded)

Time takes to train model and make predictions:  0.28  mins
Test Accuracy:  0.842
Test F1:  0.842


In [14]:
# Same bag-of-words logistic regression, refit and scored on the training split.
start_time = time.time()
lr = LogisticRegression(max_iter=1000, solver="liblinear", random_state=42)
lr.fit(train_feature_set, y_train)
y_pred = lr.predict(train_feature_set)
elapsed_seconds = time.time() - start_time
elapsed_mins = round(elapsed_seconds / 60, 2)
print("Time takes to train model and make predictions: ", elapsed_mins, " mins")
print(
    "Train Accuracy: ",
    round(metrics.accuracy_score(y_true=y_train, y_pred=y_pred), 3),
)
print(
    "Train F1: ",
    round(metrics.f1_score(y_true=y_train, y_pred=y_pred, average="micro"), 3),
)

Time takes to train model and make predictions:  0.28  mins
Train Accuracy:  0.927
Train F1:  0.927


### Logistic Regression: TFIDF

In [16]:
# TF-IDF features. min_df=10 ignores terms found in fewer than 10 training
# documents; max_df=0.95 ignores terms found in more than 95% of them.
start_time = time.time()
tfidf_v = TfidfVectorizer(use_idf=True, min_df=10, max_df=0.95)
# fit_transform learns the vocabulary and IDF weights and returns the training
# matrix in one pass. Keeping its result avoids a redundant second pass over
# the training text, which previously inflated the timing reported below.
train_feature_set_tfidf = tfidf_v.fit_transform(train["cleaned"].values)
# The test split reuses the vocabulary and IDF weights fitted on the training split.
test_feature_set_tfidf = tfidf_v.transform(test["cleaned"].values)
print(
    "Time takes to convert text input into feature vector: ",
    round((time.time() - start_time) / 60, 2),
    " mins",
)

Time takes to convert text input into feature vector:  0.13  mins


In [17]:
# Logistic regression on the TF-IDF matrix, scored on the test split.
start_time = time.time()
lr = LogisticRegression(solver="liblinear", random_state=42, max_iter=1000).fit(
    train_feature_set_tfidf, y_train
)
y_pred = lr.predict(test_feature_set_tfidf)
duration_mins = (time.time() - start_time) / 60
print(
    "Time takes to train model and make predictions: ",
    round(duration_mins, 2),
    " mins",
)
tfidf_test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Test Accuracy: ", round(tfidf_test_accuracy, 3))
tfidf_test_f1 = metrics.f1_score(y_test, y_pred, average="micro")
print("Test F1: ", round(tfidf_test_f1, 3))

Time takes to train model and make predictions:  0.04  mins
Test Accuracy:  0.849
Test F1:  0.849


In [18]:
# Same TF-IDF logistic regression, refit and scored on the training split.
start_time = time.time()
lr = LogisticRegression(random_state=42, max_iter=1000, solver="liblinear")
y_pred = lr.fit(train_feature_set_tfidf, y_train).predict(train_feature_set_tfidf)
elapsed_mins = round((time.time() - start_time) / 60, 2)
print("Time takes to train model and make predictions: ", elapsed_mins, " mins")
train_accuracy = round(metrics.accuracy_score(y_train, y_pred), 3)
print("Train Accuracy: ", train_accuracy)
train_f1 = round(metrics.f1_score(y_train, y_pred, average="micro"), 3)
print("Train F1: ", train_f1)

Time takes to train model and make predictions:  0.04  mins
Train Accuracy:  0.868
Train F1:  0.868


### Random Forest: Word2Vec

In [20]:
# 200-tree random forest on the parsed Word2Vec vectors, scored on the test
# split. The timer spans both fitting and prediction.
start_time = time.time()
rf = RandomForestClassifier(random_state=0, n_estimators=200).fit(
    x_train_cleaned, y_train
)
y_pred = rf.predict(x_test_cleaned)
elapsed_mins = round((time.time() - start_time) / 60, 2)
print("Time takes to train model and make predictions: ", elapsed_mins, " mins")

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Test Accuracy: ", round(test_accuracy, 3))

# NOTE(review): micro-averaged F1 on single-label predictions coincides with
# accuracy, so the two printed numbers match.
test_f1 = metrics.f1_score(y_test, y_pred, average="micro")
print("Test F1: ", round(test_f1, 3))

Time takes to train model and make predictions:  3.63  mins
Test Accuracy:  0.7
Test F1:  0.7


In [21]:
# Refit the same random-forest configuration on the Word2Vec vectors and score
# it on the data it was trained on.
start_time = time.time()
rf = RandomForestClassifier(n_estimators=200, random_state=0)
rf.fit(x_train_cleaned, y_train)
y_pred = rf.predict(x_train_cleaned)
elapsed_seconds = time.time() - start_time
print(
    "Time takes to train model and make predictions: ",
    round(elapsed_seconds / 60, 2),
    " mins",
)
train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred)
print("Train Accuracy: ", round(train_accuracy, 3))
train_f1 = metrics.f1_score(y_true=y_train, y_pred=y_pred, average="micro")
print("Train F1: ", round(train_f1, 3))

Time takes to train model and make predictions:  3.64  mins
Train Accuracy:  1.0
Train F1:  1.0


### Random Forest: BOW

In [22]:
# Rebuild the binary bag-of-words features for the random-forest runs.
# fit_transform learns the vocabulary from the training split and returns its
# matrix in one step. Keeping that result removes the redundant second
# vectorization of the training text that followed a discarded fit_transform.
cv = CountVectorizer(binary=True)
train_feature_set = cv.fit_transform(train["cleaned"].values)
# The test split is encoded with the vocabulary fitted on the training split.
test_feature_set = cv.transform(test["cleaned"].values)

In [23]:
# Random forest on the bag-of-words matrix, scored on the test split.
start_time = time.time()
rf = RandomForestClassifier(random_state=0, n_estimators=200)
y_pred = rf.fit(train_feature_set, y_train).predict(test_feature_set)
minutes_taken = round((time.time() - start_time) / 60, 2)
print("Time takes to train model and make predictions: ", minutes_taken, " mins")

accuracy_rounded = round(metrics.accuracy_score(y_test, y_pred), 3)
print("Test Accuracy: ", accuracy_rounded)
f1_rounded = round(metrics.f1_score(y_test, y_pred, average="micro"), 3)
print("Test F1: ", f1_rounded)

Time takes to train model and make predictions:  8.12  mins
Test Accuracy:  0.791
Test F1:  0.791


In [24]:
# Same bag-of-words random forest, refit and scored on the training split.
start_time = time.time()
rf = RandomForestClassifier(n_estimators=200, random_state=0)
rf.fit(train_feature_set, y_train)
y_pred = rf.predict(train_feature_set)
elapsed_seconds = time.time() - start_time
elapsed_mins = round(elapsed_seconds / 60, 2)
print("Time takes to train model and make predictions: ", elapsed_mins, " mins")
print(
    "Train Accuracy: ",
    round(metrics.accuracy_score(y_true=y_train, y_pred=y_pred), 3),
)
print(
    "Train F1: ",
    round(metrics.f1_score(y_true=y_train, y_pred=y_pred, average="micro"), 3),
)

Time takes to train model and make predictions:  8.45  mins
Train Accuracy:  1.0
Train F1:  1.0


### Random Forest: TFIDF

In [25]:
# Rebuild the TF-IDF features for the random-forest runs. min_df=10 ignores
# terms found in fewer than 10 training documents; max_df=0.95 ignores terms
# found in more than 95% of them.
start_time = time.time()
tfidf_v = TfidfVectorizer(use_idf=True, min_df=10, max_df=0.95)
# Keep the matrix returned by fit_transform instead of discarding it and
# re-transforming the training text. The result is the same, with one pass
# fewer inside the timed region.
train_feature_set_tfidf = tfidf_v.fit_transform(train["cleaned"].values)
# The test split reuses the vocabulary and IDF weights fitted on the training split.
test_feature_set_tfidf = tfidf_v.transform(test["cleaned"].values)
print(
    "Time takes to convert text input into feature vector: ",
    round((time.time() - start_time) / 60, 2),
    " mins",
)

Time takes to convert text input into feature vector:  0.14  mins


In [26]:
# Random forest on the TF-IDF matrix, scored on the test split.
start_time = time.time()
rf = RandomForestClassifier(n_estimators=200, random_state=0).fit(
    train_feature_set_tfidf, y_train
)
y_pred = rf.predict(test_feature_set_tfidf)
duration_mins = (time.time() - start_time) / 60
print(
    "Time takes to train model and make predictions: ",
    round(duration_mins, 2),
    " mins",
)
tfidf_test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Test Accuracy: ", round(tfidf_test_accuracy, 3))
tfidf_test_f1 = metrics.f1_score(y_test, y_pred, average="micro")
print("Test F1: ", round(tfidf_test_f1, 3))

Time takes to train model and make predictions:  6.5  mins
Test Accuracy:  0.805
Test F1:  0.805


In [27]:
# Same TF-IDF random forest, refit and scored on the training split.
start_time = time.time()
rf = RandomForestClassifier(random_state=0, n_estimators=200)
y_pred = rf.fit(train_feature_set_tfidf, y_train).predict(train_feature_set_tfidf)
elapsed_mins = round((time.time() - start_time) / 60, 2)
print("Time takes to train model and make predictions: ", elapsed_mins, " mins")
train_accuracy = round(metrics.accuracy_score(y_train, y_pred), 3)
print("Train Accuracy: ", train_accuracy)
train_f1 = round(metrics.f1_score(y_train, y_pred, average="micro"), 3)
print("Train F1: ", train_f1)

Time takes to train model and make predictions:  6.48  mins
Train Accuracy:  1.0
Train F1:  1.0
