In [1]:
import pandas as pd
import gzip
import json
import numpy as np
import os
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from preprocessing.pre_process import clean_data_helpfulness
from sample.under_sampling import under_sample

In [2]:
filename="data/reviews_Office_Products_5.json.gz"
path=os.path.join(os.getcwd(), filename)

def parse(path):
    with gzip.open(path, 'rb') as f:
        for line in f:
            yield json.loads(line)

def getDF(path):
    try:
        df = {i: d for i, d in enumerate(parse(path))}
        return pd.DataFrame.from_dict(df, orient='index')
    except FileNotFoundError:
        print(f"Error: File '{path}' not found.")
    except Exception as e:
        print(f"Error: {e}")

amazon_reviews = getDF(path)

In [3]:
amazon_reviews = clean_data_helpfulness(amazon_reviews)

Cleaning the text...


In [4]:
amazon_reviews.head()

Unnamed: 0,clean_text,helpful,sentiment
0,bought first hp c serv faith lost travel searc...,"[3, 4]",Positive
1,belat review feel oblig share view old workhor...,"[7, 9]",Positive
2,hp gx kick twenti year hp year old still flawl...,"[3, 3]",Negative
3,start financ stuff recent went look good time ...,"[7, 8]",Positive
4,simpl calcul discount cash flow one still best...,"[0, 0]",Positive


In [5]:
# Extract helpful votes and total votes from the 'helpful' column
amazon_reviews['helpful_votes'] = amazon_reviews['helpful'].apply(lambda x: x[0])
amazon_reviews['total_votes'] = amazon_reviews['helpful'].apply(lambda x: x[1])

# Calculate the quality score for each review
# Quality score = Number of helpful votes / Total number of votes
# For reviews with no votes, the quality score will be NaN or 0 (we will handle this later)
amazon_reviews['helpfullness'] = amazon_reviews['helpful_votes'] / amazon_reviews['total_votes']
amazon_reviews['helpfullness'] = amazon_reviews['helpfullness'].fillna(0)

# Display the updated dataframe
amazon_reviews.head()

Unnamed: 0,clean_text,helpful,sentiment,helpful_votes,total_votes,helpfullness
0,bought first hp c serv faith lost travel searc...,"[3, 4]",Positive,3,4,0.75
1,belat review feel oblig share view old workhor...,"[7, 9]",Positive,7,9,0.777778
2,hp gx kick twenti year hp year old still flawl...,"[3, 3]",Negative,3,3,1.0
3,start financ stuff recent went look good time ...,"[7, 8]",Positive,7,8,0.875
4,simpl calcul discount cash flow one still best...,"[0, 0]",Positive,0,0,0.0


In [6]:
amazon_reviews.shape

(52799, 6)

In [7]:
# distribution of sentiment classes 
amazon_reviews.value_counts('sentiment')/len(amazon_reviews)*100

sentiment
Positive    85.090627
Neutral      9.526696
Negative     5.382678
Name: count, dtype: float64

In [8]:
# under sampling 
amazon_reviews_balanced = under_sample(amazon_reviews)

In [9]:
# distribution of sentiment classes after under sampling
amazon_reviews_balanced.value_counts('sentiment')/len(amazon_reviews_balanced)*100

sentiment
Negative    33.333333
Neutral     33.333333
Positive    33.333333
Name: count, dtype: float64

In [10]:
# X is both clean_text and helpfulness column
X = amazon_reviews_balanced[['clean_text', 'helpfullness']]
y = amazon_reviews_balanced['sentiment'] 

# split data into train and test sets
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [11]:
print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Train target shape: {y_train.shape}")
print(f"Test target shape: {y_test.shape}")

Train set shape: (6820, 2)
Test set shape: (1706, 2)
Train target shape: (6820,)
Test target shape: (1706,)


In [12]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [13]:
# check unique values in y_train and y_test
print(f"Unique values in y_train: {np.unique(y_train)}")
print(f"Unique values in y_test: {np.unique(y_test)}")

Unique values in y_train: [0 1 2]
Unique values in y_test: [0 1 2]


In [14]:
# hyper tune a svc model
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# vectorize the clean_text column
vectorizer = TfidfVectorizer()
clean_train_transformed = vectorizer.fit_transform(X_train['clean_text'])
clean_test_transformed = vectorizer.transform(X_test['clean_text'])

In [15]:
# concat the helpfulness column to the vectorized clean_text column
from scipy.sparse import hstack
X_train_transformed = hstack((clean_train_transformed, np.array(X_train['helpfullness'])[:,None]))
X_test_transformed = hstack((clean_test_transformed, np.array(X_test['helpfullness'])[:,None]))

In [16]:
# run svc model
svc = SVC(random_state=42,
          kernel='linear')
svc.fit(X_train_transformed, y_train)

In [17]:
#predict on test set
y_pred = svc.predict(X_test_transformed)

In [19]:
# plot diagram of decision boundary
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X=X_test_transformed, 
                      y=y_test,
                      clf=svc, 
                      legend=2)

ModuleNotFoundError: No module named 'mlxtend'

In [28]:
#accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6465416178194607

In [30]:
import joblib
# save the vectorizer
joblib.dump(vectorizer, 'models/vectorizer_helpful.pkl')

# save the label encoder
joblib.dump(label_encoder, 'models/label_encoder_helpful.pkl')

#save the model
import joblib
joblib.dump(svc, 'models/svc_helpful.pkl')


['models/svc_helpful.pkl']

## Run the model on the sample data

In [44]:
data = pd.read_csv('data/df_select.csv')

#clean the data
data = clean_data_helpfulness(data)

Cleaning the text...


In [45]:
data.head()

Unnamed: 0,clean_text,helpful,sentiment
0,never seen product curiou test glad fantast po...,"[0, 0]",Positive
1,order printer replac much larger canon could n...,"[1, 1]",Positive
2,new style sharpi origin one use like old style...,"[0, 0]",Positive
3,fairli price calcul rang ful complet geometri ...,"[2, 3]",Positive
4,replac toner recommend clean roller guid job b...,"[0, 0]",Positive


In [46]:
# Extract helpful votes and total votes from the 'helpful' using eval function
data['helpful_votes'] = data['helpful'].apply(lambda x:eval(x)[0])
data['total_votes'] = data['helpful'].apply(lambda x:eval(x)[1])

In [47]:
data.head()

Unnamed: 0,clean_text,helpful,sentiment,helpful_votes,total_votes
0,never seen product curiou test glad fantast po...,"[0, 0]",Positive,0,0
1,order printer replac much larger canon could n...,"[1, 1]",Positive,1,1
2,new style sharpi origin one use like old style...,"[0, 0]",Positive,0,0
3,fairli price calcul rang ful complet geometri ...,"[2, 3]",Positive,2,3
4,replac toner recommend clean roller guid job b...,"[0, 0]",Positive,0,0


In [48]:
# Calculate the quality score for each review
# Quality score = Number of helpful votes / Total number of votes
# For reviews with no votes, the quality score will be NaN or 0 (we will handle this later)
data['helpfullness'] = data['helpful_votes'] / data['total_votes']
data['helpfullness'] = data['helpfullness'].fillna(0)

# Display the updated dataframe
data.head()

Unnamed: 0,clean_text,helpful,sentiment,helpful_votes,total_votes,helpfullness
0,never seen product curiou test glad fantast po...,"[0, 0]",Positive,0,0,0.0
1,order printer replac much larger canon could n...,"[1, 1]",Positive,1,1,1.0
2,new style sharpi origin one use like old style...,"[0, 0]",Positive,0,0,0.0
3,fairli price calcul rang ful complet geometri ...,"[2, 3]",Positive,2,3,0.666667
4,replac toner recommend clean roller guid job b...,"[0, 0]",Positive,0,0,0.0


In [49]:
#load the vectorizer
vectorizer_load = joblib.load('models/vectorizer_helpful.pkl')
clean_sample_transformed = vectorizer_load.transform(data['clean_text'])
feature_transformed = hstack((clean_sample_transformed, np.array(data['helpfullness'])[:,None]))

In [50]:
# load the best model from the pickle file
model_path = 'models/svc_helpful.pkl'
# label_encoder_path = 'models/log_label_encoder.pkl'

best_model = joblib.load(model_path)
# label_encoder = joblib.load(label_encoder_path)

In [51]:
# make the predictions
y_pred = best_model.predict(feature_transformed)

In [52]:
#load the label encoder
label_encoder = joblib.load('models/label_encoder_helpful.pkl')
# labe encode the sentiment column
y_pred= label_encoder.inverse_transform(y_pred)

In [54]:
data['state_predicted'] = y_pred

In [55]:

# make a new dataframe 
data = data[['clean_text','sentiment','state_predicted']]

# save the data
data.to_csv('data/state_svc_predicted.csv', index=False)

In [56]:
# read comparison metrics
metrics = pd.read_csv('data/metrics_comparison_table.csv')

In [59]:
metrics.head(4)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1
0,Texblob,0.723751,0.80544,0.723751,0.757164
1,Vader,0.826707,0.801022,0.826707,0.812372
2,SVC,0.732926,0.89723,0.732926,0.782023
3,Logistic,0.713558,0.883101,0.713558,0.767337
