In [3]:
import pandas as pd
import gzip
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from preprocessing.pre_process import clean_data_helpfulness
from sample.under_sampling import under_sample
from network.svc import svc_hypertune

In [4]:
# Gzipped review dump, located relative to the current working directory
filename = "data/reviews_Office_Products_5.json.gz"
path = os.path.join(os.getcwd(), filename)

def parse(path):
    """Lazily decode a gzip-compressed JSON-lines file.

    Opens ``path`` with gzip in binary mode and yields the result of
    ``json.loads`` for each line, in file order. The file stays open only
    while the generator is being consumed.
    """
    with gzip.open(path, 'rb') as handle:
        yield from map(json.loads, handle)

def getDF(path):
    """Load a gzipped JSON-lines file into a DataFrame, one row per record.

    Records are read through ``parse`` and numbered 0..n-1 in file order;
    those numbers become the row index and the record keys become columns.

    Args:
        path: Location of the ``.json.gz`` file.

    Returns:
        pandas.DataFrame with one row per decoded line.

    Raises:
        FileNotFoundError: If ``path`` does not exist. A short message is
            printed first, then the exception is re-raised so the caller
            never receives ``None`` in place of a DataFrame.
        Exception: Any other read/decode error propagates unchanged with
            its full traceback.
    """
    try:
        # parse() is a generator, so the file is only opened (and can only
        # fail) while this comprehension is consuming it.
        records = {i: d for i, d in enumerate(parse(path))}
    except FileNotFoundError:
        print(f"Error: File '{path}' not found.")
        raise
    return pd.DataFrame.from_dict(records, orient='index')

# Read the gzipped review file at `path` into a DataFrame (one row per JSON line)
amazon_reviews = getDF(path)

In [5]:
# Rebind amazon_reviews to the output of the project helper clean_data_helpfulness.
# The raw frame is no longer reachable under this name afterwards, so re-running
# this cell on its own feeds already-cleaned data back into the helper.
# NOTE(review): the helper lives in preprocessing/pre_process.py and is not shown here;
# judging by the preview below it returns clean_text, helpful and sentiment — confirm.
amazon_reviews = clean_data_helpfulness(amazon_reviews)

Cleaning the text...


In [6]:
# Preview the first rows of the cleaned frame
amazon_reviews.head()

Unnamed: 0,clean_text,helpful,sentiment
0,bought first hp c serv faith lost travel searc...,"[3, 4]",Positive
1,belat review feel oblig share view old workhor...,"[7, 9]",Positive
2,hp gx kick twenti year hp year old still flawl...,"[3, 3]",Negative
3,start financ stuff recent went look good time ...,"[7, 8]",Positive
4,simpl calcul discount cash flow one still best...,"[0, 0]",Positive


In [7]:
# Each 'helpful' entry is indexed as a pair: position 0 holds the helpful votes,
# position 1 the total votes (e.g. [3, 4] in the preview above).
amazon_reviews['helpful_votes'] = amazon_reviews['helpful'].map(lambda votes: votes[0])
amazon_reviews['total_votes'] = amazon_reviews['helpful'].map(lambda votes: votes[1])

# Helpfulness ratio = helpful votes / total votes.
# A review without any votes gives 0 / 0 = NaN, which is replaced by 0 straight away.
amazon_reviews['helpfullness'] = (
    amazon_reviews['helpful_votes']
    .div(amazon_reviews['total_votes'])
    .fillna(0)
)

# Preview the frame with the three new columns
amazon_reviews.head()

Unnamed: 0,clean_text,helpful,sentiment,helpful_votes,total_votes,helpfullness
0,bought first hp c serv faith lost travel searc...,"[3, 4]",Positive,3,4,0.75
1,belat review feel oblig share view old workhor...,"[7, 9]",Positive,7,9,0.777778
2,hp gx kick twenti year hp year old still flawl...,"[3, 3]",Negative,3,3,1.0
3,start financ stuff recent went look good time ...,"[7, 8]",Positive,7,8,0.875
4,simpl calcul discount cash flow one still best...,"[0, 0]",Positive,0,0,0.0


In [8]:
# Size of the working frame as (n_rows, n_columns)
amazon_reviews.shape

(52799, 6)

In [9]:
# Share of each sentiment class, as a percentage of all reviews
amazon_reviews.value_counts('sentiment').div(len(amazon_reviews)).mul(100)

sentiment
Positive    85.090627
Neutral      9.526696
Negative     5.382678
Name: count, dtype: float64

In [10]:
# Under-sample with the project helper so the sentiment classes are evenly represented
# (the distribution printed in the next cell shows three equal classes).
# NOTE(review): under_sample is defined in sample/under_sampling.py and is not shown here —
# confirm whether it uses a fixed seed and whether it resets the row index.
amazon_reviews_balanced = under_sample(amazon_reviews)

In [11]:
# Share of each sentiment class (in percent) once the frame has been under-sampled
amazon_reviews_balanced.value_counts('sentiment').div(len(amazon_reviews_balanced)).mul(100)

sentiment
Negative    33.333333
Neutral     33.333333
Positive    33.333333
Name: count, dtype: float64

In [12]:
# Features: the cleaned review text plus the helpfulness ratio; target: the sentiment label
X = amazon_reviews_balanced[['clean_text', 'helpfullness']]
y = amazon_reviews_balanced['sentiment']

# One stratified 80/20 split; the fixed random_state keeps it reproducible
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(X, y))

# split() yields positional row numbers, so X and y must both be sliced with .iloc.
# Plain y[train_index] looks rows up by index label instead, which only matches X
# when the frame's index happens to be 0..n-1 and otherwise raises KeyError or
# silently pairs features with the wrong labels.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [15]:
# Report the sizes of the train/test features and targets, one per line
print(
    f"Train set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    f"Train target shape: {y_train.shape}",
    f"Test target shape: {y_test.shape}",
    sep="\n",
)

Train set shape: (6820, 2)
Test set shape: (1706, 2)
Train target shape: (6820,)
Test target shape: (1706,)


In [23]:
# Imports for TF-IDF vectorization and SVC hyper-parameter tuning
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Turn the cleaned review text into TF-IDF features.
# The vocabulary and IDF weights are learned from the training text only;
# the test text is transformed with that already-fitted vectorizer.
# NOTE(review): the vectorizer is fitted on all of X_train before the 5-fold grid search
# further down, so every validation fold has already influenced the IDF weights —
# consider moving the vectorizer inside the Pipeline if unbiased CV scores matter.
vectorizer = TfidfVectorizer()
clean_train_transformed = vectorizer.fit_transform(X_train['clean_text'])
clean_test_transformed = vectorizer.transform(X_test['clean_text'])

In [24]:
# Append the helpfulness ratio as one extra feature column to the right of the TF-IDF features
from scipy.sparse import hstack

# Double brackets keep the column two-dimensional, shape (n_rows, 1), so it can be stacked
# next to the TF-IDF matrix. format='csr' is requested explicitly: without it hstack may
# return a COO matrix, which does not support row indexing, whereas CSR can be sliced
# directly when cross-validation splits the training rows.
X_train_transformed = hstack(
    (clean_train_transformed, X_train[['helpfullness']].to_numpy()),
    format='csr',
)
X_test_transformed = hstack(
    (clean_test_transformed, X_test[['helpfullness']].to_numpy()),
    format='csr',
)

In [25]:

# Grid search over SVC hyper-parameters.
# The classifier is the pipeline's only step, named 'clf', which is why every
# grid key carries the 'clf__' prefix.
pipline = Pipeline(steps=[('clf', SVC())])

# 5 values of C x 5 values of gamma x 3 kernels = 75 candidate settings
parameters = {
    'clf__C': [0.1, 1, 10, 100, 1000],
    'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'clf__kernel': ['rbf', 'poly', 'sigmoid'],
}

# 5-fold cross-validation on all available cores, with verbose progress output
grid_search = GridSearchCV(
    estimator=pipline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
