# Helpfulness Prediction
## Data Science and Big Data Analytics Project

---

### Authors: 
- **Andrea Alberti** ([GitHub](https://github.com/AndreaAlberti07))
- **Davide Ligari** ([GitHub](https://github.com/DavideLigari01))
- **Cristian Andreoli** ([GitHub](https://github.com/CristianAndreoli94))

### Date: September 2023

---

## Data: 
The chosen dataset is [Amazon Books Reviews](https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews).


## Goal:
Build a model able to predict the helpfulness of a review based on its content. 

---

In [1]:
import pymongo as pm
import pyspark as ps
import pandas as pd
import numpy as np
import gensim
import sklearn as sk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestRegressor
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaalberti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. Data Loading

In [2]:
client = pm.MongoClient('mongodb://localhost:27017/')
spark_db = client['spark_db']
books_ratings = spark_db['books_rating']

### 2. Data Reshaping

In [None]:
pipeline_remove = {'$match':{
                        'review/score':{'$exists':True},
                        'N_helpful'	:{'$exists':True, '$ne':0},
                        'Tot_votes'	:{'$exists':True, '$ne':0}
                        }
    
                }

smoothing_param = 1

pipeline_project = {'$project':{
                            'review/text':1,
                            'helpfulness_score':{'$divide':[
                                                        {'$sum':['$N_helpful', smoothing_param]},
                                                        {'$sum': ['$Tot_votes', smoothing_param*2]}
                                                             ]
                                                 },
                            '_id':0,
                            }
                    }

mongo_dataset = books_ratings.aggregate([pipeline_remove, pipeline_project])
df_dataset = pd.DataFrame(list(mongo_dataset))
arr_dataset = np.array(df_dataset)

X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(arr_dataset[:,0], arr_dataset[:,1], test_size=0.2, random_state=42)

### 3. Features Extraction

In [None]:
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    tokens = gensim.utils.simple_preprocess(doc)
    return [token for token in tokens if token not in stop_words]

X_train_w2v = [preprocess(doc) for doc in X_train]

In [None]:
model = gensim.models.Word2Vec(X_train_w2v, vector_size=30, window=5, min_count=2)

def get_embedding(doc):
    embeddings = []
    words = preprocess(doc)
    for word in words:
        if word in model.wv:
            embeddings.append(model.wv[word])
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

X_train_embedding = [get_embedding(doc) for doc in X_train]
X_test_embedding = [get_embedding(doc) for doc in X_test]

In [None]:
np.savez('../model/train_data_wv2_30_5.npz',x = X_train_embedding, y = Y_train)
np.savez('../model/test_data_w2v_30_5.npz',x = X_test_embedding, y = Y_test)

### 4. Model Training

In [None]:
rand_forest = RandomForestRegressor(n_estimators=100, random_state=42)
rand_forest.fit(X_train_embedding, Y_train)
joblib.dump(rand_forest, '../model/rand_forest_model.gz', compress=('gzip', 3))

In [None]:
Y_pred_smoothing = rand_forest.predict(X_test_embedding)

In [None]:
def rmse(Y_test, Y_pred):
    return np.sqrt(sk.metrics.mean_squared_error(Y_test, Y_pred))

rmse_smoothing = rmse(Y_test, Y_pred_smoothing)
print('RMSE with smoothing: ', rmse_smoothing)

# Choose a dynamic threshold to determine whether a review is helpful or not

### 5. Model Evaluation

In [None]:
# Other approach: change the helpfulness before, and just fit a binary classification model.

# Other: use y as features and x as target, and fit a regression model.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def equation(c, x, y):
    return (x + c) / (y + 2 * c)

def find_xy_pairs(c, target_ratio, max_value=100):
    valid_pairs = []

    for x in range(1, max_value + 1):
        for y in range(1, max_value + 1):
            if (
                equation(c, x, y) >= target_ratio * 0.95
                and equation(c, x, y) <= target_ratio * 1.05
            ):
                valid_pairs.append((x, y))

    return valid_pairs

c = 5  # Replace with your desired constant value
target_ratio = 0.8  # Replace with your desired target ratio
max_value = 100  # You can adjust this maximum value as needed

valid_pairs = find_xy_pairs(c, target_ratio, max_value)

x_values = [x for x, _ in valid_pairs]
y_values = [y for _, y in valid_pairs]

plt.scatter(x_values, y_values, label=f"(x, y) for ~{target_ratio}", color='blue')
plt.xlabel("x")
plt.ylabel("y")
plt.title(f"Pairs of (x, y) for (x+c)/(y+2c) ~ {target_ratio} (c = {c})")
plt.grid(True)
plt.legend()
plt.show()



---

## Use Total votes as feature to predict helpful votes

In [3]:
pipeline_remove = {'$match':{
                        'review/score':{'$exists':True},
                        'N_helpful'	:{'$exists':True, '$ne':0},
                        'Tot_votes'	:{'$exists':True, '$ne':0}
                        }
    
                }

smoothing_param = 1

pipeline_project = {'$project':{
                            'review/text':1,
                            'Tot_votes':1,
                            'N_helpful':1,
                            '_id':0,
                            }
                    }

mongo_dataset = books_ratings.aggregate([pipeline_remove, pipeline_project])
df_dataset = pd.DataFrame(list(mongo_dataset))
arr_dataset = np.array(df_dataset)

X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(arr_dataset[:,0:2], arr_dataset[:,2], test_size=0.2, random_state=42)

In [4]:
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    tokens = gensim.utils.simple_preprocess(doc)
    return [token for token in tokens if token not in stop_words]

X_train_w2v = [preprocess(doc[0]) for doc in X_train[:]]

In [5]:
model = gensim.models.Word2Vec(X_train_w2v, vector_size=30, window=5, min_count=2)

def get_embedding(doc):
    embeddings = []
    words = preprocess(doc[0])
    for word in words:
        if word in model.wv:
            embeddings.append(np.append(model.wv[word], doc[1]))
    if len(embeddings) > 0:
         return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size+1)

X_train_embedding = [get_embedding(doc) for doc in X_train[:]]
X_test_embedding = [get_embedding(doc) for doc in X_test[:]]

In [6]:
np.savez('../model/train_data_wv2_totvotes_30_5.npz',x = X_train_embedding, y = Y_train)
np.savez('../model/test_data_w2v_totvotes_30_5.npz',x = X_test_embedding, y = Y_test)

In [7]:
rand_forest = RandomForestRegressor(n_estimators=100, random_state=42)
rand_forest.fit(X_train_embedding, Y_train)
joblib.dump(rand_forest, '../model/rand_forest_model_totvotes.gz', compress=('gzip', 3))

['../model/rand_forest_model_totvotes.gz']

In [13]:
Y_test_pred = rand_forest.predict(X_test_embedding)
Y_train_pred = rand_forest.predict(X_train_embedding)

In [14]:
def rmse(Y_test, Y_pred):
    return np.sqrt(sk.metrics.mean_squared_error(Y_test, Y_pred))

rmse_test = rmse(Y_test, Y_test_pred)
print('RMSE with smoothing: ', rmse_test)

rmse_train = rmse(Y_train, Y_train_pred)
print('RMSE with smoothing: ', rmse_train)

RMSE with smoothing:  37.83598524834085
RMSE with smoothing:  3.6201110196690367
