In [3]:
from surprise import SVD, BaselineOnly, NMF, SlopeOne, CoClustering 
from surprise import Dataset
from surprise import Reader
from sklearn.pipeline import Pipeline
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict
import pandas as pd
import gzip, pickle
import json
import numpy as np

## Data Selection Step:


1. **Read Data from GZ File**: In this step, we read the data from a GZ file line by line and store each line in a list.

2. **Convert List to DataFrame**: After reading the data, we convert the list into a dataframe. A dataframe is a tabular data structure commonly used in data analysis and manipulation.

3. **Column Selection**: Once we have the dataframe, we identify and drop the unwanted columns that are not required for further analysis. 

4. **Choose Relevant Data Fields**: From the remaining columns, we select three specific data fields: 'ReviewerID', 'asin', and 'overall'. These fields provide information about the reviewer's ID, the product's ASIN (Amazon Standard Identification Number), and the overall rating given by the reviewer.

5. **Drop Null Fields**: We identify and drop any null fields in the dataframe. Null fields are missing or undefined values that could affect the accuracy of our analysis.

6. **Reduce Dataset Size**: Since the dataset is extremely big, for ease of calcluation, a subset of 100,000 data samples were chosen the dataset for further analysis.

In [15]:
def load_data(datafile):
    data = []
    with gzip.open(datafile) as f:
        for l in f:
            data.append(json.loads(l.strip()))
    print('Finished readind data from the file...')
    print('Proceeding to convert the data into a dataframe')
    df = pd.DataFrame(data)
    return df

datafile = 'Automotive.json.gz'
# df = load_data(datafile)

# Dropping unwanted columns
# df.drop(['verified', 'reviewTime', 'style','reviewText', 'reviewerName', 'summary', 'unixReviewTime', 'vote', 'image'], axis=1, inplace=True)

# Dropping fields which are NULL
# df = df.dropna()

# Saving that dataset as a csv file for further use
# df.to_csv('./data/dataset.csv', index=False)

In [2]:
total_dataset = pd.read_csv('./data/dataset.csv')

total_ratings = total_dataset.shape[0]
print("Total Number of ratings: ", total_ratings)

total_unique_reviwer_id, total_reviewer_count = np.unique(total_dataset.reviewerID, return_counts = True)
print("Number of unique reviewers: ", len(total_unique_reviwer_id))


total_unique_product_id, total_product_count = np.unique(total_dataset.asin, return_counts = True)
print("Number of unique products: ", len(total_unique_product_id))

Total Number of ratings:  7990166
Number of unique reviewers:  3873247
Number of unique products:  925387


In [40]:
dataset = total_dataset.sample(n=100000) 

unique_reviwer_id, reviewer_count = np.unique(dataset.reviewerID, return_counts = True)
print("Number of unique reviewers: ", len(unique_reviwer_id))

unique_product_id, product_count = np.unique(dataset.asin, return_counts = True)
print("Number of unique products: ", len(unique_product_id))

dataset.to_csv('./data/final_dataset.csv', index=False)

Number of unique reviewers:  96805
Number of unique products:  61933


In [4]:
dataset = pd.read_csv('./data/final_dataset.csv')

## Data Preprocessing Step:

1. **Group Data by Review IDs**: In this step, we group the data by review IDs. This allows us to analyze and process the data at the reviewer level.

2. **Split Data into Train and Test Sets**: Once the data is grouped by review IDs, we split the total dataset into train and test subsets. The train set will be used to train our model, while the test set will be used to evaluate its performance.

3. **Create Train and Test Sets using surprise's Reader() Class**: To prepare the data for further analysis, we utilize surprise's Reader() class. This class helps parse through the dataframes and create train and test sets that can be used with various collaborative filtering algorithms.



In [5]:
def get_tain_and_test(dataset):
    data_to_sample = dataset.copy()
    grouped_data = data_to_sample.groupby(by='reviewerID')
    train_data = grouped_data.sample(frac=0.8)
    test_data = data_to_sample.drop(train_data.index)
    reader = Reader(rating_scale=(1,5))
    train_data = Dataset.load_from_df(train_data[['reviewerID','asin','overall']], reader = reader)
    test_data = Dataset.load_from_df(test_data[['reviewerID','asin','overall']], reader = reader)
    trainset = train_data.build_full_trainset()
    testset = test_data.build_full_trainset().build_testset()
    return trainset, testset, train_data, test_data

trainset, testset, train_data, test_data = get_tain_and_test(dataset)

## Model training and evaluation:

After preparing the datasets for training and testing, I proceeded to evaluate the performance of four recommendation algorithms: SVD, BaselineOnly, NMF, and CoClustering. The implementation of these algorithms was done using the Python package 'Surprise'.

The evaluation process involved the following steps:

1. **Initialization**: I initialized an instance of each algorithm using the Surprise package. This step sets up the algorithm with default or specified parameters.

2. **Training and Testing**: The initialized algorithms were trained using the training set, which contains user-item interactions. The models were then tested using the testing set, which consists of unseen user-item pairs. The algorithms utilized the training data to learn patterns and make predictions for the testing data.

3. **Performance Metrics**: To assess the accuracy of the models, I calculated the Root Mean Squared Error (RMSE) and Mean Squared Error (MSE) scores on the predictions obtained from the test set. These metrics provide insights into the quality of the recommendations made by the models.

4. **Top 10 Recommendations**: After obtaining the predictions, I selected the top 10 recommendations for each user from each algorithm's output. These recommendations represent the most highly recommended items for each user based on the trained models.

5. **Recommendation Evaluation**: Additionally, I evaluated the quality of the recommendation predictions for each algorithm. Specifically, I calculated precision, recall, F-measure, and conversion rate for the top 10 recommendations generated by each algorithm for each item in the test set. The code for precision and recall calculations was adapted from the Surprise package's documentation, which can be found at [this source](https://surprise.readthedocs.io/en/stable/FAQ.html).

The models were trained using the Surprise package, which provides a convenient framework for building and evaluating recommendation systems. The use of existing code for precision and recall calculations from the Surprise documentation ensures accurate and standardized evaluation of the recommendation algorithms.



In [8]:
def get_model(algo_name):
    if algo_name == 'Baseline':
        return BaselineOnly({'method': 'sgd', 'n_epochs': 10},verbose=True)
    elif algo_name == 'SVD':
        return SVD()
    elif algo_name == 'Co-Clustering':
        return CoClustering()
    elif algo_name == 'NMF':
        return NMF()
    else:
        print("Invalid algorithm!")

def train_model(model_name, train_set, test_set):
    print()
    print('------------------------------------------------------------------------')
    print('Current model:', model_name)
    model = get_model(model_name)
    print('Fitting model...')
    model.fit(train_set)
    print('Completed model training... ')
    print()
    print('Obtaining predictions on TEST data... ')
    test_predictions = model.test(test_set)
    test_rmse = accuracy.rmse(test_predictions, verbose = True)
    test_mse = accuracy.mse(test_predictions, verbose = True)
    print('test RMSE: ' + str(test_rmse) + ' .... Test MSE: ' + str(test_mse))
    print('------------------------------------------------------------------------')
    print()
    return model, test_predictions

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    precisions = dict()
    recalls = dict()
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    f_measure = (2*recall*precision)/(recall+precision)
    c = len([prec for prec in precisions.values() if prec>0])
    conversionRate = c/len(precisions)
    return precision, recall, f_measure, conversionRate

def get_top_k(predictions, k, model_name):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:k]
    File = open('./results/' + str(model_name) + '.txt', "w")
    for uid, user_ratings in top_n.items():
        for i, (iid, est) in enumerate(user_ratings):
            File.write(uid + ' ' + iid + "\n")
    File.close()
    return top_n

In [9]:
baseline_model, baseline_test_predictions = train_model('Baseline', trainset, testset)
pickle.dump( baseline_model, open( "./results/baseline_model.p", "wb" ) )
pickle.dump( baseline_test_predictions, open( "./results/baseline_test_predictions.p", "wb" ) )

precision_baseline, recall_baseline, f_measure_baseline, conversionRate_baseline = precision_recall_at_k(baseline_test_predictions, k=10, threshold=3)

print('Precision:', precision_baseline)
print('Recall:', recall_baseline)
print('F-Measure:', f_measure_baseline)
print('Conversion Rate:', conversionRate_baseline)

baseline_top_recs = get_top_k(baseline_test_predictions, 10, 'baseline_recs')


------------------------------------------------------------------------
Current model: Baseline
Fitting model...
Estimating biases using sgd...
Completed model training... 

Obtaining predictions on TEST data... 
RMSE: 0.9740
MSE: 0.9487
test RMSE: 0.9739877430272629 .... Test MSE: 0.9486521235673414
------------------------------------------------------------------------

Precision: 0.9458333333333333
Recall: 1.0
F-Measure: 0.9721627408993576
Conversion Rate: 0.9458333333333333


In [10]:
svd_model, svd_test_predictions = train_model('SVD', trainset, testset)
pickle.dump( svd_model, open( "./results/svd_model.p", "wb" ) )
pickle.dump( svd_test_predictions, open( "./results/svd_test_predictions.p", "wb" ) )

svd_precision, svd_recall, svd_f_measure, svd_conversionRate = precision_recall_at_k(svd_test_predictions, k=10, threshold=3)

print('Precision:', svd_precision)
print('Recall:', svd_recall)
print('F-Measure:', svd_f_measure)
print('Conversion Rate:', svd_conversionRate)

svd_top_recs = get_top_k(svd_test_predictions, 10, 'svd_recs')


------------------------------------------------------------------------
Current model: SVD
Fitting model...
Completed model training... 

Obtaining predictions on TEST data... 
RMSE: 0.9669
MSE: 0.9348
test RMSE: 0.9668528793004127 .... Test MSE: 0.9348044902114985
------------------------------------------------------------------------

Precision: 0.9458333333333333
Recall: 1.0
F-Measure: 0.9721627408993576
Conversion Rate: 0.9458333333333333


In [11]:
co_clustering_model, co_clustering_test_predictions = train_model('Co-Clustering', trainset, testset)
pickle.dump( co_clustering_model, open( "./results/co_clustering_model.p", "wb" ) )
pickle.dump( co_clustering_model, open( "./results/co_clustering_test_predictions.p", "wb" ) )

co_clustering_precision, co_clustering_recall, co_clustering_f_measure, co_clustering_conversionRate = precision_recall_at_k(co_clustering_test_predictions, k=10, threshold=3)

print('Precision:', co_clustering_precision)
print('Recall:', co_clustering_recall)
print('F-Measure:', co_clustering_f_measure)
print('Conversion Rate:', co_clustering_conversionRate)

co_clustering_top_recs = get_top_k(co_clustering_test_predictions, 10, 'co_clustering_recs')


------------------------------------------------------------------------
Current model: Co-Clustering
Fitting model...
Completed model training... 

Obtaining predictions on TEST data... 
RMSE: 1.2828
MSE: 1.6456
test RMSE: 1.2827916545823057 .... Test MSE: 1.6455544290660096
------------------------------------------------------------------------

Precision: 0.9583333333333334
Recall: 0.8958333333333334
F-Measure: 0.9260299625468166
Conversion Rate: 0.9583333333333334


In [12]:
nmf_model, nmf_test_predictions = train_model('NMF', trainset, testset)
pickle.dump( nmf_model, open( "./results/nmf_model.p", "wb" ) )
pickle.dump( nmf_test_predictions, open( "./results/nmf_test_predictions.p", "wb" ) )

nmf_precision, nmf_recall, nmf_f_measure, nmf_conversionRate = precision_recall_at_k(nmf_test_predictions, k=10, threshold=4)

print('Precision:', nmf_precision)
print('Recall:', nmf_recall)
print('F-Measure:', nmf_f_measure)
print('Conversion Rate:', nmf_conversionRate)

nmf_top_recs = get_top_k(nmf_test_predictions, 10, 'nmf_recs')


------------------------------------------------------------------------
Current model: NMF
Fitting model...
Completed model training... 

Obtaining predictions on TEST data... 
RMSE: 1.2740
MSE: 1.6231
test RMSE: 1.2740038591371974 .... Test MSE: 1.6230858330964721
------------------------------------------------------------------------

Precision: 0.8958333333333334
Recall: 0.6916666666666667
F-Measure: 0.780621172353456
Conversion Rate: 0.8958333333333334
