# Collaborative Filtering

This notebook summarizes the collaborative filtering methods that were applied.

Work packages:
- Build filter functions: random sampling
- Evaluation across different filter functions and individual algorithms
- RMSE / MAE plots?

In [27]:
%run functions.py

# Import packages
import numpy as np
import pandas as pd

from surprise import SVD, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, NormalPredictor
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV, ShuffleSplit

# Load dataset
# Both raw CSV exports are read via paths relative to the current working directory.
interactions = pd.read_csv('../data/RAW_interactions.csv', sep=',')
recipes = pd.read_csv('../data/RAW_recipes.csv', sep=',')

# Report the size of each table.
# NOTE(review): the printed figures are plain row counts; no de-duplication
# happens here even though the label says "Unique rows".
print('Unique rows in interactions: ', interactions.shape[0])
print('Unique rows in recipes: ', recipes.shape[0])

Unique rows in interactions:  1132367
Unique rows in recipes:  231637


## Filter Strategies

Filter ideas:
- Variance-based?
- Up-/downsampling?
- Random sampling (users & recipes): `sample_size_user` / `sample_size_recipe` given as absolute counts

In [5]:
# Keep only the user, recipe and rating columns; work on a copy so that
# `interactions` itself is left untouched.
ratings = interactions.loc[:, ['user_id', 'recipe_id', 'rating']].copy()

# Drop every row whose rating equals 0
ratings = ratings.loc[ratings['rating'].ne(0)]

# Dictionary of all filtering variants to evaluate: label -> filtered ratings.
# NOTE(review): filter_user_item is presumably provided by functions.py
# (loaded via %run); the meaning of (1, 1, "iqr", "iqr") must be confirmed there.
filters = dict(IQR=filter_user_item(ratings, 1, 1, "iqr", "iqr"))

Evaluate the filtered dataset:

In [6]:
def print_filter_results(ratings, interactions):
    """Print how much of the data is left after filtering.

    First the absolute number of ratings (rows), distinct users and distinct
    recipes in ``ratings`` is printed, then the same three quantities as a
    fraction of their counterparts in ``interactions``, rounded to two
    decimals. Every message is followed by an empty line.

    Args:
        ratings: Filtered frame with ``user_id`` and ``recipe_id`` columns.
        interactions: Reference frame with the same two columns; its sizes
            are the denominators of the printed fractions.

    Returns:
        None. The function only writes to standard output.
    """
    # Absolute sizes of the filtered data
    kept_ratings = len(ratings)
    print(f"Number of ratings that is left: {kept_ratings}\n")

    kept_users = ratings["user_id"].unique().size
    print(f"Number of users that is left: {kept_users}\n")

    kept_recipes = ratings["recipe_id"].unique().size
    print(f"Number of recipes that is left: {kept_recipes}\n")

    # Shares relative to the reference data
    share_ratings = round(kept_ratings / len(interactions), 2)
    print(f"Fraction of ratings that is left: {share_ratings}\n")

    share_users = round(kept_users / interactions["user_id"].unique().size, 2)
    print(f"Fraction of users that is left: {share_users}\n")

    share_recipes = round(kept_recipes / interactions["recipe_id"].unique().size, 2)
    print(f"Fraction of recipes that is left: {share_recipes}\n")

## Algorithms

In [39]:
# Prep grid search

# Hyperparameter grids, keyed by algorithm label.
# The two neighbourhood models share one grid shape and the two
# matrix-factorisation models share another; "Baseline" has nothing to tune.
# Each entry is built from its own fresh list/dict objects.
param_grid = {
    **{
        knn_label: {
            "k": [20, 40, 60],
            "sim_options": {
                "name": ['cosine'],
                "user_based": [True, False],
            },
        }
        for knn_label in ("KNNBasic", "KNNWithMeans")
    },
    **{
        mf_label: {"n_factors": [5, 15, 25, 50]}
        for mf_label in ("SVD", "NMF")
    },
    "Baseline": {},
}


# Algos
# Estimator classes to benchmark, keyed by the label used to look up their
# grid in param_grid. KNNBasic / KNNWithMeans have grids defined there but are
# not registered here, so they are not part of the benchmark.
algos = dict(SVD=SVD, NMF=NMF, Baseline=NormalPredictor)

## Evaluation

In [None]:
# TODO: Plot the performance (RMSE / MAE) of the best hyperparameter setting per algorithm

In [40]:
# Benchmark
# Runs a grid search per (filter, algorithm) pair on a 70 % training split and
# scores the refitted best model on the remaining 30 % hold-out split.
# `benchmark` collects one result row per pair; `fitted_algos` keeps the
# fitted grid-search objects.
benchmark = []
fitted_algos = {}

# Cross-validation used inside the grid search: 5 random splits, each holding
# out 30 % of the training data, with a fixed seed.
cv = ShuffleSplit(n_splits = 5, test_size = 0.3, random_state=42, shuffle=True)

# Evaluate the algorithms
for f_name, f in filters.items():
    # Apply the current filtering (each value in `filters` is used as an
    # already-filtered ratings frame). This rebinds the notebook-level
    # `ratings` variable.
    ratings = f
    # Print the key facts for the filtering
    print_filter_results(ratings, interactions)
    # Create train-test split for model evaluation: 70/30, seeded, stratified
    # on the rating value so both parts keep the same rating distribution.
    ratings_train, ratings_test = train_test_split(ratings, test_size=0.3, random_state=42, shuffle=True, stratify=ratings.rating)
    # Create the dataset objects required by the surprise package from the
    # train-test split.
    # NOTE(review): Reader() is used with its default settings; confirm that
    # its default rating scale matches the data.
    trainset = Dataset.load_from_df(ratings_train, Reader())
    # The hold-out frame is converted into the testset form passed to gs.test().
    testset = Dataset.load_from_df(ratings_test, Reader()).build_full_trainset().build_testset()
    # Run a grid search for each algorithm
    for name, algo in algos.items():
        # Optimise RMSE over the algorithm's grid; n_jobs=-1 requests all
        # available CPUs, refit=True makes gs usable for prediction afterwards.
        gs = GridSearchCV(algo, param_grid.get(name), measures=['rmse'], cv=cv, refit=True, n_jobs=-1)
        # Refit the estimator with the best parameters on whole training set
        gs.fit(trainset)
        # Get the final score on the hold-out split; accuracy.rmse also prints
        # the value (see the "RMSE: ..." lines in the cell output).
        predicitons = gs.test(testset)
        final_score = accuracy.rmse(predicitons)
        # Store the fitted estimator for later use
        # NOTE(review): the key is the algorithm name only, so with more than
        # one entry in `filters` a later filter overwrites the earlier one.
        fitted_algos[name] = gs
        # Store the results; gs.best_params is a dict keyed by measure name
        # (e.g. {'rmse': {...}}).
        benchmark.append([f_name, name, final_score, gs.best_params])

# Assemble the result table indexed by (Filter, Algorithm).
# NOTE(review): sort_index() below re-sorts the rows by index, so the
# preceding sort_values("Final_RMSE") does not determine the final row order.
results = pd.DataFrame(benchmark, columns=["Filter", "Algorithm", "Final_RMSE", "Params"]).sort_values("Final_RMSE")
results.set_index(["Filter", "Algorithm"], inplace=True)
results.sort_index(inplace=True)
results

Number of ratings that is left: 87623

Number of users that is left: 77098

Number of recipes that is left: 64500

Fraction of ratings that is left: 0.08

Fraction of users that is left: 0.34

Fraction of recipes that is left: 0.28

RMSE: 0.9860
RMSE: 1.0100
RMSE: 1.2591


Unnamed: 0_level_0,Unnamed: 1_level_0,Final_RMSE,Params
Filter,Algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1
IQR,SVD,0.985958,{'rmse': {'n_factors': 5}}
IQR,NMF,1.009973,{'rmse': {'n_factors': 50}}
IQR,Baseline,1.259059,{'rmse': {}}
