# Collaborative Filtering

This notebook summarizes the collaborative filtering methods that were evaluated.

Work packages:
- Build filter functions: random sampling
- Evaluation across different filter functions and individual algorithms
- RMSE / MAE plots?

In [37]:
%run functions.py

# Import packages
import numpy as np
import pandas as pd

from surprise import SVD, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate

# Read the raw interaction and recipe tables from the data directory.
interactions, recipes = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../data/RAW_interactions.csv', '../data/RAW_recipes.csv')
)

# Report the number of rows per table (a plain row count; nothing is
# deduplicated here).
print('Unique rows in interactions: ', interactions.shape[0])
print('Unique rows in recipes: ', recipes.shape[0])

Unique rows in interactions:  1132367
Unique rows in recipes:  231637


## Filter Strategies

Filter ideas:
- Variance-based?
- Up-/downsampling?
- Random sampling (users & recipes), with sample_size_user / sample_size_recipe given as absolute counts

In [38]:
# Keep only the three columns needed for rating prediction and drop every row
# whose rating equals 0, in a single selection. The .copy() detaches the
# result from `interactions`.
ratings = interactions.loc[
    interactions['rating'] != 0,
    ['user_id', 'recipe_id', 'rating'],
].copy()

# Apply the user/item filter.
# NOTE(review): filter_user_item is not defined in this notebook; presumably
# it comes from functions.py (loaded via %run). The meaning of the arguments
# (1, 1, "iqr", "iqr") is not visible here - confirm against functions.py.
ratings = filter_user_item(ratings, 1, 1, "iqr", "iqr")

Evaluate the filtered dataset:

In [43]:
# Sizes of the filtered dataset, computed once and reused below.
n_ratings = len(ratings)
n_users = len(ratings['user_id'].unique())
n_recipes = len(ratings['recipe_id'].unique())

# Matching sizes of the raw interactions table (the reference for the fractions).
n_ratings_raw = len(interactions)
n_users_raw = len(interactions['user_id'].unique())
n_recipes_raw = len(interactions['recipe_id'].unique())

# Absolute counts that survived the filtering
print(f"Number of ratings that is left: {n_ratings}\n")
print(f"Number of users that is left: {n_users}\n")
print(f"Number of recipes that is left: {n_recipes}\n")

# Shares relative to the raw interactions, rounded to two decimals
print(f"Fraction of ratings that is left: {round(n_ratings / n_ratings_raw, 2)}\n")
print(f"Fraction of users that is left: {round(n_users / n_users_raw, 2)}\n")
print(f"Fraction of recipes that is left: {round(n_recipes / n_recipes_raw, 2)}\n")

Number of ratings: 87623

Number of users: 77098

Number of recipes: 64500

Fraction of ratings that is left: 0.08

Fraction of users that is left: 0.34

Fraction of recipes that is left: 0.28



## Algorithms

In [None]:
# Prep grid search: hyper-parameter grids, algorithm classes and hold-out split.

# SlopeOne and NormalPredictor are not imported at the top of the notebook.
from surprise import NormalPredictor, SlopeOne
# Use Surprise's own splitter: it takes a surprise Dataset and returns a
# (Trainset, testset) pair. This deliberately shadows the scikit-learn
# train_test_split imported at the top, which cannot split a surprise Dataset.
from surprise.model_selection import train_test_split

# Params: one grid per algorithm name (keys match `algos` below). Algorithms
# without an entry have no grid defined yet. The commented-out entries are
# grids that are planned but not active.
param_grid = {
    # "KNNBasic": {"k": [20, 40, 60],
    #              "sim_options": {"name": ['cosine'],
    #                              "user_based": [True, False]}},
    # "KNNWithMeans": {"k": [20, 40, 60],
    #                  "sim_options": {"name": ['cosine'],
    #                                  "user_based": [True, False]}},
    "SVD": {"n_factors": [5, 15, 25, 50]},
    # "NMF": {"n_factors": [5, 15, 25, 50]},
    # "SlopeOne": {},
    # "Baseline": {},
}

# Algos: display name -> surprise algorithm class
algos = {"SVD": SVD, "NMF": NMF, "SlopeOne": SlopeOne, "Baseline": NormalPredictor}

# Wrap the filtered ratings in a surprise Dataset. load_from_df expects exactly
# three columns in the order user, item, rating. The rating scale is taken from
# the observed minimum and maximum of the filtered ratings.
# NOTE(review): assumes filter_user_item keeps the 'rating' column - confirm.
data = Dataset.load_from_df(
    ratings[["user_id", "recipe_id", "rating"]],
    Reader(rating_scale=(ratings["rating"].min(), ratings["rating"].max())),
)

# Hold out 25 % of the ratings for the final evaluation (seeded for reproducibility).
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

# for each Filter
#     for each Algo

## Evaluation

In [None]:
# TODO: Plot performance of best hyperparameter setting per algorithm
# (RMSE / MAE)

In [None]:
# Benchmark: tune every algorithm with a cross-validated grid search on the
# training ratings, then score the refitted best model on the held-out testset.

# GridSearchCV and ShuffleSplit are not imported at the top of the notebook.
from surprise.model_selection import GridSearchCV, ShuffleSplit

# GridSearchCV.fit() expects a surprise Dataset (it builds the CV folds from
# the raw ratings), not a Trainset. Rebuild a Dataset that contains only the
# training ratings; build_testset() yields (raw user id, raw item id, rating).
train_ratings = pd.DataFrame(
    trainset.build_testset(), columns=["user_id", "recipe_id", "rating"]
)
train_data = Dataset.load_from_df(
    train_ratings, Reader(rating_scale=trainset.rating_scale)
)

benchmark = []

# Cross-validation: 5 random 70/30 splits of the training ratings
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42, shuffle=True)

# Evaluate the algorithms
for name, algo in algos.items():
    # Algorithms without a grid entry are run once with their default
    # parameters ({} yields a single empty parameter combination).
    gs = GridSearchCV(
        algo,
        param_grid.get(name, {}),
        measures=["rmse"],
        cv=cv,
        refit=True,  # refit the best setting on all of train_data so gs.test() works
        n_jobs=-1,
    )
    gs.fit(train_data)
    predictions = gs.test(testset)
    final_score = accuracy.rmse(predictions)
    # best_params is keyed by measure name; store the parameter dict itself.
    benchmark.append([name, final_score, gs.best_params["rmse"]])

results = (
    pd.DataFrame(benchmark, columns=["Algorithm", "Final_RMSE", "Params"])
    .sort_values("Final_RMSE")
    .set_index("Algorithm")
)
results