In [1]:
import pandas as pd

# Read the ratings CSV, keeping only the listed columns
file_path = "data/ratings_small.csv"
data = pd.read_csv(
    file_path,
    usecols=['userId', 'movieId', 'rating', 'timestamp'],
)

# Preview the first five rows to confirm the file loaded as expected
print(data.head(5))

   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205


In [None]:
# NOTE: Surprise could not be installed on the machine used to create the final Jupyter notebook, so the cells below could not be run to produce results.
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import KNNBasic, KNNWithZScore, SVD

In [None]:
# Define algorithms
# The same KNNBasic model is used twice: `user_based` switches between
# user-user similarities (True) and item-item similarities (False).
user_based_cf = KNNBasic(sim_options={'user_based': True})
item_based_cf = KNNBasic(sim_options={'user_based': False})
# Surprise's SVD uses bias terms by default. The unbiased variant
# (biased=False) is the one equivalent to Probabilistic Matrix Factorization,
# which is how `pmf` is reported in the results below.
pmf = SVD(biased=False)

In [None]:
# Perform 5-fold cross-validation and compute MAE, RMSE for each algorithm
def evaluate_algorithm(algo, data):
    """Cross-validate ``algo`` on ``data`` with 5 folds, measuring MAE and RMSE.

    Parameters
    ----------
    algo
        Algorithm instance handed to ``cross_validate``.
    data
        Either an object ``cross_validate`` accepts directly, or a pandas
        DataFrame with ``userId``, ``movieId`` and ``rating`` columns. A
        DataFrame is converted with ``Dataset.load_from_df`` first, because
        ``cross_validate`` cannot consume a raw DataFrame.

    Returns
    -------
    The value returned by ``cross_validate``; the rest of the notebook indexes
    it with ``'test_mae'`` and ``'test_rmse'``.
    """
    if isinstance(data, pd.DataFrame):
        # load_from_df expects exactly user, item, rating -- in that order.
        ratings = data[['userId', 'movieId', 'rating']]
        # Use the observed rating range instead of relying on Reader's default
        # scale, which may not match the ratings in the file.
        rating_scale = (float(ratings['rating'].min()), float(ratings['rating'].max()))
        data = Dataset.load_from_df(ratings, Reader(rating_scale=rating_scale))

    return cross_validate(algo, data, measures=['MAE', 'RMSE'], cv=5, verbose=True)

# Cross-validate the three models in turn: user-based KNN, item-based KNN, then pmf
user_based_results, item_based_results, pmf_results = [
    evaluate_algorithm(algo=model, data=data)
    for model in (user_based_cf, item_based_cf, pmf)
]

In [None]:
# Display average MAE and RMSE for each algorithm
def display_results(results, algorithm_name):
    """Print the mean MAE and mean RMSE of one cross-validation run.

    ``results`` is indexed with ``'test_mae'`` and ``'test_rmse'``; each entry
    must provide ``.mean()``. The report is the algorithm name followed by the
    two averages (four decimals) and a blank separator line.
    """
    mean_mae, mean_rmse = (results[key].mean() for key in ('test_mae', 'test_rmse'))
    report = (
        f'{algorithm_name}:',
        f'  Average MAE: {mean_mae:.4f}',
        f'  Average RMSE: {mean_rmse:.4f}',
        '',  # trailing blank line separating consecutive reports
    )
    print('\n'.join(report))

# Report each model's averages, in the order the models were evaluated
display_results(
    results=user_based_results,
    algorithm_name='User-Based Collaborative Filtering',
)
display_results(
    results=item_based_results,
    algorithm_name='Item-Based Collaborative Filtering',
)
display_results(
    results=pmf_results,
    algorithm_name='Probabilistic Matrix Factorization',
)

In [None]:
# Compare the average performances of the algorithms (lower error is better)
results_by_algorithm = {
    'User-Based Collaborative Filtering': user_based_results,
    'Item-Based Collaborative Filtering': item_based_results,
    'Probabilistic Matrix Factorization': pmf_results,
}
average_mae = {name: res['test_mae'].mean() for name, res in results_by_algorithm.items()}
average_rmse = {name: res['test_rmse'].mean() for name, res in results_by_algorithm.items()}

# Pick the algorithm with the smallest average error for each measure.
best_name_mae = min(average_mae, key=average_mae.get)
best_name_rmse = min(average_rmse, key=average_rmse.get)

# Kept as plain scores so anything that reads these names still gets a number.
best_algorithm_mae = average_mae[best_name_mae]
best_algorithm_rmse = average_rmse[best_name_rmse]

# Print the winner's name as well as its score; the score alone does not
# tell the reader which algorithm is best.
print(f'Best Algorithm (MAE): {best_name_mae} ({best_algorithm_mae:.4f})')
print(f'Best Algorithm (RMSE): {best_name_rmse} ({best_algorithm_rmse:.4f})')


In [None]:
# Examine the impact of similarity metrics on User-Based and Item-Based Collaborative Filtering
similarity_metrics = ['cosine', 'msd', 'pearson']

def evaluate_similarity_impact(algo, data, similarity_metric):
    """Cross-validate ``algo`` with its similarity measure set to ``similarity_metric``.

    Parameters
    ----------
    algo
        Algorithm instance exposing a ``sim_options`` dict.
    data
        Ratings data, passed unchanged to ``evaluate_algorithm``.
    similarity_metric : str
        Name of the similarity measure, e.g. one of ``similarity_metrics``.

    Returns
    -------
    The value returned by ``evaluate_algorithm`` for this configuration.
    """
    original_options = algo.sim_options
    # The similarity measure is selected by the 'name' entry of sim_options.
    # Work on a copy so the caller's options dict is never modified.
    algo.sim_options = {**original_options, 'name': similarity_metric}
    try:
        return evaluate_algorithm(algo, data)
    finally:
        # Restore the original options so this experiment does not change the
        # configuration used by later cells that reuse the same algorithm.
        algo.sim_options = original_options

# One cross-validation run per similarity metric: user-based first, then item-based
user_based_similarity_results = dict(
    (metric, evaluate_similarity_impact(algo=user_based_cf, data=data, similarity_metric=metric))
    for metric in similarity_metrics
)
item_based_similarity_results = dict(
    (metric, evaluate_similarity_impact(algo=item_based_cf, data=data, similarity_metric=metric))
    for metric in similarity_metrics
)

In [None]:
# Plot the results
import matplotlib.pyplot as plt

def plot_results(results, title):
    """Plot per-fold MAE and RMSE for every entry of ``results`` on one figure.

    Parameters
    ----------
    results : dict
        Maps a setting label (used in the legend) to a cross-validation result
        indexable by ``'test_mae'`` and ``'test_rmse'``, each a sequence with
        one value per fold.
    title : str
        Figure title.
    """
    # Use an explicit Figure/Axes so each call draws on its own fresh figure
    # instead of whichever pyplot figure happens to be current.
    fig, ax = plt.subplots()

    n_folds = 0
    for metric, result in results.items():
        mae, rmse = result['test_mae'], result['test_rmse']
        # Number folds from 1 so the x axis reads as "fold 1..n".
        ax.plot(range(1, len(mae) + 1), mae, label=f'MAE ({metric})')
        ax.plot(range(1, len(rmse) + 1), rmse, label=f'RMSE ({metric})')
        n_folds = max(n_folds, len(mae), len(rmse))

    # Folds are discrete, so show integer ticks only.
    ax.set_xticks(range(1, n_folds + 1))
    ax.set(title=title, xlabel='Fold', ylabel='Error')
    if results:
        # Skip the legend when nothing was plotted (matplotlib warns otherwise).
        ax.legend()
    plt.show()

# One figure per model family for the similarity-metric experiment
plot_results(
    results=user_based_similarity_results,
    title='User-Based Collaborative Filtering Similarity Impact',
)
plot_results(
    results=item_based_similarity_results,
    title='Item-Based Collaborative Filtering Similarity Impact',
)


In [None]:
# Examine the impact of the number of neighbors on User-Based and Item-Based Collaborative Filtering
neighbor_values = [5, 10, 15, 20, 25]

def evaluate_neighbors_impact(algo, data, neighbor_value):
    """Cross-validate ``algo`` using ``neighbor_value`` neighbors.

    Parameters
    ----------
    algo
        Algorithm instance exposing a ``k`` attribute (the neighbor count).
    data
        Ratings data, passed unchanged to ``evaluate_algorithm``.
    neighbor_value : int
        Number of neighbors to use for this run.

    Returns
    -------
    The value returned by ``evaluate_algorithm`` for this configuration.
    """
    # The neighbor count is the algorithm's own `k` attribute, not an entry of
    # sim_options; writing it into sim_options has no effect on the model.
    original_k = algo.k
    algo.k = neighbor_value
    try:
        return evaluate_algorithm(algo, data)
    finally:
        # Restore the previous k so later cells that reuse this algorithm are
        # not affected by this experiment.
        algo.k = original_k

# One cross-validation run per neighbor count: user-based first, then item-based
user_based_neighbors_results = dict(
    (value, evaluate_neighbors_impact(algo=user_based_cf, data=data, neighbor_value=value))
    for value in neighbor_values
)
item_based_neighbors_results = dict(
    (value, evaluate_neighbors_impact(algo=item_based_cf, data=data, neighbor_value=value))
    for value in neighbor_values
)

In [None]:
# Plot per-fold errors for every neighbor count, one figure per model family
plot_results(
    results=user_based_neighbors_results,
    title='User-Based Collaborative Filtering Neighbors Impact',
)
plot_results(
    results=item_based_neighbors_results,
    title='Item-Based Collaborative Filtering Neighbors Impact',
)

In [None]:
# Pick, for each model family, the neighbor count with the lowest mean RMSE across folds
best_k_user_based = min(
    user_based_neighbors_results.items(),
    key=lambda entry: entry[1]['test_rmse'].mean(),
)[0]
best_k_item_based = min(
    item_based_neighbors_results.items(),
    key=lambda entry: entry[1]['test_rmse'].mean(),
)[0]

print(f'Best number of neighbors for User-Based Collaborative Filtering: {best_k_user_based}')
print(f'Best number of neighbors for Item-Based Collaborative Filtering: {best_k_item_based}')