# Import Data


## Import and Install

In [1]:
%pip install pandas

import pandas as pd
import os
import urllib.request
import zipfile
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Get Data

In [13]:
DATA_FILE = 'ml-latest-small'
DATA_URL = f"https://files.grouplens.org/datasets/movielens/${DATA_FILE}.zip"
DATA_DIR = '../data'

data_path = os.path.join(DATA_DIR, DATA_FILE, ".zip")

if not os.path.exists(data_path):
    urllib.request.urlretrieve(DATA_URL, data_path)
    with zipfile.ZipFile(data_path, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

## Load Data

In [23]:
movies_path = os.path.join(DATA_DIR, DATA_FILE, 'movies.csv')
ratings_path = os.path.join(DATA_DIR, DATA_FILE, 'ratings.csv')
tags_path = os.path.join(DATA_DIR, DATA_FILE, 'tags.csv')
links_path = os.path.join(DATA_DIR, DATA_FILE, 'links.csv')

def load_data():
    """
    Lädt die Daten in DataFrames.

    Returns:
    movies_df : DataFrame
        DataFrame mit den Filmen.
    ratings_df : DataFrame
        DataFrame mit den Bewertungen.
    tags_df : DataFrame
        DataFrame mit den Tags.
    links_df : DataFrame
        DataFrame mit den Links.
    """
    movies_df = pd.read_csv(movies_path)
    ratings_df = pd.read_csv(ratings_path)
    tags_df = pd.read_csv(tags_path)
    links_df = pd.read_csv(links_path)
    
    return movies_df, ratings_df, tags_df, links_df




def load_data_as_dataset(df, reader):
    """
    Lädt die Daten in ein Surprise Dataset.

    Parameters:
    df : DataFrame
        DataFrame mit den Bewertungsdaten.
    reader : Reader
        Ein Reader-Objekt von Surprise.

    Returns:
    full_data : Trainset
        Das vollständige Trainset.
    train_set : list
        Die Trainingsdaten.
    test_set : list
        Die Testdaten.
    """
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
    full_data = data.build_full_trainset()
    train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
    return full_data, train_set, test_set

# Laden der Daten
movies_df, ratings_df, tags_df, links_df = load_data()
print("PandaFrames loaded successfully!")


PandaFrames loaded successfully!


# Data Exploration
What data exploration methods do we need?

# Train Models

## Content-Based
This is a function to train a content based recommendation model



In [4]:
def train_content_based_model(movies, ratings, tags, links) :
    print("Training content-based model")

## Collaborative Filtering - Neighborhood


In [5]:
def train_neighborhood_model(movies, ratings, tags, links) :
    print("Training neighborhood model")

## Collaborative Filtering - Matrix Factorization


In [19]:
def train_matrix_factorization_model(movies, ratings, tags, links):
    full_rating_dataset, train_rating_dataset, test_rating_dataset = load_data_as_dataset(ratings, reader = Reader(line_format="user item rating timestamp", sep=","))
    print("Dataset loaded successfully!")

    # Use SVD for item-based collaborative filtering
    svd_model = SVD()  # Set user_based to False for item-based collaborative filtering

    # Train the model on the training set
    svd_model.fit(train_rating_dataset)

    # Make predictions on the test set
    predictions = svd_model.test(test_rating_dataset)

    # Evaluate the model using RMSE
    accuracy = rmse(predictions)
    print(f"RMSE on the test set: {accuracy:.4f}")

    from collections import defaultdict
    def get_top_n(predictions, n=10):
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
                user_ratings.sort(key=lambda x: x[1], reverse=True)
                top_n[uid] = user_ratings[:n]
        return top_n

    top_n = get_top_n(predictions, n=10)
    for uid, user_ratings in top_n.items():
        print(uid, [iid for (iid, _) in user_ratings])
        print("Training matrix factorization model")

train_matrix_factorization_model(movies_df, ratings_df, tags_df, links_df)
    

RMSE: 0.8807
RMSE on the test set: 0.8807
140 [2542, 529, 48516, 1, 1234, 1242, 3578, 2067, 2947, 953]
Training matrix factorization model
603 [1172, 912, 296, 3147, 1246, 1089, 1193, 1248, 2858, 1213]
Training matrix factorization model
438 [1196, 34405, 4993, 1732, 5902, 4011, 1527, 6, 364, 4995]
Training matrix factorization model
433 [608, 1089, 296, 164179]
Training matrix factorization model
474 [3037, 899, 2324, 902, 924, 7438, 1089, 928, 1283, 3083]
Training matrix factorization model
304 [1198, 1035, 1704, 593, 1196, 356, 318, 2502, 1653, 457]
Training matrix factorization model
298 [3578, 1210, 1222, 48516, 260, 2858, 80489, 1230, 1, 1215]
Training matrix factorization model
131 [1213, 4226, 1228, 1193, 1288, 1617, 1200, 1136, 593, 293]
Training matrix factorization model
288 [1653, 7153, 1200, 908, 2028, 1704, 8874, 1198, 3147, 1199]
Training matrix factorization model
448 [1214, 1234, 48516, 1200, 296, 1228, 778, 1617, 1262, 1610]
Training matrix factorization model
284 [35

## Train all Models

In [24]:
def train_models(movies, ratings, tags, links) :
    train_content_based_model(movies, ratings, tags, links)
    train_neighborhood_model(movies, ratings, tags, links)
    train_matrix_factorization_model(movies, ratings, tags, links)

movies_df, ratings_df, tags_df, links_df = load_data()
train_models(movies=movies_df, ratings=ratings_df, tags=tags_df, links=links_df)

Training content-based model
Training neighborhood model
RMSE: 0.8811
RMSE on the test set: 0.8811
140 [6787, 914, 1234, 48516, 3030, 1610, 296, 529, 5995, 8961]
Training matrix factorization model
603 [2150, 912, 356, 1193, 1912, 29, 1221, 1246, 2858, 1267]
Training matrix factorization model
438 [1196, 5902, 8368, 3793, 6377, 34405, 899, 7147, 4448, 6539]
Training matrix factorization model
433 [608, 296, 1089, 164179]
Training matrix factorization model
474 [260, 6377, 1200, 1288, 1206, 337, 1172, 1178, 1228, 3037]
Training matrix factorization model
304 [318, 1704, 1196, 457, 593, 474, 1198, 356, 17, 112]
Training matrix factorization model
298 [5618, 260, 3578, 48516, 3996, 1275, 1285, 1210, 3481, 1222]
Training matrix factorization model
131 [1193, 1213, 1288, 1136, 593, 1200, 4226, 1228, 1617, 293]
Training matrix factorization model
288 [2571, 1200, 4226, 1653, 2959, 1704, 58559, 1198, 1199, 54997]
Training matrix factorization model
448 [1198, 923, 296, 48516, 1884, 1968, 1704

# Use Models

## Content-Based

In [8]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_content_based_recommendations(user, model) -> []:
    return []

## Collaborative Filtering - Neighborhood


In [9]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_neighborhood_recommendations(user, model) -> []:
    return []

## Collaborative Filtering - Matrix Factorization


In [10]:
# This function should return a list of recommended items with their scores
# [{'movieId': 1, 'score': 0.5}, {'movieId': 2, 'score': 0.4}, {'movieId': 3, 'score': 0.3}]
def make_matrix_factorization_recommendations(user, model) -> []:
    # add the user to the model
    # get the recommendations for the user
    return []

## Hybrid Recommendations

In [11]:
def make_recommendations(user, content_model, collab_model1, collab_model2) -> []:
    content_based_recommendations = make_content_based_recommendations(user, content_model)
    neighborhood_recommendations = make_neighborhood_recommendations(user, collab_model1)
    matrix_factorization_recommendations = make_matrix_factorization_recommendations(user, collab_model2)

    # Combine the recommendations from the three models
    return []

### Train the models to estimate a score for a recommendation

In [12]:
# user contains information about the user -> details tbd
# probably a list of movies the user has rated and the ratings given

# movie_list contains a list of movie ids
def recommendations_from_list(user, movie_list, content_model, collab_model1, collab_model2):
    scores = {}
    for movie in movie_list:
        try: 
            content_score = content_model.estimate(user, movie)
        except: 
            content_score = 0
        try:
            collab_score1 = collab_model1.estimate(user, movie)
        except: 
            collab_score1 = 0
        try:
            collab_score2 = collab_model2.estimate(user, movie)
        except: 
            collab_score2 = 0

        combined_score = (content_score + collab_score1 + collab_score2) / 3
        scores[movie] = combined_score
        # Create a combined score
        combined_score = (content_score + collab_score1 + collab_score2) / 3
        scores[movie] = combined_score
    # Sort the scores with best recommendations first
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_scores