# Gruppenprojekt - Big Data Analysis
Dieses Notebook beschreibt unser Programm und bietet einen Überblick über unser Projekt mit den *FlixNet*‑Daten.

**INFO**: Dieses Projekt wurde in einem [GitHub-Repository](https://github.com/01Niklas/big-data-gruppenprojekt) entwickelt und im Nachgang in dieses Notebook übertragen.



## 🔧 Libraries & Setup

In [None]:
%pip install loguru # install our logging library
%pip install optuna # install the library we used to optimize hyperparameters for deep learning recommender



In [None]:
from abc import abstractmethod
from datetime import datetime
from typing import List
from typing import Optional, Literal, Dict, Any

import numpy as np
import optuna
import pandas as pd
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from loguru import logger
from pydantic import BaseModel
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

## Recommender deklarieren
(Base-Recommender, Collaborative Filtering, Content-Based und Deep-Learning-Recommender)

In [None]:
class Recommender:
    """Common base class for the recommenders in this notebook.

    It stores the parameters of the most recent prediction request (user,
    item, ``k``, similarity metric and averaging variant) on the instance so
    that subclasses can read them from ``self``.

    Note: the class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are documentation only and are not
    enforced at instantiation time.
    """

    def __init__(self):
        self.k = 3  # default
        self.user_id = None
        self.item_id = None
        self.similarity: Literal["cosine", "pearson"] = "cosine"  # default
        self.calculation_variant: Literal["weighted", "unweighted"] = "weighted"  # default
        self.data = None
        # per-row mean of ``self.data``; only filled for pearson requests
        self.row_means = None

    @abstractmethod
    def _preprocess_data(self):
        """Bring the raw input data into the shape the recommender needs."""
        ...

    def _prepare_information(self, user_id: str, item_id: str, k: int, similarity: Literal["cosine", "pearson"] = "cosine", calculation_variant: Literal["weighted", "unweighted"] = "weighted") -> None:
        """Remember the parameters of the current prediction request.

        Args:
            user_id: ID of the user the prediction is made for.
            item_id: ID of the item whose rating is predicted.
            k: Number of neighbours to use.
            similarity: Similarity metric requested by the caller.
            calculation_variant: ``"weighted"`` or ``"unweighted"`` averaging.
        """
        self.user_id = user_id
        self.item_id = item_id
        self.similarity = similarity
        self.calculation_variant = calculation_variant
        self.k = k

        if similarity == 'pearson' and self.data is not None:
            # Keep the row means next to the data instead of writing them
            # into it: an extra 'mean' column would be treated as a regular
            # item/user by the similarity search and would be averaged again
            # on every further pearson call.
            self.row_means = self.data.mean(axis=1)

    @abstractmethod
    def predict(
            self,
            user_id: str,
            item_id: str,
            similarity: Optional[Literal['cosine', 'pearson']] = 'cosine',   # only for collaborative filtering
            calculation_variety: Optional[Literal['weighted', 'unweighted']] = 'weighted',  # only for collaborative filtering
            k: Optional[int] = 3,
            second_k_value: Optional[int] = None):
        """Predict the rating ``user_id`` would give ``item_id``."""
        ...

In [None]:
class CollaborativeFilteringRecommender(Recommender):
    """Neighbourhood-based collaborative filtering on a rating matrix.

    ``data`` must contain a ``user_ID`` column; every other column is treated
    as one item holding the users' ratings. A rating of ``0`` (or NaN) is
    treated as "not rated". In ``'user'`` mode the neighbours are users that
    rated the requested item, in ``'item'`` mode they are items the requested
    user has rated.
    """

    def __init__(self, data: pd.DataFrame, mode: Literal['user', 'item'] = 'user', display_results_for_each_step: Optional[bool] = False) -> None:
        """Store the rating matrix and prepare it for the chosen mode.

        Args:
            data: Rating matrix with a ``user_ID`` column.
            mode: ``'user'`` for user-based, ``'item'`` for item-based filtering.
            display_results_for_each_step: Print an explanation after every
                prediction when true.
        """
        super().__init__()
        self.display_results_for_each_step = display_results_for_each_step
        self.original_data = data
        self.mode = mode
        self._preprocess_data()

    def _preprocess_data(self) -> None:
        """Index the matrix by (string) user ID and transpose it for item mode."""
        self.original_data = self.original_data.set_index("user_ID")
        # convert the index to string (int IDs caused lookup errors)
        self.original_data.index = self.original_data.index.astype(str)
        if self.mode == 'item':
            self.data = self.original_data.T  # transpose for item based
        else:
            self.data = self.original_data  # original for user based

    def _calculate_distance_and_indices(self, dataframe: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
        """Find the (up to) ``k`` nearest rows to the requested user/item.

        Args:
            dataframe: Candidate rows including the row of the requested
                user (user mode) or item (item mode).

        Returns:
            ``(distances, positions)`` of the neighbours, nearest first.
            ``positions`` are positional row indices into ``dataframe``. Fewer
            than ``k`` entries are returned when there are not enough rows.
        """
        target_id = self.item_id if self.mode == 'item' else self.user_id
        is_target = np.asarray(dataframe.index == target_id)
        target_position = int(np.flatnonzero(is_target)[0])

        # Ask for enough neighbours to still have k left after the target
        # itself is removed, but never more than there are rows (sklearn
        # raises otherwise).
        n_neighbors = min(self.k + int(is_target.sum()), len(dataframe))

        knn = NearestNeighbors(metric="cosine", algorithm='brute')
        knn.fit(dataframe.values)
        distances, indices = knn.kneighbors(dataframe.values[[target_position]], n_neighbors=n_neighbors)
        distances, indices = distances[0], indices[0]

        # Drop the target explicitly instead of assuming it is the first hit;
        # with ties (identical rows) it is not guaranteed to come first.
        keep = ~is_target[indices]
        return distances[keep][:self.k], indices[keep][:self.k]

    def _calculate_similarities(self, similar_distances: np.ndarray) -> np.ndarray:
        """Map cosine distances to similarities rescaled from [-1, 1] to [0, 1]."""
        cosine_similarity = 1.0 - np.asarray(similar_distances, dtype=float)
        return (cosine_similarity + 1.0) / 2.0

    def _calculate_result(self, similarity: np.ndarray, ratings: np.ndarray) -> float:
        """Combine the neighbours' ratings into one prediction.

        Returns ``0.0`` when there are no neighbours. The weighted variant
        falls back to the plain mean when all weights are zero.
        """
        if len(ratings) == 0:
            return 0.0
        if self.calculation_variant == "weighted":
            weight_sum = similarity.sum()
            if weight_sum > 0:
                return float(np.dot(ratings, similarity) / weight_sum)
        return float(np.mean(ratings))

    def _check_values(self) -> None:
        """Raise ``ValueError`` if the requested user or item is unknown."""
        if self.mode == 'user':
            if self.user_id not in self.data.index:
                raise ValueError(f"User {self.user_id} nicht in Daten.")
            if self.item_id not in self.data.columns:
                raise ValueError(f"Item {self.item_id} nicht in Daten.")
        elif self.mode == 'item':
            if self.user_id not in self.original_data.index:
                raise ValueError(
                    f"User {self.user_id} nicht in Originaldaten.")
            if self.item_id not in self.data.index:
                raise ValueError(
                    f"Item {self.item_id} nicht in transponierten Daten.")

    def _process_item_based(self) -> pd.DataFrame:
        """Return the rows of all items the user rated plus the requested item.

        Raises:
            ValueError: If the user has not rated any item.
        """
        user_ratings = self.original_data.loc[self.user_id]

        # only items the user already rated can serve as neighbours
        rated_items = user_ratings[user_ratings > 0.0].index.tolist()

        if not rated_items:
            raise ValueError(f"User {self.user_id} hat keine Items bewertet!")

        # add the requested item only once; a duplicated row would make the
        # positional lookup of the target ambiguous
        if self.item_id not in rated_items:
            rated_items.append(self.item_id)

        return self.data.loc[rated_items]

    def _process_user_based(self) -> pd.DataFrame:
        """Return the rows of all users that rated the item plus the requested user."""
        # only users that already rated the item can serve as neighbours
        relevant_df = self.data[self.data[self.item_id] > 0.0]

        # the requested user is normally filtered out above (no rating yet);
        # add the row back, but only if it is really missing
        if self.user_id in relevant_df.index:
            return relevant_df
        return pd.concat([relevant_df, self.data.loc[[self.user_id]]])

    def _normalize_for_pearson(self, relevant_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``relevant_df`` with every row centred on its mean."""
        mean_values = relevant_df.mean(axis=1).to_numpy()
        return relevant_df.sub(mean_values, axis=0)

    def predict(
            self,
            user_id: str,
            item_id: str,
            similarity: Literal['cosine', 'pearson'] = 'cosine',
            calculation_variety: Literal['weighted', 'unweighted'] = 'weighted',
            k: Optional[int] = 3,
            second_k_value: Optional[int] = None) -> float:
        """Predict the rating of ``user_id`` for ``item_id``.

        Args:
            user_id: User to predict for.
            item_id: Item to predict.
            similarity: ``'cosine'`` or ``'pearson'`` (cosine on mean-centred rows).
            calculation_variety: Similarity-weighted or plain mean of the
                neighbours' ratings.
            k: Number of neighbours.
            second_k_value: Unused here; present for interface compatibility.

        Returns:
            The predicted rating, or ``0.0`` when no neighbour is available.

        Raises:
            ValueError: If the user or item is unknown, or (item mode) the
                user has not rated anything.
        """
        self._prepare_information(user_id=user_id, item_id=item_id, similarity=similarity, calculation_variant=calculation_variety, k=k)
        self._check_values()

        if self.mode == 'item':
            relevant_df = self._process_item_based()
        else:
            relevant_df = self._process_user_based()

        # The neighbour search runs on the (optionally mean-centred) rows,
        # the ratings are always read from the raw values.
        if similarity == 'pearson':
            feature_df = self._normalize_for_pearson(relevant_df)
        else:
            feature_df = relevant_df

        # make sure that there are no NaN values -> set NaN to 0.0
        feature_df = feature_df.fillna(0.0)
        relevant_df = relevant_df.fillna(0.0)
        similar_distances, similar_indices = self._calculate_distance_and_indices(dataframe=feature_df)

        rating_column = self.user_id if self.mode == 'item' else self.item_id
        ratings = relevant_df.iloc[similar_indices][rating_column].to_numpy()

        similarities = self._calculate_similarities(similar_distances)
        result = self._calculate_result(similarities, ratings)

        if self.display_results_for_each_step:
            self.explain(similar_indices, relevant_df, ratings, similarities, result)

        return result

    def explain(self, similar_indices, relevant_df, ratings, similarity, result) -> None:
        """Print the neighbours, their ratings and similarities for one prediction."""
        print("-" * 50)
        print(f"<mode: {self.mode}>")
        print(f"({self.calculation_variant}) Mittelwert: {result:.4f}")
        print(f"Metrik: {self.similarity}")
        print()
        print(f"k ({self.k}) ähnlichsten {'Items' if self.mode == 'item' else 'Nutzer'}:")
        df = pd.DataFrame({
            "ID": relevant_df.index[similar_indices],
            "rating": ratings,
            "similarity": similarity
        }).reset_index(drop=True)
        print(df.to_string(index=True, header=True))
        print("-" * 50)

In [None]:
class ContentBasedRecommender(Recommender):
    """Content-based kNN recommender built on an item feature matrix.

    Items are described by TF-IDF vectors of title and genres, one-hot
    encoded language and runtime bucket, and scaled log budget/revenue. A
    rating is predicted as the mean rating the user gave to the ``k`` most
    similar items they have rated.

    Note: ``item_profile`` and ``user_ratings`` are modified in place (ID
    columns are cast to ``str``, helper columns are added and insignificant
    numeric features are dropped from ``item_profile``).
    """

    def __init__(self, item_profile: pd.DataFrame, user_ratings: pd.DataFrame) -> None:
        """Prepare the data, select numeric features and build the feature matrix.

        Args:
            item_profile: One row per item; needs ``item_ID``, ``title`` and
                ``original_language`` columns, optionally ``Genre_*``,
                ``budget``, ``revenue`` and ``runtime``.
            user_ratings: Long-format ratings with ``user_ID``, ``item_ID``
                and ``rating`` columns.
        """
        super().__init__()
        self.item_profile = item_profile
        self.user_ratings = user_ratings
        self.feature_matrix = None
        self._preprocess_data()

        # check if the features "budget", "revenue", "runtime" are relevant for the item/rating correlation
        self._check_features_correlation(features=["budget", "revenue", "runtime"])
        self._calculate_tfidf_matrix()

    def _preprocess_data(self):
        """Cast all ID columns to ``str`` so lookups compare like with like."""
        self.item_profile["item_ID"] = self.item_profile["item_ID"].astype(str)
        self.user_ratings["item_ID"] = self.user_ratings["item_ID"].astype(str)
        self.user_ratings["user_ID"] = self.user_ratings["user_ID"].astype(str)

    def _check_features_correlation(self, features: List[str]) -> None:
        """Drop numeric item features that do not correlate with the ratings.

        A feature is kept only if its Pearson correlation with the rating has
        an absolute value of at least 0.1 and a p-value of at most 0.05.
        Features for which no correlation can be computed (fewer than two
        samples or constant values) are dropped as well.

        Args:
            features: Candidate column names in ``item_profile``.
        """
        candidates = [feature for feature in features if feature in self.item_profile.columns]
        if not candidates:
            return

        # combine item and user ratings (once; the merge does not depend on the feature)
        merged_data = pd.merge(self.user_ratings, self.item_profile, on="item_ID")

        # Coerce first and fill afterwards: values that cannot be parsed turn
        # into NaN during coercion and must not reach pearsonr.
        rating_data = pd.to_numeric(merged_data["rating"], errors="coerce").fillna(0)

        irrelevant_features = []  # features that will be removed
        for feature in candidates:
            feature_data = pd.to_numeric(merged_data[feature], errors="coerce").fillna(0)

            if len(merged_data) < 2 or feature_data.nunique() < 2 or rating_data.nunique() < 2:
                # correlation is undefined here
                correlation, p_value = float("nan"), float("nan")
            else:
                correlation, p_value = stats.pearsonr(feature_data, rating_data)

            # NaN compares false, so an undefined correlation is never significant
            is_significant = abs(correlation) >= 0.1 and p_value <= 0.05
            if is_significant:
                logger.debug(f"Feature '{feature}' has a significant correlation: {correlation}")
            else:
                logger.debug(f"Feature '{feature}' does not have a sigificant correlation and will be ignored.")
                irrelevant_features.append(feature)

        self.item_profile.drop(columns=irrelevant_features, inplace=True)

    def _safe_get_feature(self, feature_name):
        """Return the ``item_profile`` column ``feature_name`` or ``None`` if absent."""
        if feature_name in self.item_profile.columns:
            return self.item_profile[feature_name]
        return None

    def _calculate_tfidf_matrix(self) -> None:
        """Build the sparse item feature matrix and store it in ``self.feature_matrix``."""
        # a missing title is treated as an empty string
        self.item_profile["title"] = self.item_profile["title"].fillna("")

        # use the TfidfVectorizer() to transform the title into numerical features
        title_vectorizer = TfidfVectorizer()
        title_features = title_vectorizer.fit_transform(self.item_profile["title"])

        # turn the genre flag columns into text by keeping the part after "Genre_"
        genre_cols = [col for col in self.item_profile.columns if col.startswith("Genre_")]
        if genre_cols:
            self.item_profile["genre_text"] = self.item_profile[genre_cols].astype(int).apply(
                lambda row: " ".join([col.replace("Genre_", "") for col, val in row.items() if val == 1]), axis=1
            )
            genre_vectorizer = TfidfVectorizer()
            genre_features = genre_vectorizer.fit_transform(self.item_profile["genre_text"])
        else:
            genre_features = np.empty((len(self.item_profile), 0))

        # the language of the items as one-hot encoded dummies
        language_dummies = pd.get_dummies(self.item_profile["original_language"], prefix="lang")

        # put runtime into three quantile buckets (short, medium, long)
        runtime_feature = self._safe_get_feature("runtime")
        if runtime_feature is not None:
            runtime_bucket = pd.qcut(runtime_feature, q=3, labels=["kurz", "mittel", "lang"])
            runtime_dummies = pd.get_dummies(runtime_bucket, prefix="runtime")
        else:
            runtime_dummies = pd.DataFrame(index=self.item_profile.index)

        # budget and revenue are log-transformed and then standardised
        numerical_features = []
        if "budget" in self.item_profile.columns:
            self.item_profile["log_budget"] = np.log1p(self.item_profile["budget"].fillna(0))
            numerical_features.append("log_budget")
        if "revenue" in self.item_profile.columns:
            self.item_profile["log_revenue"] = np.log1p(self.item_profile["revenue"].fillna(0))
            numerical_features.append("log_revenue")

        if numerical_features:
            scaler = StandardScaler()
            scaled_numericals = scaler.fit_transform(self.item_profile[numerical_features])
        else:
            scaled_numericals = np.empty((len(self.item_profile), 0))

        # create feature matrix (row i describes the i-th row of item_profile)
        self.feature_matrix = hstack([
            title_features,
            genre_features,
            language_dummies.values,
            runtime_dummies.values,
            scaled_numericals
        ])

        self.feature_matrix = csr_matrix(self.feature_matrix)

    def _check_values(self):
        """Raise ``ValueError`` if the requested user or item is unknown."""
        if self.user_id not in self.user_ratings["user_ID"].values:
            raise ValueError(f"User-ID {self.user_id} not found.")

        if self.item_id not in self.item_profile["item_ID"].values:
            raise ValueError(f"Item-ID {self.item_id} not found.")

    def predict(
            self,
            user_id: str,
            item_id: str,
            similarity: Optional[Literal['cosine', 'pearson']] = 'cosine',  # only for collaborative filtering
            calculation_variety: Optional[Literal['weighted', 'unweighted']] = 'weighted', # only for collaborative filtering
            k: Optional[int] = 3,
            second_k_value: Optional[int] = None) -> float:
        """Predict the rating of ``user_id`` for ``item_id``.

        Args:
            user_id: User to predict for.
            item_id: Item to predict.
            similarity: Ignored; present for interface compatibility.
            calculation_variety: Ignored; present for interface compatibility.
            k: Number of similar rated items to average over. It is reduced
                when the user has rated fewer other items.
            second_k_value: Ignored; present for interface compatibility.

        Returns:
            Mean rating of the nearest rated items, or ``0.0`` if the user
            has rated no other item that is present in ``item_profile``.

        Raises:
            ValueError: If the user or the item is unknown.
        """
        # default function to save all the information
        self._prepare_information(user_id=user_id, item_id=item_id, k=k)

        # check that the values are included in the dataframes
        self._check_values()

        # ratings of this user, one per item (the first one wins on duplicates)
        rated_items = self.user_ratings[self.user_ratings["user_ID"] == user_id]
        rating_by_item = rated_items.drop_duplicates(subset="item_ID").set_index("item_ID")["rating"]

        # Work with row positions, not index labels: feature_matrix rows are
        # positional, so this stays correct for any item_profile index.
        profile_item_ids = self.item_profile["item_ID"].to_numpy()
        is_target = profile_item_ids == item_id
        target_position = int(np.flatnonzero(is_target)[0])

        # Candidates are the rated items except the requested one, so the
        # query item can never be returned as its own neighbour.
        is_candidate = self.item_profile["item_ID"].isin(rating_by_item.index).to_numpy() & ~is_target
        candidate_positions = np.flatnonzero(is_candidate)

        # nothing to compare against -> default 0.0
        if len(candidate_positions) == 0:
            return 0.0

        # k must not exceed the number of candidates
        if self.k > len(candidate_positions):
            self.k = len(candidate_positions)
        if self.k < 1:
            return 0.0

        # default kNN usage like in the lecture with brute algorithm and cosine as metric
        knn = NearestNeighbors(metric="cosine", algorithm="brute")
        knn.fit(self.feature_matrix[candidate_positions])
        _, indices = knn.kneighbors(self.feature_matrix[target_position], n_neighbors=self.k)

        # map neighbour positions in the candidate matrix back to item_profile rows
        neighbor_positions = candidate_positions[indices.ravel()]
        similar_ratings = [rating_by_item.loc[profile_item_ids[position]] for position in neighbor_positions]

        # predicted rating = plain mean of the neighbours' ratings
        return float(sum(similar_ratings) / len(similar_ratings))

In [None]:
class HybridRecommender(Recommender):
    """Weighted blend of collaborative filtering and content-based predictions.

    The final score is
    ``alpha * collaborative + (1 - alpha) * content_based``.
    """

    def __init__(self, data: pd.DataFrame, item_profile: pd.DataFrame, user_ratings: pd.DataFrame, mode: Literal['user', 'item'] = 'user', alpha: float = 0.5):
        """Create both underlying recommenders.

        Args:
            data: Rating matrix for the collaborative filtering part.
            item_profile: Item features for the content-based part.
            user_ratings: Long-format ratings for the content-based part.
            mode: ``'user'`` or ``'item'`` based collaborative filtering.
            alpha: Weight of the collaborative prediction in the blend.
        """
        super().__init__()
        self.collaborative_recommender = CollaborativeFilteringRecommender(data=data, mode=mode)
        self.content_based_recommender = ContentBasedRecommender(item_profile=item_profile, user_ratings=user_ratings)
        self.alpha = alpha

    def _preprocess_data(self):
        # preprocessing is done by the two wrapped recommenders
        pass

    def predict(
            self,
            user_id: str,
            item_id: str,
            similarity: Optional[Literal['cosine', 'pearson']] = 'cosine',  # only for collaborative filtering
            calculation_variety: Optional[Literal['weighted', 'unweighted']] = 'weighted', # only for collaborative filtering
            k: Optional[int] = 3,
            second_k_value: Optional[int] = 3):
        """Predict a rating by blending both recommenders.

        Args:
            user_id: User to predict for.
            item_id: Item to predict.
            similarity: Similarity metric for the collaborative part.
            calculation_variety: Averaging variant for the collaborative part.
            k: Number of neighbours for the collaborative part.
            second_k_value: Number of neighbours for the content-based part;
                ``None`` means "use ``k``".

        Returns:
            The alpha-weighted combination of both predictions.
        """
        # The base interface defaults second_k_value to None; forwarding None
        # as k would crash the content-based recommender, so fall back to k.
        content_k = k if second_k_value is None else second_k_value

        collaborative_prediction = self.collaborative_recommender.predict(
            user_id=user_id,
            item_id=item_id,
            similarity=similarity,
            calculation_variety=calculation_variety,
            k=k
        )

        content_based_prediction = self.content_based_recommender.predict(
            user_id=user_id,
            item_id=item_id,
            similarity=similarity,
            calculation_variety=calculation_variety,
            k=content_k
        )

        # combine both with alpha as weight
        return (self.alpha * collaborative_prediction) + ((1 - self.alpha) * content_based_prediction)

---

In [None]:
# Torch dataset that serves (user index, item index, rating) triples
class RatingsDataset(Dataset):
    """Expose the rows of a ratings frame as tensors.

    ``data`` must provide the columns ``user_idx``, ``item_idx`` and
    ``rating``; each column is copied into one tensor at construction time.
    """

    def __init__(self, data: pd.DataFrame):
        def column_as_tensor(column: str, dtype: torch.dtype) -> torch.Tensor:
            return torch.tensor(data[column].to_numpy(), dtype=dtype)

        self.u = column_as_tensor("user_idx", torch.long)
        self.i = column_as_tensor("item_idx", torch.long)
        self.r = column_as_tensor("rating", torch.float32)

    def __len__(self):
        # one sample per rating
        return self.r.shape[0]

    def __getitem__(self, idx):
        # sample layout: (user index, item index, rating)
        user, item, rating = self.u[idx], self.i[idx], self.r[idx]
        return user, item, rating


# hybrid matrix factorization model
class HybridMF(nn.Module):
    """Matrix factorization with bias terms and projected item features.

    The predicted rating for user ``u`` and item ``i`` is
    ``<P[u], Q[i] + F(item_features[i])> + mu + bu[u] + bi[i]``.

    Args:
        num_users: Number of user embeddings.
        num_items: Number of item embeddings.
        embedding_dim: Size of the latent space.
        item_features: Tensor with one feature row per item index; its second
            dimension is the input size of the projection layer.
        dropout: Dropout probability applied to the item latent factors.
    """

    def __init__(self, num_users: int, num_items: int, embedding_dim: int, item_features, dropout: float = 0.15):
        super().__init__()
        # Embedding layers (latent vectors) for users and items
        self.P = nn.Embedding(num_users, embedding_dim)
        self.Q = nn.Embedding(num_items, embedding_dim)

        # Bias terms for users and items (individual offsets, e.g. a user who generally rates higher)
        self.bu = nn.Embedding(num_users, 1)
        self.bi = nn.Embedding(num_items, 1)

        # Global bias term shared by all predictions
        self.mu = nn.Parameter(torch.zeros(1))

        # Linear layer that projects item features into the latent space (to combine them with the item embeddings)
        self.F = nn.Linear(item_features.shape[1], embedding_dim, bias=False)

        # Register item features as a buffer (moves with the module, is not trained)
        self.register_buffer("item_features", item_features)

        # Dropout layer for regularization
        self.drop = nn.Dropout(dropout)

        # Initialize weights for embeddings and linear layer
        nn.init.normal_(self.P.weight, std=0.05)
        nn.init.normal_(self.Q.weight, std=0.05)
        nn.init.xavier_uniform_(self.F.weight)

    def forward(self, u, i):
        """Return the predicted ratings for index tensors ``u`` and ``i``.

        ``u`` and ``i`` must have the same shape; the output has that shape too.
        """
        # Item latent factors = embedding + projected features
        q = self.Q(i) + self.F(self.item_features[i])
        q = self.drop(q)

        # Remove only the trailing bias dimension. A bare squeeze() would also
        # collapse other singleton dimensions and then broadcast incorrectly
        # for index tensors that are not 1-D.
        user_bias = self.bu(u).squeeze(-1)
        item_bias = self.bi(i).squeeze(-1)

        # Compute the predicted rating
        return (self.P(u) * q).sum(-1) + self.mu + user_bias + item_bias


class DeepLearningRecommender:
    """Trains a ``HybridMF`` model on rating data and predicts single ratings.

    The model is built and trained inside the constructor. Training uses the
    smooth L1 loss, Adam, a reduce-on-plateau learning-rate schedule and early
    stopping on the validation MAE.
    """

    def __init__(
            self,
            trainingdata: pd.DataFrame,
            item_profile: pd.DataFrame,
            testdata: pd.DataFrame,
            embedding_dim=64,
            batch_size=1024,
            epochs=60,
            lr=1e-3,
            weight_decay=3e-5,
            dropout_p=0.2,
            early_stopping_rounds=10
    ) -> None:
        """Prepare the data, build the model and train it.

        Args:
            trainingdata: Long-format ratings (``user_ID``, ``item_ID``, ``rating``).
            item_profile: Item table with ``item_ID`` and optional ``Genre_*`` columns.
            testdata: Long-format ratings used as validation set.
            embedding_dim: Size of the latent space.
            batch_size: Mini-batch size for both loaders.
            epochs: Maximum number of training epochs.
            lr: Learning rate.
            weight_decay: Weight decay passed to Adam.
            dropout_p: Dropout probability of the model.
            early_stopping_rounds: Stop after this many epochs without
                validation improvement.
        """
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr  # learning rate
        self.weight_decay = weight_decay
        self.dropout_p = dropout_p
        self.early_stopping_rounds = early_stopping_rounds
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Prepare data and build the model
        self.train_data, self.val_data, self.feat_mat, self.global_mean = self._prepare_data(trainingdata, item_profile, testdata)
        self.model = self._build_model()
        self.fit()

    def _prepare_data(self, trainingdata, item_profile, testdata):
        """Build index mappings, the item feature tensor and both data loaders.

        Returns:
            ``(train_loader, val_loader, item_feature_tensor, global_mean)``.
        """
        # Work on copies so the caller's frames are not changed.
        trainingdata = trainingdata.copy()
        testdata = testdata.copy()

        # Convert user and item ids to strings
        for df in (trainingdata, testdata):
            df[["user_ID", "item_ID"]] = df[["user_ID", "item_ID"]].astype(str)

        # Create mappings from user/item ids to indices
        self.user2idx = {u: i for i, u in enumerate(trainingdata["user_ID"].unique())}
        self.item2idx = {m: j for j, m in enumerate(trainingdata["item_ID"].unique())}

        # Map user and item ids to the indices in training and validation data
        for df in (trainingdata, testdata):
            df["user_idx"] = df["user_ID"].map(self.user2idx)
            df["item_idx"] = df["item_ID"].map(self.item2idx)

        # Users/items that never occur in training have no embedding: their
        # index is NaN and cannot be converted to an integer tensor, so these
        # rows are left out of the validation set.
        has_embedding = testdata["user_idx"].notna() & testdata["item_idx"].notna()
        testdata = testdata[has_embedding].astype({"user_idx": "int64", "item_idx": "int64"})

        feat_mat = self._build_item_features(item_profile)

        # Global mean rating; used as fallback for unknown users/items (cold start)
        global_mean = trainingdata["rating"].mean()

        # Data loaders take care of batching and shuffling
        train_loader = DataLoader(RatingsDataset(trainingdata), batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(RatingsDataset(testdata), batch_size=self.batch_size)

        return train_loader, val_loader, feat_mat, global_mean

    def _build_item_features(self, item_profile):
        """Return the scaled genre features as tensor with one row per item index."""
        num_items = len(self.item2idx)

        # item2idx is keyed by str, so the profile IDs must be str as well
        ip = item_profile.copy()
        ip["item_ID"] = ip["item_ID"].astype(str)
        ip = ip[ip["item_ID"].isin(self.item2idx)].drop_duplicates(subset="item_ID")
        genre_df = ip.filter(regex="^Genre_")

        # placeholder if there is no feature
        if genre_df.empty:
            return torch.zeros((num_items, 1), dtype=torch.float32)

        scaled = StandardScaler().fit_transform(genre_df.fillna(0))

        # Place every row at its item index; items without a profile row keep
        # an all-zero feature vector. This keeps the matrix aligned with the
        # item embeddings even if the profile is incomplete.
        features = np.zeros((num_items, scaled.shape[1]), dtype=np.float32)
        features[ip["item_ID"].map(self.item2idx).to_numpy()] = scaled
        return torch.tensor(features, dtype=torch.float32)

    def _build_model(self):
        """Build the hybrid matrix factorization model on the selected device."""
        return HybridMF(
            num_users=len(self.user2idx),
            num_items=len(self.item2idx),
            embedding_dim=self.embedding_dim,
            item_features=self.feat_mat,
            dropout=self.dropout_p,
        ).to(self.device)

    def fit(self):
        """Train the model and restore the weights with the best validation MAE."""
        # Initialize optimizer and learning rate scheduler
        opt = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(opt, factor=0.5, patience=3)
        best_mae, epochs_no_imp = float("inf"), 0
        best_model_state = None  # snapshot of the best weights seen so far

        for ep in range(1, self.epochs + 1):
            self.model.train()
            epoch_loss = 0.0
            for u, i, r in self.train_data:
                u, i, r = u.to(self.device), i.to(self.device), r.to(self.device)
                opt.zero_grad()
                loss = nn.functional.smooth_l1_loss(self.model(u, i), r, beta=1.0)  # forward pass
                loss.backward()  # backpropagation
                opt.step()
                epoch_loss += loss.item()  # collect the loss value

            # Validation error (the smaller the MAE the better)
            val_mae = self.evaluate_loader(self.val_data)
            # Log average loss and MAE of each epoch
            logger.debug(f"Epoche {ep}\t| Training Loss: {epoch_loss / len(self.train_data)} \t| Validation MAE: {val_mae}")
            scheduler.step(val_mae)

            if val_mae < best_mae:
                best_mae, epochs_no_imp = val_mae, 0
                # state_dict() hands out references to the live tensors, which
                # keep changing during training; clone them to get a snapshot.
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
            else:
                epochs_no_imp += 1
                if epochs_no_imp >= self.early_stopping_rounds:
                    break

        # Load the best model state from memory
        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)

    def evaluate_loader(self, loader):
        """Return the mean absolute error on ``loader`` (NaN if it is empty)."""
        self.model.eval()
        err, n = 0.0, 0
        with torch.no_grad():
            for u, i, r in loader:
                preds = self.model(u.to(self.device), i.to(self.device)).cpu()
                err += torch.abs(preds - r).sum().item()
                n += len(r)
        if n == 0:
            return float("nan")
        return err / n

    def predict(self, user_id, item_id):
        """Predict the rating of ``user_id`` for ``item_id``.

        Unknown users and/or items are handled with the global mean rating,
        plus the learned bias of whichever side is known. Except for the
        plain global mean, results are clipped to ``[0, 5]``.
        """
        # The mappings are keyed by str, so normalise the ids first
        user_idx = self.user2idx.get(str(user_id))
        item_idx = self.item2idx.get(str(item_id))

        # Handle cold-start cases
        if user_idx is None and item_idx is None:
            return float(self.global_mean)

        if user_idx is None:
            item_bias = self.model.bi.weight[item_idx].item()
            return float(np.clip(self.global_mean + item_bias, 0.0, 5.0))

        if item_idx is None:
            user_bias = self.model.bu.weight[user_idx].item()
            return float(np.clip(self.global_mean + user_bias, 0.0, 5.0))

        # Compute the predicted rating
        u = torch.tensor([user_idx], device=self.device)
        i = torch.tensor([item_idx], device=self.device)
        self.model.eval()
        with torch.no_grad():
            score = self.model(u, i).item()
        return float(np.clip(score, 0.0, 5.0))



class HyperparamOptimizedDeepLearningRecommender(Recommender):
    """Wraps ``DeepLearningRecommender`` with an optional Optuna search.

    The underlying model is built lazily on the first ``predict`` call, either
    with freshly searched hyperparameters or with fixed values from an
    earlier search.
    """

    def __init__(self, testdata: pd.DataFrame, item_profile: pd.DataFrame, trainingdata: pd.DataFrame, include_hyperparam_check: Optional[bool] = False):
        """Store the data; no training happens here.

        Args:
            testdata: Validation ratings passed to the deep learning recommender.
            item_profile: Item table passed to the deep learning recommender.
            trainingdata: Training ratings passed to the deep learning recommender.
            include_hyperparam_check: Run the Optuna search before building
                the model instead of using the stored parameters.
        """
        super().__init__()
        self.testdata = testdata
        self.item_profile = item_profile
        self.trainingdata = trainingdata

        self.include_hyperparam_check = include_hyperparam_check
        self.recommender = None

    def _preprocess_data(self):
        # this will be done in the used recommender class
        pass

    def _objective(self, trial):
        """Optuna objective: train one model and return its validation MAE."""
        embedding_dim = trial.suggest_int("embedding_dim", 32, 128, step=16)
        batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
        # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform
        lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
        dropout_p = trial.suggest_float("dropout_p", 0.1, 0.5)
        weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

        optimize_recommender = DeepLearningRecommender(
            trainingdata=self.trainingdata,
            item_profile=self.item_profile,
            testdata=self.testdata,
            embedding_dim=embedding_dim,
            batch_size=batch_size,
            lr=lr,
            dropout_p=dropout_p,
            weight_decay=weight_decay,
        )

        return optimize_recommender.evaluate_loader(optimize_recommender.val_data)

    def _build_recommender(self, params: dict) -> None:
        """Train the final recommender with the given hyperparameters."""
        self.recommender = DeepLearningRecommender(
            trainingdata=self.trainingdata,
            item_profile=self.item_profile,
            testdata=self.testdata,
            embedding_dim=params["embedding_dim"],
            batch_size=params["batch_size"],
            lr=params["lr"],
            dropout_p=params["dropout_p"],
            weight_decay=params["weight_decay"],
        )

    def _find_out_best_params(self) -> Dict[str, Any]:
        """Run 50 Optuna trials and return the best parameter set."""
        study = optuna.create_study(direction="minimize")
        study.optimize(self._objective, n_trials=50)

        print("Beste Parameter:", study.best_params)

        return study.best_params

    def predict(
            self,
            user_id: str,
            item_id: str,
            similarity: Optional[Literal['cosine', 'pearson']] = 'cosine',
            calculation_variety: Optional[Literal['weighted', 'unweighted']] = 'weighted',
            k: Optional[int] = 3,
            second_k_value: Optional[int] = None,
    ) -> float:
        """Predict the rating of ``user_id`` for ``item_id``.

        Only ``user_id`` and ``item_id`` are used; the remaining parameters
        exist for interface compatibility with the other recommenders.
        """
        # Build (and, if requested, tune) the model only once. The search
        # trains 50 models, so it must not run again on every prediction.
        if self.recommender is None:
            if self.include_hyperparam_check:
                best_params = self._find_out_best_params()
            else:
                # values came from our test with the hyperparam check
                best_params = {
                    "lr": 0.00483293357554159,
                    "embedding_dim": 32,
                    "dropout_p": 0.24090306736140638,
                    "weight_decay": 0.00022232253823222672,
                    "batch_size": 64,
                }
            self._build_recommender(best_params)

        return self.recommender.predict(user_id, item_id)

---

## MAE Tester
Verwendet, um die verschiedenen Recommender zu evaluieren und anhand des MAE zu vergleichen.

In [None]:
class Test(BaseModel):
    """Configuration of a single evaluation run.

    Only ``name`` and ``type`` are required; every other setting may be left
    out (its default is used) or set to ``None``.
    """

    # label used in log output and results
    name: str
    # which recommender family this test exercises
    type: Literal["collaborative_filtering", "content_based", "hybrid", "deep_learning"]
    # user- or item-based neighbourhood
    mode: Literal["user", "item"] | None = "item"
    # number of neighbours
    k_value: int | None = 3
    # second neighbour count (presumably for the content-based part of the hybrid; confirm in the tester)
    second_k_value: int | None = 3
    # similarity metric
    metric: Literal["cosine", "pearson"] | None = 'cosine'
    # weighted or plain mean of neighbour ratings
    calculation_variety: Literal["weighted", "unweighted"] | None = 'weighted'
    # blend weight (presumably the hybrid recommender's alpha; confirm in the tester)
    alpha: float | None = 0.5


class TestResult(BaseModel):
    """Outcome of one test: the test's settings plus the measured error.

    All fields are required; the settings fields accept ``None``.
    """
    # label of the test
    name: str
    # recommender family that was evaluated
    type: Literal["collaborative_filtering", "content_based", "hybrid", "deep_learning"]
    # settings fields, named like the fields of ``Test``
    mode: Literal["user", "item"] | None
    k_value: int | None
    second_k_value: int | None
    metric: Literal["cosine", "pearson"] | None
    calculation_variety: Literal["weighted", "unweighted"] | None
    alpha: float | None
    # error score of the run (presumably the mean absolute error, given the tester's name)
    mae: float


class TestResults(BaseModel): # just for saving in a "pretty" form
    """Container bundling the results of a whole test series."""
    # date of the run, stored as text
    date: str
    # number of tests in the series
    num_tests: int
    # the single best result (selection criterion is decided by the code that fills this model)
    best_test: TestResult
    # all individual results
    results: List[TestResult]



class MAETester:
    """Evaluate recommender configurations by their mean absolute error (MAE).

    For every test profile a recommender is built, a rating is predicted for
    each row of the evaluation data, the prediction is rounded to the nearest
    half step and compared against the actual rating of that row.
    """

    def __init__(self, tests: List[Test], test_data_path: str, data_path: str, item_profile_path: str, ratings: str, eval_data_path: str):
        """Load all CSV inputs and prepare the evaluation data.

        Args:
            tests: Test profiles to evaluate.
            test_data_path: CSV handed to the deep-learning recommender as
                its ``testdata``.
            data_path: CSV handed as ``data`` to the collaborative-filtering
                and hybrid recommenders.
            item_profile_path: CSV with the item profiles.
            ratings: Path of the CSV with the user ratings.
            eval_data_path: CSV whose rows are predicted and scored.
        """
        self.tests = tests
        self.testdata = pd.read_csv(test_data_path)  # testdata (for training of Neural Network)
        self.item_profile = pd.read_csv(item_profile_path)
        self.user_ratings = pd.read_csv(ratings)
        self.eval_data = pd.read_csv(eval_data_path)  # eval / testdata for MAE tester
        self._prepare_data()
        self.data = pd.read_csv(data_path)  # training data
        self.results: List[TestResult] = []

    def _prepare_data(self):
        """Cast the id columns of the evaluation data to strings."""
        self.eval_data["user_ID"] = self.eval_data["user_ID"].astype(str)
        self.eval_data["item_ID"] = self.eval_data["item_ID"].astype(str)

    # because we only have 0.5 steps in testdata
    @staticmethod
    def _round_to_nearest_half(value: float) -> float:
        """Round ``value`` to a multiple of 0.5.

        Uses the built-in ``round``, which resolves exact ties to the even
        integer (so 1.25 becomes 1.0 while 1.75 becomes 2.0).
        """
        return round(value * 2) / 2

    def run_tests(self) -> pd.DataFrame:
        """Run every configured test and return the summary table.

        Results of a previous call are discarded first, so calling this
        method again (e.g. re-executing a notebook cell) does not duplicate
        rows in the summary.
        """
        self.results = []

        for test in self.tests:
            result = self._run_test(test)
            self.results.append(result)
            logger.success(f"Test abgeschlossen: {test.name}, MAE: {result.mae:.4f}\n")

        # display final results
        return self._summarize_test_results()

    def _create_recommender(self, test: Test):
        """Instantiate the recommender that matches ``test.type``.

        Raises:
            ValueError: If the recommender type is unknown.
        """
        if test.type == "content_based":
            return ContentBasedRecommender(
                item_profile=self.item_profile,
                user_ratings=self.user_ratings,
            )
        if test.type == "collaborative_filtering":
            return CollaborativeFilteringRecommender(
                mode=test.mode,  # ignore type (that this can be None)
                data=self.data,
            )
        if test.type == "hybrid":
            return HybridRecommender(
                data=self.data,
                item_profile=self.item_profile,
                user_ratings=self.user_ratings,
                mode=test.mode,  # ignore type (that this can be None)
                alpha=test.alpha,
            )
        if test.type == "deep_learning":
            return HyperparamOptimizedDeepLearningRecommender(
                trainingdata=self.user_ratings,
                item_profile=self.item_profile,
                testdata=self.testdata,
            )
        raise ValueError(f"Unbekannter Recomendertyp: {test.type}")

    def _run_test(self, test: Test) -> TestResult:
        """Evaluate one test profile on the evaluation data.

        Rows whose prediction raises ``ValueError`` are logged and left out
        of the MAE.

        Raises:
            ValueError: If the recommender type is unknown or no row could
                be predicted at all.
        """
        logger.info(f"Running test: {test.name}")

        recommender = self._create_recommender(test)

        predictions = []
        actuals = []

        # Rows are read positionally: column 0 = user id, 1 = item id, 2 = rating.
        eval_data_list = self.eval_data.to_numpy()

        for row in tqdm(eval_data_list, desc="Vorhersagen werden berechnet"):
            user_id: str = str(row[0])
            item_id: str = str(row[1])
            actual_rating = row[2]

            try:
                predicted_rating = recommender.predict(
                    user_id=user_id,
                    item_id=item_id,
                    similarity=test.metric,
                    calculation_variety=test.calculation_variety,
                    k=test.k_value,
                    second_k_value=test.second_k_value,
                )

                predicted_rating = self._round_to_nearest_half(value=predicted_rating)

                predictions.append(predicted_rating)
                actuals.append(actual_rating)
            except ValueError as e:
                logger.warning(f"Fehler bei der Vorhersage: {e}")

        mae = self._mean_absolute_error(actuals, predictions)

        return TestResult(
            name=test.name,
            type=test.type,
            mode=test.mode,
            k_value=test.k_value,
            metric=test.metric,
            calculation_variety=test.calculation_variety,
            alpha=test.alpha,
            second_k_value=test.second_k_value,
            mae=mae,
        )

    @staticmethod
    def _mean_absolute_error(actuals: List[float], predictions: List[float]) -> float:
        """Return the mean of ``|actual - prediction|`` over both lists.

        Raises:
            ValueError: If a list is empty or the lengths differ.
        """
        if not actuals or not predictions or len(actuals) != len(predictions):
            raise ValueError("Listen für tatsächliche und vorhergesagte Werte müssen gleich lang und nicht leer sein.")

        absolute_errors = [abs(a - p) for a, p in zip(actuals, predictions)]
        return sum(absolute_errors) / len(absolute_errors)

    def _summarize_test_results(self) -> pd.DataFrame:
        """Print the collected results as a table and return them.

        Settings a recommender type does not report are shown as ``"/"``.
        Without any results an empty frame with the same columns is
        returned, so the return value is always a DataFrame.
        """
        columns = [
            "Testname",
            "Recomendertyp",
            "Modus",
            "k-Wert",
            "Metrik",
            "Berechnungsvariante",
            "Alpha (weight)",
            "MAE",
        ]

        if not self.results:
            logger.info("Keine Testergebnisse vorhanden.")
            return pd.DataFrame(columns=columns)

        rows = [{
            "Testname": result.name,
            "Recomendertyp": result.type,
            "Modus": result.mode if result.type == "collaborative_filtering" else "/",
            "k-Wert": "/" if result.type == "deep_learning" else result.k_value,
            "Metrik": result.metric if result.type == "collaborative_filtering" else "/",
            "Berechnungsvariante": result.calculation_variety if result.type == "collaborative_filtering" else "/",
            "Alpha (weight)": result.alpha if result.type == "hybrid" else "/",
            "MAE": result.mae
        } for result in self.results]
        summary_df = pd.DataFrame(rows, columns=columns)

        print("-" * 50)
        print("Zusammenfassung der Testergebnisse:")
        print(summary_df.to_string(index=False))
        print("-" * 50)

        return summary_df


# Beginn der Aufrufe und Nutzung der Recommender

Wir haben viele Testfälle durchgeführt und uns sowohl mit dem klassischen Collaborative Filtering (User- & Item-based) als auch mit Content-Based Filtering beschäftigt. Anschließend wurden beide Methoden in einem hybriden Recommender kombiniert. Dieser Ansatz verbesserte den MAE minimal, weshalb wir uns zusätzlich mit einem Deep-Learning-Recommender beschäftigten.

---

Der Deep-Learning Recommender performte besser, jedoch aufgrund der geringen Datenmenge nach wie vor nicht perfekt. Zusätzlich zum besten MAE liefert der Deep-Learning Recommender den Vorteil, nach einmaligem Training schnell Empfehlungen liefern zu können.

Um diesen Recommender zu testen, stellen wir eine Struktur bereit, bei der lediglich der Parameter `eval_data_path` angepasst werden muss, um den Evaluationsdatensatz zu verwenden. *(zu sehen etwas weiter unten)*

In [None]:
# Profile list containing only the deep-learning recommender;
# all other Test fields keep their defaults.
choosen_recommender_profile = [Test(type="deep_learning", name="ChoosenDeepLearning")]

---

#### Optionale (andere) Testprofile, die verwendet wurden und der Vollständigkeit halber hier aufgeführt sind

In [None]:
# Collaborative-filtering grid: metric x mode x k, always with weighted calculation.
# Profile number N uses k = N + 1 (k = 2..5). The "_1_" profiles previously used
# k_value=4, duplicating the "_3_" profiles; the documented results list k = 2.
collaborative_filtering_profiles = [
    Test(
        name=f"{label}_{number}_{metric}",
        type="collaborative_filtering",
        mode=mode,
        k_value=k,
        metric=metric,
        calculation_variety="weighted",
    )
    for metric in ("cosine", "pearson")
    for label, mode in (("UserBased", "user"), ("ItemBased", "item"))
    for number, k in enumerate((2, 3, 4, 5), start=1)
]

In [None]:
# Content-based profiles: ContentBased_1 .. ContentBased_12 with k = 3 .. 14.
content_based_profiles = [
    Test(name=f"ContentBased_{number}", type="content_based", k_value=k)
    for number, k in enumerate(range(3, 15), start=1)
]

In [None]:
# Hybrid profiles Hybrid_1 .. Hybrid_12 (user mode, k=5, weighted):
# for each metric, first with the default second_k_value, then with
# second_k_value=14, each time for alpha = 0.5, 0.75 and 0.25.
hybrid_profiles = [
    Test(
        name=f"Hybrid_{number}",
        type="hybrid",
        mode="user",
        k_value=5,
        metric=metric,
        calculation_variety="weighted",
        alpha=alpha,
        **second_k,
    )
    for number, (metric, second_k, alpha) in enumerate(
        (
            (metric, second_k, alpha)
            for metric in ("cosine", "pearson")
            for second_k in ({}, {"second_k_value": 14})
            for alpha in (0.5, 0.75, 0.25)
        ),
        start=1,
    )
]

---

## Aufruf des MAE Testers
Dem Tester werden die entscheidenden Daten übergeben; anschließend wird der MAE für die ausgewählten bzw. übergebenen Test-Profile ermittelt.

In [None]:
# Upload the CSV file(s) via the Google Colab file helper
from google.colab import files
uploaded = files.upload()

In [None]:
eval_data_path = "Testdaten_FlixNet.csv"   # TODO: change this path to evaluate on a different dataset

In [None]:
# Build the tester for the chosen profile(s) from the CSV files.
tester = MAETester(
    tests=choosen_recommender_profile,
    data_path="Bewertungsmatrix_FlixNet.csv",
    item_profile_path="Itemprofile_FlixNet.csv",
    ratings="Ratings_FlixNet.csv",
    test_data_path="Testdaten_FlixNet.csv",  # also handed to the neural network as its test data
    eval_data_path=eval_data_path,  # TODO: set the path of your evaluation dataset
)

In [None]:
# Run all configured tests once; the stray leading "1" and the duplicated
# statements were removed so the cell is valid and evaluates only a single time.
result = tester.run_tests()
result  # last expression of the cell: shows the summary DataFrame

[32m2025-06-16 13:23:37.906[0m | [1mINFO    [0m | [36m__main__[0m:[36m_run_test[0m:[36m65[0m - [1mRunning test: ChoosenDeepLearning[0m
Vorhersagen werden berechnet:   0%|          | 0/1595 [00:00<?, ?it/s][32m2025-06-16 13:23:38.730[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfit[0m:[36m156[0m - [34m[1mEpoche 1	| Training Loss: 2.077234741597801 	| Validation MAE: 1.7166857776223305[0m
[32m2025-06-16 13:23:39.436[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfit[0m:[36m156[0m - [34m[1mEpoche 2	| Training Loss: 0.6823071730907425 	| Validation MAE: 0.8501930045483628[0m
[32m2025-06-16 13:23:40.175[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfit[0m:[36m156[0m - [34m[1mEpoche 3	| Training Loss: 0.4008083528212268 	| Validation MAE: 0.794121946137527[0m
[32m2025-06-16 13:23:40.975[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfit[0m:[36m156[0m - [34m[1mEpoche 4	| Training Loss: 0.35764197563263855 	| Validation MAE: 0.7

--------------------------------------------------
Zusammenfassung der Testergebnisse:
           Testname Recomendertyp Modus k-Wert Metrik Berechnungsvariante Alpha (weight)      MAE
ChoosenDeepLearning deep_learning     /      /      /                   /              / 0.661129
ChoosenDeepLearning deep_learning     /      /      /                   /              / 0.663009
--------------------------------------------------


Unnamed: 0,Testname,Recomendertyp,Modus,k-Wert,Metrik,Berechnungsvariante,Alpha (weight),MAE
0,ChoosenDeepLearning,deep_learning,/,/,/,/,/,0.661129
1,ChoosenDeepLearning,deep_learning,/,/,/,/,/,0.663009


## Ergebnis aller Testcases (ohne DeepLearning)

| Testname              | Recomendertyp           | Modus                  | k-Wert | Metrik  | Berechnungsvariante | Alpha (weight) | MAE      |
|-----------------------|--------------------------|-------------------------|--------|---------|----------------------|----------------|----------|
| UserBased_1_cosine    | collaborative_filtering | user                   | 2      | cosine  | weighted             | /              | 0.870846 |
| UserBased_2_cosine    | collaborative_filtering | user                   | 3      | cosine  | weighted             | /              | 0.815674 |
| UserBased_3_cosine    | collaborative_filtering | user                   | 4      | cosine  | weighted             | /              | 0.785266 |
| UserBased_4_cosine    | collaborative_filtering | user                   | 5      | cosine  | weighted             | /              | 0.774922 |
| ItemBased_1_cosine    | collaborative_filtering | item                   | 2      | cosine  | weighted             | /              | 0.933542 |
| ItemBased_2_cosine    | collaborative_filtering | item                   | 3      | cosine  | weighted             | /              | 0.884013 |
| ItemBased_3_cosine    | collaborative_filtering | item                   | 4      | cosine  | weighted             | /              | 0.855799 |
| ItemBased_4_cosine    | collaborative_filtering | item                   | 5      | cosine  | weighted             | /              | 0.841066 |
| UserBased_1_pearson   | collaborative_filtering | user                   | 2      | pearson | weighted             | /              | 0.865831 |
| UserBased_2_pearson   | collaborative_filtering | user                   | 3      | pearson | weighted             | /              | 0.805956 |
| UserBased_3_pearson   | collaborative_filtering | user                   | 4      | pearson | weighted             | /              | 0.781505 |
| UserBased_4_pearson   | collaborative_filtering | user                   | 5      | pearson | weighted             | /              | 0.771473 |
| ItemBased_1_pearson   | collaborative_filtering | item                   | 2      | pearson | weighted             | /              | 0.927273 |
| ItemBased_2_pearson   | collaborative_filtering | item                   | 3      | pearson | weighted             | /              | 0.873981 |
| ItemBased_3_pearson   | collaborative_filtering | item                   | 4      | pearson | weighted             | /              | 0.847022 |
| ItemBased_4_pearson   | collaborative_filtering | item                   | 5      | pearson | weighted             | /              | 0.825078 |
| ContentBased_1        | content_based           | /                      | 3      | /       | /                    | /              | 0.878683 |
| ContentBased_2        | content_based           | /                      | 4      | /       | /                    | /              | 0.839812 |
| ContentBased_3        | content_based           | /                      | 5      | /       | /                    | /              | 0.822571 |
| ContentBased_4        | content_based           | /                      | 6      | /       | /                    | /              | 0.830721 |
| ContentBased_5        | content_based           | /                      | 7      | /       | /                    | /              | 0.827273 |
| ContentBased_6        | content_based           | /                      | 8      | /       | /                    | /              | 0.816928 |
| ContentBased_7        | content_based           | /                      | 9      | /       | /                    | /              | 0.813166 |
| ContentBased_8        | content_based           | /                      | 10     | /       | /                    | /              | 0.813793 |
| ContentBased_9        | content_based           | /                      | 11     | /       | /                    | /              | 0.815047 |
| ContentBased_10       | content_based           | /                      | 12     | /       | /                    | /              | 0.800313 |
| ContentBased_11       | content_based           | /                      | 13     | /       | /                    | /              | 0.803448 |
| ContentBased_12       | content_based           | /                      | 14     | /       | /                    | /              | 0.802821 |
| Hybrid_1              | hybrid                  | /                      | 5      | /       | /                    | 0.5            | 0.739812 |
| Hybrid_2              | hybrid                  | /                      | 5      | /       | /                    | 0.75           | 0.743887 |
| Hybrid_3              | hybrid                  | /                      | 5      | /       | /                    | 0.25           | 0.786834 |
| Hybrid_4              | hybrid                  | /                      | 5      | /       | /                    | 0.5            | 0.720376 |
| Hybrid_5              | hybrid                  | /                      | 5      | /       | /                    | 0.75           | 0.737304 |
| Hybrid_6              | hybrid                  | /                      | 5      | /       | /                    | 0.25           | 0.741379 |
| Hybrid_7              | hybrid                  | /                      | 5      | /       | /                    | 0.5            | 0.741379 |
| Hybrid_8              | hybrid                  | /                      | 5      | /       | /                    | 0.75           | 0.742006 |
| Hybrid_9              | hybrid                  | /                      | 5      | /       | /                    | 0.25           | 0.787147 |
| Hybrid_10             | hybrid                  | /                      | 5      | /       | /                    | 0.5            | 0.720376 |
| Hybrid_11             | hybrid                  | /                      | 5      | /       | /                    | 0.75           | 0.735110 |
| Hybrid_12             | hybrid                  | /                      | 5      | /       | /                    | 0.25           | 0.740125 |
