---
# Setup

In [1]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from time import sleep
import datetime
import sqlite3
import requests
import json
from JulianFinal import Cosine_Similarity




[9575, 16265, 9628, 5486, 10704, 12095, 10126, 9159, 8687, 276, 2003, 11996, 14247, 12365, 13987, 17742, 10, 9462, 1246, 2584, 7354, 14503, 13536, 8014, 4383, 895, 9886, 16279, 11430, 3205, 11284, 7952, 17064, 259, 10493, 10129, 11664, 15547, 13484, 7233, 6579, 17243, 5778, 8304, 9199, 11950, 16887, 6908, 11874, 15826, 11466, 2114, 13844, 16307, 6193, 8891, 11765, 9862, 10172, 14227, 13762, 3953, 15914, 6810, 5583, 9235, 3873, 9204, 7353, 9582, 16582, 2174, 15962, 6704, 15308, 15674, 10729, 9768, 12801, 11929, 4695, 1175, 8088, 14608, 9727, 3687, 12333, 12493, 14110, 5137, 15887, 6902, 11183, 12293, 14419, 16759, 16613, 2230, 6958, 15472, 3943, 8487, 9758, 13338, 1922, 8666, 702, 6328, 6362, 12745, 13489, 5666, 9442, 13288, 13607, 13954, 12582, 8849, 7080, 16041, 4231, 1700, 10038, 11134, 2179, 4123, 1449, 9428, 12581, 5449, 1860, 16522, 2588, 3743, 16243, 11418, 2851, 12945, 5939, 7895, 3507, 12513, 241, 7729, 12466, 3672, 5756, 6207, 12092, 14670, 3642, 4980, 8099, 4758, 15191, 12591

In [2]:

cosine_similarity = Cosine_Similarity()



In [4]:

cosine_similarity.save_to_csv(0)
cosine_similarity.save_to_csv(1)
cosine_similarity.save_to_csv(2)

In [3]:
# small size for testing (100.000 rows)
db_dev_path = 'netflix_dev.db'
db_dev_conn = 'sqlite://' + db_dev_path

# full size for production (~100.000.000 rows)
db_prod_path = 'netflix.db'
db_prod_conn = 'sqlite://' + db_prod_path

netflix_data = pl.read_database("SELECT * FROM netflix_data", db_prod_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)

In [6]:
netflix_data

film,user,rating,date
i64,i64,i64,str
1,1488844,3,"""2005-09-06 """
1,822109,5,"""2005-05-13 """
1,885013,4,"""2005-10-19 """
1,30878,4,"""2005-12-26 """
1,823519,3,"""2004-05-03 """
1,893988,3,"""2005-11-17 """
1,124105,4,"""2004-08-05 """
1,1248029,3,"""2004-04-22 """
1,1842128,4,"""2004-05-09 """
1,2238063,3,"""2005-05-11 """


---
# Functions

Implement your recommendation engine logic in `predict_rating()`. Then, call
`full_evaluation()` to evaluate your engine.

| Function | Description |
| --- | --- |
| `get_user_ratings(user_id)` | Returns a DataFrame of all ratings for a given user. |
| `ratings_to_vector(ratings, placeholder=-1)` | Transforms a DataFrame of ratings into a vector. |
| `mask_ratings(ratings, split)` | Mask ratings in a user's rating DataFrame. Works like a training-/test-set split |
| `predict_rating(user_id, movie_id)` | Predict how a user would rate a movie. **IMPLEMENT YOUR ENGINE HERE**|
| `rate_all(user_id)` | Predict ratings of all movies for a given user |
| `get_top_rated(ratings, split)` | Get the top rated movies for a user. |
| `evaluate(masked_ratings, top_ratings)` | Evaluate the accuracy of the predictions. |
| `full_evaluation(user_id, split_mask, split_top)` | Like evaluate() but contains all the steps. |

In [4]:
# get all ratings for a specific user
def get_user_ratings(user_id):
    return netflix_data.filter(pl.col("user") == user_id)

In [5]:
# get all ratings for a specific user with rating > 3
def get_highly_rated_user_ratings(user_id):
    return netflix_data.filter((pl.col("user") == user_id) & (pl.col("rating") >= 5))


In [6]:
# ratings to vector
def ratings_to_vector(ratings, placeholder=-1):
    """
    Convert a DataFrame of ratings to a vector of ratings

    Args:
        ratings (DataFrame): DataFrame of ratings
        placeholder (int, optional): Placeholder value for missing ratings. Defaults to -1.

    Returns:
        list[float]: List of ratings
    """

    ratings_list = [placeholder] * len(movie_titles)

    # get the ratings as a numpy array
    for current_rating in ratings.rows():

        film = (current_rating[0]) - 1   # film id starts at 1, but list index starts at 0, so subtract 1
        rating = current_rating[2]
        ratings_list[film] = rating

    return ratings_list


In [7]:
# split ratings into masked and unmasked ratings based on a date
# all ratings before the date are unmasked, all ratings after the date are masked
def mask_ratings(ratings, split_date):
    """Mask ratings in a user's rating DataFrame

    Args:
        ratings (pl.DataFrame): A User's ratings
        split_date (datetime.datetime): Date to split ratings on

    Returns:
        [pl.Dataframe, pl.Dataframe]: Masked ratings, unmasked ratings
    """

    unmasked_ratings = []
    masked_ratings = []

    for rating in ratings.rows():

        # get the ratings date as datetime
        rating_date = datetime.datetime.strptime(rating[3][:-1], "%Y-%m-%d")

        # if the rating is before the split date, mask it
        if rating_date > split_date:
            unmasked_ratings.append(rating)
        else:
            masked_ratings.append(rating)

    # convert the lists to DataFrames
    unmasked_ratings = pl.DataFrame(unmasked_ratings, schema=ratings.columns)
    masked_ratings = pl.DataFrame(masked_ratings, schema=ratings.columns)

    #keep only masked_ratings with a score of 4 or higher
    masked_ratings = masked_ratings.filter(pl.col("rating") >= 4)

    return masked_ratings, unmasked_ratings

In [8]:
# sort sums by score
def sort_sums(sums):

    # convert the sums to a list of tuples
    sums_list = []
    for i in range(len(sums)):
        sums_list.append((i, sums[i]))

    # sort the list
    sums_list.sort(key=lambda x: x[1], reverse=True)

    return sums_list

In [9]:
# predict how a user would rate a movie
import random


def predict_rating(user_id, movie_id):
    """Predict how a user would rate a movie
    
    Args:
        user_id (int): User ID
        movie_id (int): Movie ID

    Returns:
        float: Predicted rating
    """

    ### PUT YOUR EVALUATION CODE HERE ###

    return random.uniform(1, 5)

In [10]:
def get_cosine_similarities(movie_id):
    """Ermittelt die Kosinus-Ähnlichkeiten für alle Filme.
    
    Args:
        movie_id (int): Die Film-ID, für die die Kosinus-Ähnlichkeiten berechnet werden sollen.
        
    Returns:
        pl.Series: Eine Serie, die die Kosinus-Ähnlichkeiten für alle Filme enthält.
    """
    
    scores = []
    
    # Durchläuft alle Filme in der movie_titles-Liste
    for movie in movie_titles["film"]:
        
        # Berechnet die Kosinus-Ähnlichkeit für den aktuellen Film
        score = cosine_similarity.get_cosine_sim_scores(movie)
        
        # Fügt den Ähnlichkeitsscore der scores-Liste hinzu
        scores.append(score)
        
        # Setzt den Ähnlichkeitsscore auf 0, wenn der aktuelle Film der Ziel-Film ist
        if movie == movie_id:
            score = 0

    # Wandelt die scores-Liste in eine Pandas-Serie um
    scores = pl.Series("scores", scores)
    
    return scores



In [11]:
def sum_cosine_similarities(unmasked_ratings):
    """Berechnet die Summe der Kosinus-Ähnlichkeiten für nicht maskierte Bewertungen.
    
    Args:
        unmasked_ratings (DataFrame): Die Bewertungen, die nicht maskiert sind.
        
    Returns:
        DataFrame: Ein DataFrame, der die Summe der Kosinus-Ähnlichkeiten für jede Film-ID enthält.
    """
    
    # Hole die user_id aus unmasked_ratings
    user_id = [unmasked_ratings["user"][0]] * len(movie_titles["film"])

    # Initialisiere die sum_scores-Liste mit Nullen
    sum_scores = [0] * len(movie_titles["film"])
    sum_scores = pl.Series("sum_scores", sum_scores)

    # Iteriere durch jede Bewertungszeile in unmasked_ratings
    for rating in unmasked_ratings.rows():

        # Überprüfe, ob die Bewertung größer oder gleich 3 ist
        if rating[2] >= 3:

            # Hole die vorausberechneten Kosinus-Ähnlichkeiten für den aktuellen Film
            scores = get_cosine_similarities(rating)

            # Aktualisiere sum_scores durch Addition der aktuellen scores
            sum_scores = sum_scores + scores

    # Füge die sum_scores mit den Film-IDs aus movie_titles zusammen
    sum_scores = pl.DataFrame({"user": user_id, "film": movie_titles["film"], "rating": sum_scores})
            
    return sum_scores


In [12]:
# rate all movies for a given user
def rate_all(user_id):
    """Returns a dataframe with all ratings for a given user

    Args:
        user_id (int): User ID

    Returns:
        pl.DataFrame: Ratings for a given user
    """


    # get all movies
    movies = movie_titles["film"]
    ratings = []

    for movie in movies:
        # predict rating
        rating = predict_rating(user_id, movie)

        # add rating to dataframe
        ratings.append([user_id, movie, rating])

    ratings = pl.DataFrame(ratings, schema=["user", "film", "rating"])

    return ratings

In [13]:
# get the top n ratings for a given user
def get_top_rated(ratings, n):
    """Get the top n of ratings for a given user

    Args:
        ratings (pl.DataFrame): Ratings
        n (int): Number of ratings to get

    Returns:
        pl.DataFrame: Top n ratings
    """

    # sort ratings by rating
    ratings = ratings.sort(by="rating", descending=True)

    # take the top n ratings
    top_ratings = ratings.head(n)

    return top_ratings

In [14]:
# evaluate how well a model performs
# for this, we compare how well the model predicts the top m% of ratings
# we calculate the intersection between the masked ratings and the top m% of ratings
def evaluate(masked_ratings, top_ratings):
    """Evaluate how well a model performs

    Args:
        masked_ratings (pl.DataFrame): Masked ratings
        top_ratings (pl.DataFrame): Top m% of ratings

    Returns:
        float: Percentage of ratings that were correctly predicted
    """

    # get intersection between masked ratings and top ratings
    intersection = masked_ratings.join(top_ratings, on=["user", "film"], how="inner")

    # calculate percentage of ratings that were correctly predicted
    percentage = len(intersection) / len(masked_ratings)

    return percentage

In [15]:
# all steps combined
# TODO: update the parameters
def full_evaluation(user_id, split_mask, split_top):
    """Evaluate how well a model performs (for a given user)

    Args:
        user_id (int): User ID
        split_mask (float): Percentage of ratings to mask
        split_top (float): Percentage of top ratings to compare with the masked ratings

    Returns:
        float: Percentage of ratings that were correctly predicted
    """

    masked_ratings, unmasked_ratings = mask_ratings(get_user_ratings(user_id), split_mask)

    all_ratings = rate_all(user_id)
    top_ratings = get_top_rated(all_ratings, split_top)

    evaluation =  evaluate(masked_ratings, top_ratings)

    return evaluation

In [16]:
def full_evaluate_cosine_similarity(user_id, split_date, top_n):
    """Bewertet die Leistung eines Modells für einen bestimmten Benutzer.
    
    Achtung: Diese Funktion kann bei großen Datensätzen Laufzeitprobleme verursachen.

    Args:
        user_id (int): Benutzer-ID
        split_date (datetime.datetime): Datum, an dem die Bewertungen aufgeteilt werden
        top_n (int): Anzahl der besten Bewertungen, die mit den maskierten Bewertungen verglichen werden sollen

    Returns:
        float: Prozentsatz der Bewertungen, die korrekt vorhergesagt wurden
    """
    
    # Hole alle hoch bewerteten Filme für den Benutzer
    ratings = get_highly_rated_user_ratings(user_id)

    # Teile die Bewertungen in maskierte und nicht maskierte Bewertungen auf
    unmasked_ratings, masked_ratings = mask_ratings(ratings, split_date)

    # Berechne die Summe der Kosinus-Ähnlichkeiten für die nicht maskierten Bewertungen
    sums = sum_cosine_similarities(unmasked_ratings)

    # Hole die Top-n-Bewertungen basierend auf den Kosinus-Ähnlichkeiten
    top_ratings = get_top_rated(sums, top_n)

    # Evaluierung durchführen
    evaluation = evaluate(masked_ratings, top_ratings)

    return evaluation


In [None]:
# Funktion zur Berechnung der fokussierten Genauigkeit der Filmempfehlungen für einen Benutzer
def calculate_focused_accuracy(user, recommended_movies, user_ratings):
    # Filtert den DataFrame user_ratings, um nur die Bewertungen des gegebenen Benutzers zu erhalten
    user_ratings_filtered = user_ratings.filter(user_ratings['user'] == user)
    
    # Erstellt ein Wörterbuch aus den gefilterten Bewertungen für einfachen Zugriff
    user_ratings_dict = {row[0]: row[2] for row in user_ratings_filtered.iter_rows()}

    # Ermittelt die Schnittmenge der Filme, die dem Benutzer empfohlen wurden und die er bewertet hat
    intersect_movies = set(recommended_movies).intersection(set(user_ratings_dict.keys()))

    # Überprüft, ob die Schnittmenge leer ist
    if len(intersect_movies) == 0:
        return "Keiner der empfohlenen Filme wurde vom Nutzer bewertet"

    # Zählt, wie viele der Filme in der Schnittmenge vom Benutzer positiv bewertet wurden
    count_positive_ratings = sum(1 for movie in intersect_movies if user_ratings_dict[movie] >= 3)

    # Gibt die fokussierte Genauigkeit zurück, die das Verhältnis der positiv bewerteten Filme zur Gesamtzahl ist
    return count_positive_ratings / len(intersect_movies)


# Funktion zur Ermittlung des am besten bewerteten Films eines Benutzers
def get_best_rated_movie(user_id):
    # Holt alle Bewertungen des gegebenen Benutzers
    user_ratings = get_user_ratings(user_id)
    
    # Sortiert die Bewertungen in absteigender Reihenfolge und wählt den ersten Film aus
    best_rated_movie = user_ratings.sort(by=['rating'], descending=True).row(0)[0]
    
    return best_rated_movie

# Beispielanwendung
# Ermittlung des am besten bewerteten Films für einen bestimmten Benutzer
best_rated_movie = get_best_rated_movie(387418)

# Ermittlung aller Bewertungen, die der Benutzer abgegeben hat
user_ratings_given = get_user_ratings(387418)

# Ermittlung der Top-50-Filmempfehlungen basierend auf dem am besten bewerteten Film
top_50_movies_for_movie = cosine_similarity.get_movie_cos_scores(best_rated_movie, bonus_actor=0.08, 
                            bonus_same_director=0.02)

# Ermittlung der fokussierten Genauigkeit der Filmempfehlungen
recommended_movies = top_50_movies_for_movie
focused_accuracy = calculate_focused_accuracy(387418, recommended_movies, user_ratings_given)

# Ausgabe der fokussierten Genauigkeit
print("Fokussierte Genauigkeit für den Benutzer:", focused_accuracy)



In [None]:
# Funktion zur Berechnung der mittleren fokussierten Genauigkeit für eine Liste von Benutzern
def calculate_mean_focused_accuracy(list_of_users, netflix_data):
    # Initialisiere Variablen für die Summe der fokussierten Genauigkeiten und die Anzahl der Benutzer mit Schnittmengen
    sum_focused_accuracy = 0
    count_users_with_intersection = 0
    
    # Iteriere durch die Liste der Benutzer
    for user in list_of_users:
        # Hole alle Bewertungen des Benutzers
        user_ratings_given = get_user_ratings(user)
        
        # Ermittle den am besten bewerteten Film für den Benutzer
        best_rated_movie = get_best_rated_movie(user)
        
        # Erhalte die Top-50-Filmempfehlungen für den am besten bewerteten Film
        recommended_movies = cosine_similarity.get_movie_cos_scores(best_rated_movie, bonus_actor=0.08, bonus_same_director=0.02)
        
        # Berechne die fokussierte Genauigkeit für den Benutzer
        focused_accuracy = calculate_focused_accuracy(user, recommended_movies, user_ratings_given)
        
        # Füge die fokussierte Genauigkeit zur Gesamtsumme hinzu, falls sie existiert
        if focused_accuracy != "Keiner der empfohlenen Filme wurde vom Nutzer bewertet":
            sum_focused_accuracy += focused_accuracy
            count_users_with_intersection += 1
            
    # Berechne die durchschnittliche fokussierte Genauigkeit
    if count_users_with_intersection == 0:
        return "Keiner der empfohlenen Filme wurde von den Benutzern bewertet"
    else:
        return sum_focused_accuracy / count_users_with_intersection

# Erzeuge eine Liste aller einzigartigen Benutzer aus den Daten
list_of_unique_users = netflix_data['user'].unique().to_list()

# Berechne und drucke die durchschnittliche fokussierte Genauigkeit
mean_focused_accuracy = calculate_mean_focused_accuracy(list_of_unique_users, netflix_data)
print("Durchschnittliche fokussierte Genauigkeit:", mean_focused_accuracy)


---
# Testing

In [18]:
# find the 5 users with the most ratings
user_rating_counts = netflix_data.groupby("user").count().sort(by="count", descending=True)
user_rating_counts.head(5)

user,count
i64,u32
305344,17653
387418,17436
2439493,16565
1664010,15813
2118461,14831


In [23]:
user_ratings = get_highly_rated_user_ratings(10)
num_ratings = len(user_ratings)
print(f"User 10 hat {num_ratings} Bewertungen abgegeben, die größer gleich 4 sind.")


User 10 hat 40 Bewertungen abgegeben, die größer gleich 4 sind.


In [47]:
# get all ratings for a user
get_user_ratings(10)
user_ratings_list = get_user_ratings(10).to_pandas().to_dict(orient='records')
print(user_ratings_list)

film,user,rating,date
i64,i64,i64,str
175,10,3,"""2003-04-18 """
191,10,4,"""2003-12-30 """
197,10,4,"""2005-08-08 """
285,10,3,"""2002-12-28 """
299,10,2,"""2003-05-21 """
468,10,3,"""2004-06-13 """
473,10,5,"""2003-07-29 """
483,10,2,"""2004-02-05 """
571,10,4,"""2002-12-20 """
886,10,3,"""2005-08-08 """


In [48]:
user_ratings_list = get_user_ratings(10).to_pandas().to_dict(orient='records')
print(user_ratings_list)

[{'film': 175, 'user': 10, 'rating': 3, 'date': '2003-04-18\n'}, {'film': 191, 'user': 10, 'rating': 4, 'date': '2003-12-30\n'}, {'film': 197, 'user': 10, 'rating': 4, 'date': '2005-08-08\n'}, {'film': 285, 'user': 10, 'rating': 3, 'date': '2002-12-28\n'}, {'film': 299, 'user': 10, 'rating': 2, 'date': '2003-05-21\n'}, {'film': 468, 'user': 10, 'rating': 3, 'date': '2004-06-13\n'}, {'film': 473, 'user': 10, 'rating': 5, 'date': '2003-07-29\n'}, {'film': 483, 'user': 10, 'rating': 2, 'date': '2004-02-05\n'}, {'film': 571, 'user': 10, 'rating': 4, 'date': '2002-12-20\n'}, {'film': 886, 'user': 10, 'rating': 3, 'date': '2005-08-08\n'}, {'film': 900, 'user': 10, 'rating': 2, 'date': '2004-05-31\n'}, {'film': 985, 'user': 10, 'rating': 5, 'date': '2002-12-31\n'}, {'film': 1145, 'user': 10, 'rating': 3, 'date': '2003-02-14\n'}, {'film': 1180, 'user': 10, 'rating': 3, 'date': '2004-06-13\n'}, {'film': 1542, 'user': 10, 'rating': 5, 'date': '2003-02-12\n'}, {'film': 1645, 'user': 10, 'rating':

In [46]:
get_user_ratings(10).write_csv('user_10_ratings.csv')


In [None]:
# transform the ratings into a vector
# here, all missing ratings get the placeholder value 0
ratings_to_vector(get_user_ratings(7), 0)

In [214]:
# evaluate the model for a user
full_evaluation(2118461, 0.5, 0.1)

0.14285714285714285

In [22]:
split_date = datetime.datetime(2005, 9, 1)
full_evaluate_cosine_similarity(10, split_date, 500)

KeyboardInterrupt: 

In [None]:
split_date = datetime.datetime(2005, 9, 1)
full_evaluate_cosine_similarity(305344, split_date, 500)

In [43]:


def get_best_rated_movie(user_id):
    user_ratings = get_user_ratings(user_id)
    best_rated_movie = user_ratings.sort(by=['rating'], descending=True).row(0)[0]
    return best_rated_movie

best_rated_movie = get_best_rated_movie(10)
user_ratings_given = get_user_ratings(10)

top_50_movies_for_movie = cosine_similarity.get_movie_cos_scores(best_rated_movie, bonus_actor=0.1, 
                            bonus_same_director=0.2, bonus_same_genre=0.2, min_same_genre_count=4)

def calculate_MAP(user, recommended_movies, user_ratings):
    sum_precision = 0
    relevant_items = 0
    total_recommended = 0
    
    # Filtere den DataFrame nach dem Benutzer
    user_ratings_filtered = user_ratings.filter(user_ratings['user'] == user)
    
    # Erstelle ein Dictionary der Bewertungen für schnellen Zugriff
    user_ratings_dict = {row[0]: row[1] for row in user_ratings_filtered.iter_rows()}
    
    for i, movie in enumerate(recommended_movies):
        user_rating = user_ratings_dict.get(movie, None)
        
        if user_rating is not None:
            if user_rating >= 4:
                relevant_items += 1
            total_recommended += 1
            sum_precision += (relevant_items / total_recommended)
    
    MAP = sum_precision / total_recommended if total_recommended > 0 else 0
    
    return MAP






MAP_for_movie = calculate_MAP(10, top_50_movies_for_movie, user_ratings_given)
print(MAP_for_movie)

0


In [28]:
def calculate_focused_accuracy(user, recommended_movies, user_ratings):
    # Filtere den DataFrame nach dem Benutzer
    user_ratings_filtered = user_ratings.filter(user_ratings['user'] == user)
    user_ratings_dict = {row[0]: row[2] for row in user_ratings_filtered.iter_rows()}

    # Finde die Schnittmenge der empfohlenen und bewerteten Filme
    intersect_movies = set(recommended_movies).intersection(set(user_ratings_dict.keys()))

    # Überprüfe, ob es eine Schnittmenge gibt
    if len(intersect_movies) == 0:
        return "Keiner der empfohlenen Filme wurde vom Nutzer bewertet"

    # Zähle, wie viele der Schnittmengenfilme positiv bewertet wurden
    count_positive_ratings = sum(1 for movie in intersect_movies if user_ratings_dict[movie] >= 3)

    return count_positive_ratings / len(intersect_movies)

def get_best_rated_movie(user_id):
    user_ratings = get_user_ratings(user_id)
    best_rated_movie = user_ratings.sort(by=['rating'], descending=True).row(0)[0]
    return best_rated_movie

best_rated_movie = get_best_rated_movie(387418)
user_ratings_given = get_user_ratings(387418)

top_50_movies_for_movie = cosine_similarity.get_movie_cos_scores(best_rated_movie, bonus_actor=0.08, 
                            bonus_same_director=0.02)
# Beispielanwendung
recommended_movies = top_50_movies_for_movie
focused_accuracy = calculate_focused_accuracy(387418, recommended_movies, user_ratings_given)
print("Fokussierte Genauigkeit für den Benutzer:", focused_accuracy)


Fokussierte Genauigkeit für den Benutzer: 0.24067796610169492


In [25]:
def get_best_rated_movie(user_id):
    user_ratings = get_user_ratings(user_id)
    best_rated_movie = user_ratings.sort(by=['rating'], descending=True).row(0)[0]
    return best_rated_movie

  
print(get_best_rated_movie(10))
    


['__add__', '__annotations__', '__array__', '__bool__', '__class__', '__contains__', '__copy__', '__dataframe__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__new__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '_accessors', '_cast_all_from_to', '_comp', '_compare_to_non_df', '_compare_to_other_df', '_df', '_div', '_from_arrow', '_from_dict', '_from_dicts', '_from_numpy', '_from_pandas', '_from_pydf', '_from_records', '_ipython_key_completions_', '_pos_idx', '_pos_idxs', '_read_avro', '_read_csv', '_read_ipc', '_read_json', '_read_ndjson', '_read_parquet', '_repr_html_',

In [11]:
# Funktion zur Berechnung der fokussierten Genauigkeit
def calculate_focused_accuracy(user, recommended_movies, user_ratings):
    user_ratings_filtered = user_ratings.filter(user_ratings['user'] == user)
    user_ratings_dict = {row[0]: row[2] for row in user_ratings_filtered.iter_rows()}
    intersect_movies = set(recommended_movies).intersection(set(user_ratings_dict.keys()))
    if len(intersect_movies) == 0:
        return "Keiner der empfohlenen Filme wurde vom Nutzer bewertet"
    count_positive_ratings = sum(1 for movie in intersect_movies if user_ratings_dict[movie] > 4)
    return count_positive_ratings / len(intersect_movies)

# Initialisierung
unique_users = user_ratings['user'].unique()
total_accuracy = 0
count_users = 0

# Durch alle Benutzer iterieren
for user_id in unique_users:
    best_rated_movie = get_best_rated_movie(user_id)
    user_ratings_given = get_user_ratings(user_id)
    top_50_movies_for_movie = cosine_similarity.get_movie_cos_scores(best_rated_movie)
    focused_accuracy = calculate_focused_accuracy(user_id, top_50_movies_for_movie, user_ratings_given)
    
    if isinstance(focused_accuracy, float):
        total_accuracy += focused_accuracy
        count_users += 1

# Durchschnittliche Genauigkeit berechnen
if count_users > 0:
    average_accuracy = total_accuracy / count_users
    print("Durchschnittliche fokussierte Genauigkeit:", average_accuracy)
else:
    print("Keine auswertbaren Daten gefunden.")


def get_unique_users():
    return netflix_data.select("user").unique().to_list()

# Beispielanwendung
list_of_unique_users = get_unique_users()
mean_focused_accuracy = calculate_mean_focused_accuracy(list_of_unique_users, recommended_movies, netflix_data)
print("Mittlere fokussierte Genauigkeit:", mean_focused_accuracy)

NameError: name 'user_ratings' is not defined

In [18]:
def get_best_rated_movie(user_id):
    user_ratings = get_user_ratings(user_id)
    best_rated_movie = user_ratings.sort(by=['rating'], descending=True).row(0)[0]
    return best_rated_movie

In [29]:
def calculate_mean_focused_accuracy(list_of_users, netflix_data):
    sum_focused_accuracy = 0
    count_users_with_intersection = 0

    for user in list_of_users:
        user_ratings_given = get_user_ratings(user)
        
        # Finde den am besten bewerteten Film für den Benutzer
        best_rated_movie = get_best_rated_movie(user)
        
        # Erhalte Top-50-Filme basierend auf dem am besten bewerteten Film
        recommended_movies = cosine_similarity.get_movie_cos_scores(best_rated_movie, bonus_actor=0.08, bonus_same_director=0.02)
        
        # Berechne die fokussierte Genauigkeit
        focused_accuracy = calculate_focused_accuracy(user, recommended_movies, user_ratings_given)
        
        if focused_accuracy != "Keiner der empfohlenen Filme wurde vom Nutzer bewertet":
            sum_focused_accuracy += focused_accuracy
            count_users_with_intersection += 1
            
    if count_users_with_intersection == 0:
        return "Keiner der empfohlenen Filme wurde von den Benutzern bewertet"
    else:
        return sum_focused_accuracy / count_users_with_intersection
    


# Extrahiere eine Liste der einzigartigen Benutzer
list_of_unique_users = netflix_data['user'].unique().to_list()

# Berechne die durchschnittliche fokussierte Genauigkeit
mean_focused_accuracy = calculate_mean_focused_accuracy(list_of_unique_users, netflix_data)

print("Durchschnittliche fokussierte Genauigkeit:", mean_focused_accuracy)

KeyboardInterrupt: 

In [52]:
#Finde User mit den meisten Bewertungen
user_rating_counts = netflix_data.groupby("user").count().sort(by="count", descending=True)
user_rating_counts.head(5)

user,count
i64,u32
305344,17653
387418,17436
2439493,16565
1664010,15813
2118461,14831
