In [12]:
import polars as pl
import pandas as pd
import json
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

Create a recommendation system. Usage of surprise package. Using the k-NN Baseline algorithm

In [13]:
# small size for testing
db_dev_path = 'netflix_dev.db'
db_dev_conn = 'sqlite://' + db_dev_path

# full size for production
db_prod_path = 'netflix.db'
db_prod_conn = 'sqlite://' + db_prod_path

In [16]:
netflix_data = pl.read_database("SELECT * FROM netflix_data", db_dev_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)
# combined     = pl.read_database("SELECT * FROM netflix_data, movie_titles \
#                                   WHERE netflix_data.film = movie_titles.film", db_prod_path)

In [14]:
netflix_data = pl.read_database("SELECT * FROM netflix_data", db_dev_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)

In [18]:
# the number of ratings
len(netflix_data)

100000

In [33]:
# return the title for a given item_id (column film)
# i.e. get_title(16242) -> "Con Air"
def get_title(item_id):
    return movie_titles.filter(pl.col("film") == item_id)["title"].to_list()[0]

- most_rated hat die 100 most rated Filme
- best_rated hat die 100 best rated Filme
- not_rated Filme die kein rating haben
- rated hat Filme mit mind 1 rating

In [20]:
most_rated = pl.read_database("SELECT netflix_data.film, COUNT(*) AS 'num_ratings', AVG(netflix_data.rating) AS 'avg_rating' \
                               FROM netflix_data \
                               GROUP BY netflix_data.film \
                               ORDER BY num_ratings DESC \
                               LIMIT 100 \
                               ", db_prod_conn)

most_rated

film,num_ratings,avg_rating
i64,i64,f64
5317,232944,3.361267
15124,216596,3.724238
14313,200832,3.783854
15205,196397,3.442166
1905,193941,4.153908
6287,193295,3.905047
11283,181508,4.29991
16377,181426,4.306941
16242,178068,3.454411
12470,177556,3.41187


In [21]:
# remove movies with less than 100 ratings
# get the number of ratings for each movie
pre_ratings = netflix_data.groupby("film").count()

# keep only movies with at least 100 ratings
pre_ratings = pre_ratings.filter(pl.col("count") >= 200)

# join the dataframes
pre_ratings = netflix_data.join(pre_ratings, on="film", how="inner")

# bring the ratings into a format that surprise can work with
ratings = pre_ratings.drop("date").drop("count")
ratings

film,user,rating
i64,i64,i64
15205,2523958,4
5317,843821,4
15124,65908,4
5317,191646,3
15124,2255575,5
15124,465480,4
15205,142234,3
5317,2564388,3
15205,52203,3
15205,1091345,4


In [23]:
# average rating for each user
avg_rating_user = ratings.groupby("user").mean().sort(pl.col("rating")).drop("film")

# average rating for each movie
avg_rating_film = ratings.groupby("film").mean().sort(pl.col("rating")).drop("user")

# Count of ratings for each movie
film_rating_counts = ratings.groupby("film").agg(
    [
        pl.count("rating").alias("count")
    ]
)

# Count of ratings for each user
user_rating_counts = ratings.groupby("user").agg(
    [
        pl.count("rating").alias("count")
    ]
)

avg_rating_film = avg_rating_film.join(film_rating_counts, on="film")
avg_rating_user = avg_rating_user.join(user_rating_counts, on="user")

In [9]:
movie_data = pd.read_csv("data/movie_data.csv", sep="|")
movie_data

Unnamed: 0.1,Unnamed: 0,Title,Year,Response,Rated,Released,Runtime,Genre,Director,Writer,...,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website
0,0,Dinosaur Planet,2003–,True,Not Rated,14 Dec 2003,50 min,"Documentary, Animation, Family",,,...,"[{'Source': 'Internet Movie Database', 'Value'...",,7.7,531,tt0389605,series,,,,
1,1,Isle of Man TT 2004 Review,,False,,,,,,,...,,,,,,,,,,
2,2,Character,1997,True,R,27 Mar 1998,122 min,"Crime, Drama, Mystery",Mike van Diem,"Ferdinand Bordewijk, Laurens Geels, Mike van Diem",...,"[{'Source': 'Internet Movie Database', 'Value'...",,7.7,11037,tt0119448,movie,04 Feb 2003,"$623,983",,
3,3,Sick,2022,True,R,13 Jan 2023,83 min,"Horror, Thriller",John Hyams,"Kevin Williamson, Katelyn Crabb",...,"[{'Source': 'Internet Movie Database', 'Value'...",62.0,6.1,12303,tt14642626,movie,13 Jan 2023,,,
4,4,What the #$*! Do We Know!?,2017,True,,04 Oct 2017,97 min,Documentary,"Kip Andersen, Keegan Kuhn","Kip Andersen, Keegan Kuhn",...,"[{'Source': 'Internet Movie Database', 'Value'...",,7.2,29844,tt5541848,movie,16 Jun 2017,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9230,9230,Shakespeare in Love,1998,True,R,08 Jan 1999,123 min,"Comedy, Drama, History",John Madden,"Marc Norman, Tom Stoppard",...,"[{'Source': 'Internet Movie Database', 'Value'...",87.0,7.1,228514,tt0138097,movie,07 Dec 1999,"$100,317,794",,
9231,9231,Fidel Castro: American Experience,,False,,,,,,,...,,,,,,,,,,
9232,9232,Epoch,2001,True,PG-13,24 Nov 2001,96 min,"Sci-Fi, Thriller",Matt Codd,"Jonathan Raymond, Phillip J. Roth",...,"[{'Source': 'Internet Movie Database', 'Value'...",,4.8,2474,tt0233657,movie,25 Aug 2005,,,
9233,9233,The Company,2003,True,PG-13,07 May 2004,112 min,"Drama, Music, Romance",Robert Altman,"Neve Campbell, Barbara Turner",...,"[{'Source': 'Internet Movie Database', 'Value'...",73.0,6.2,6708,tt0335013,movie,01 Jun 2004,"$2,283,914",,


In [32]:
# Annahme: Der Netflix-Datensatz ist als DataFrame namens "netflix_data" verfügbar
# Überprüfe die tatsächlichen Spaltennamen in deinem DataFrame

# Erstelle einen Reader für den Surprise-Datensatz
reader = Reader(rating_scale=(1, 5))

# Lade den DataFrame in einen Surprise-Datensatz
data = Dataset.load_from_df(movie_data[['film', 'user', 'rating']], reader)

# Teile den Datensatz in Trainings- und Testdaten auf
trainset = data.build_full_trainset()
testset = trainset.build_testset()

KeyError: "None of [Index(['film', 'user', 'rating'], dtype='object')] are in the [columns]"

In [30]:
trainset = netflix_data.build_trainset()
testset = trainset.build_testset()

AttributeError: 'DataFrame' object has no attribute 'build_trainset'

In [None]:
algo.fit(trainset)

In [None]:
# Beispiel: Gib Empfehlungen für Benutzer mit der ID 42 aus
user_id = 42
predictions = algo.test(testset)
user_predictions = [pred for pred in predictions if pred[0] == user_id]
top_n = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:10]
for pred in top_n:
    print(pred.iid, pred.est)

In [29]:
# Erstelle den KNN-Algorithmus
algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})

# Trainiere den Algorithmus mit den Trainingsdaten
algo.fit(trainset)

# Lasse den Algorithmus Vorhersagen für die Testdaten machen
predictions = algo.test(testset)

# Zeige einige Vorhersagen an
for prediction in predictions[:5]:
    print(prediction)

NameError: name 'trainset' is not defined

In [4]:
import polars as pl
import pandas as pd
import json
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

# small size for testing
db_dev_path = 'netflix_dev.db'
db_dev_conn = 'sqlite://' + db_dev_path

# full size for production
db_prod_path = 'netflix.db'
db_prod_conn = 'sqlite://' + db_prod_path

netflix_data = pl.read_database("SELECT * FROM netflix_data", db_dev_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)

# the number of ratings
len(netflix_data)

# return the title for a given item_id (column film)
# i.e. get_title(16242) -> "Con Air"
def get_title(item_id):
    return movie_titles.filter(pl.col("film") == item_id)["title"].to_list()[0]

# remove movies with less than 100 ratings
# get the number of ratings for each movie
pre_ratings = netflix_data.groupby("film").count()

# keep only movies with at least 100 ratings
pre_ratings = pre_ratings.filter(pl.col("count") >= 200)

# join the dataframes
pre_ratings = netflix_data.join(pre_ratings, on="film", how="inner")

# bring the ratings into a format that surprise can work with
ratings = pre_ratings.drop("date").drop("count")
ratings

# average rating for each user
avg_rating_user = ratings.groupby("user").mean().sort(pl.col("rating")).drop("film")

# average rating for each movie
avg_rating_film = ratings.groupby("film").mean().sort(pl.col("rating")).drop("user")

# Count of ratings for each movie
film_rating_counts = ratings.groupby("film").agg(
    [
        pl.count("rating").alias("count")
    ]
)

# Count of ratings for each user
user_rating_counts = ratings.groupby("user").agg(
    [
        pl.count("rating").alias("count")
    ]
)

avg_rating_film = avg_rating_film.join(film_rating_counts, on="film")
avg_rating_user = avg_rating_user.join(user_rating_counts, on="user")

algo = KNNBasic()

# Annahme: Der Netflix-Datensatz ist als DataFrame namens "movie_data" verfügbar
# Überprüfe die tatsächlichen Spaltennamen in deinem DataFrame

# Erstelle einen Reader für den Surprise-Datensatz
reader = Reader(rating_scale=(1, 5))

# Lade den DataFrame in einen Surprise-Datensatz
data = Dataset.load_from_df(ratings[['film', 'user', 'rating']], reader)

# Teile den Datensatz in Trainings- und Testdaten auf
trainset = data.build_full_trainset()
testset = trainset.build_testset()


AttributeError: 'DataFrame' object has no attribute 'itertuples'

In [37]:
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1468e071060>

In [38]:
# Beispiel: Gib Empfehlungen für Benutzer mit der ID 42 aus
user_id = 42
predictions = algo.test(testset)
user_predictions = [pred for pred in predictions if pred[0] == user_id]
top_n = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:10]
for pred in top_n:
    print(pred.iid, pred.est)

In [39]:
# Erstelle den KNN-Algorithmus
algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})

# Trainiere den Algorithmus mit den Trainingsdaten
algo.fit(trainset)

# Lasse den Algorithmus Vorhersagen für die Testdaten machen
predictions = algo.test(testset)

# Zeige einige Vorhersagen an
for prediction in predictions[:5]:
    print(prediction)

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 15205      item: 2523958    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 142234     r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 52203      r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1091345    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1250371    r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}


In [2]:
import polars as pl
import pandas as pd
#import json
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

In [3]:
# small size for testing
db_dev_path = 'netflix_dev.db'
db_dev_conn = 'sqlite://' + db_dev_path

# full size for production
db_prod_path = 'netflix.db'
db_prod_conn = 'sqlite://' + db_prod_path

In [4]:
netflix_data = pl.read_database("SELECT * FROM netflix_data", db_dev_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)


In [5]:
# remove movies with less than 200 ratings
# get the number of ratings for each movie
pre_ratings = netflix_data.groupby("film").count()

# keep only movies with at least 200 ratings
pre_ratings = pre_ratings.filter(pl.col("count") >= 200)

# join the dataframes
pre_ratings = netflix_data.join(pre_ratings, on="film", how="inner")

# bring the ratings into a format that surprise can work with
ratings = pre_ratings.drop("date").drop("count")


In [6]:
# Convert the polars dataFrame to a pandas dataframe
ratings_pandas = ratings.to_pandas()

In [7]:
# create a reader for surprise dataset
reader = Reader(rating_scale=(1, 5))

# bring the dataframe to a surprise dataset
data = Dataset.load_from_df(ratings_pandas[['film', 'user', 'rating']], reader)


In [8]:
# create a train and a test data
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [9]:
# crete the KNN algorithm
algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})

# train the algorithm with train data
algo.fit(trainset)

# let the algorthm make some predictions 
predictions = algo.test(testset)

# show the predictions
for prediction in predictions[:5]:
    print(prediction)

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 15205      item: 2523958    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 142234     r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 52203      r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1091345    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1250371    r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}


- user:        Die ID des Benutzers, für den die Vorhersage gemacht wurde.
- item:     Die ID des Artikels (Films), für den die Vorhersage gemacht wurde.
- r_ui:     Der tatsächliche Wert (Rating) des Benutzers für den Artikel in den Testdaten.
- est:      Die vom Algorithmus vorhergesagte Bewertung für den Benutzer und den Artikel.
- actual_k:         Die Anzahl der tatsächlich verwendeten Nachbarn (K) für die Vorhersage.
- was_impossible:   Ein Flag, das angibt, ob die Vorhersage aufgrund von fehlenden Daten oder anderen Gründen unmöglich war.

In [25]:
# give predictions for user_id = 42 as an example
user_id = 387418
user_predictions = [pred for pred in predictions if pred.uid == str(user_id)]
top_n = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:10]
for pred in top_n:
    print(pred.iid, pred.est)

    


In [26]:
# change k
min_k = 2  # least amount to be considered
algo = KNNBasic(k=40, min_k=min_k, sim_options={'name': 'cosine', 'user_based': True})

k = 40  # number of neigbours


# train the algo
algo.fit(trainset)

# let the algo predict
predictions = algo.test(testset)

# show the predictions
for prediction in predictions[:5]:
    print(prediction)


Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 15205      item: 2523958    r_ui = 4.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 142234     r_ui = 3.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 52203      r_ui = 3.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 1091345    r_ui = 4.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 1250371    r_ui = 3.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}


In [21]:
# change k
min_k = 2  # least amount to be considered
sim_options = {"name": "pearson_baseline", "shrinkage": 0}  # no shrinkage
algo = KNNBasic(sim_options=sim_options)

k = 40  # number of neigbours


# train the algo
algo.fit(trainset)

# let the algo predict
predictions = algo.test(testset)

# show the predictions
for prediction in predictions[:5]:
    print(prediction)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
user: 15205      item: 2523958    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 142234     r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 52203      r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1091345    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1250371    r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}


Eigene ID bei Surprise. 

In [13]:
user_rating_counts = netflix_data.groupby("user").count().sort(by="count", descending=True)
user_rating_counts.head(5)

user,count
i64,u32
387418,22
1461435,17
305344,15
2118461,15
2439493,14


In [12]:
'''import polars as pl
import pandas as pd
#import json
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

# small size for testing
db_dev_path = 'netflix_dev.db'
db_dev_conn = 'sqlite://' + db_dev_path

# full size for production
db_prod_path = 'netflix.db'
db_prod_conn = 'sqlite://' + db_prod_path

netflix_data = pl.read_database("SELECT * FROM netflix_data", db_dev_conn)
movie_titles = pl.read_database("SELECT * FROM movie_titles", db_dev_conn)

# remove movies with less than 100 ratings
# get the number of ratings for each movie
pre_ratings = netflix_data.groupby("film").count()

# keep only movies with at least 100 ratings
pre_ratings = pre_ratings.filter(pl.col("count") >= 200)

# join the dataframes
pre_ratings = netflix_data.join(pre_ratings, on="film", how="inner")

# bring the ratings into a format that surprise can work with
ratings = pre_ratings.drop("date").drop("count")

# Convert the polars dataFrame to a pandas dataframe
ratings_pandas = ratings.to_pandas()

# create a reader for surprise dataset
reader = Reader(rating_scale=(1, 5))

# bring the dataframe to a surprise dataset
data = Dataset.load_from_df(ratings_pandas[['film', 'user', 'rating']], reader)

# create a train and a test data
trainset = data.build_full_trainset()
testset = trainset.build_testset()

# crete the KNN algorithm
algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})

# train the algorithm with train data
algo.fit(trainset)

# let the algorthm make some predictions 
predictions = algo.test(testset)

# show the predictions
for prediction in predictions[:5]:
    print(prediction)

# give predictions for user_id = 42 as an example
user_id = 42
user_predictions = [pred for pred in predictions if pred.uid == str(user_id)]
top_n = sorted(user_predictions, key=lambda x: x.est, reverse=True)[:10]
for pred in top_n:
    print(pred.iid, pred.est)


Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 15205      item: 2523958    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 142234     r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 52203      r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1091345    r_ui = 4.00   est = 4.00   {'actual_k': 1, 'was_impossible': False}
user: 15205      item: 1250371    r_ui = 3.00   est = 3.00   {'actual_k': 1, 'was_impossible': False}


In [13]:
'''# Erstelle den KNN-Algorithmus mit geänderter Anzahl der Nachbarn
k = 5  # Anzahl der Nachbarn
min_k = 2  # Mindestanzahl an Nachbarn für eine Vorhersage
algo = KNNBasic(k=k, min_k=min_k, sim_options={'name': 'cosine', 'user_based': True})

# Trainiere den Algorithmus mit den Trainingsdaten
algo.fit(trainset)

# Lasse den Algorithmus Vorhersagen für die Testdaten machen
predictions = algo.test(testset)

# Zeige einige Vorhersagen an
for prediction in predictions[:5]:
    print(prediction)


Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 15205      item: 2523958    r_ui = 4.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 142234     r_ui = 3.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 52203      r_ui = 3.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 1091345    r_ui = 4.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 15205      item: 1250371    r_ui = 3.00   est = 3.59   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
