In [224]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
import random
from sklearn.linear_model import LinearRegression

# Step 1: Load and Explore the Dataset

In [225]:
# Load the user ratings table from the ml-latest-small dataset folder
df_ratings = pd.read_csv("ml-latest-small/ratings.csv")

In [226]:
# Column dtypes, non-null counts and memory footprint of the ratings table
df_ratings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [227]:
# Summary statistics for the numeric columns of the ratings table
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [228]:
# Preview the first rows of the ratings table
display(df_ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [229]:
# Bucket the numeric star rating into three ordinal labels:
#   rating <= 2      -> "Don't like"
#   2 < rating <= 4  -> "Like"
#   rating > 4       -> "Really like"
stars = df_ratings['rating']
label_masks = {
    "Don't like": stars <= 2,
    "Like": (stars > 2) & (stars <= 4),
    "Really like": stars > 4,
}
for label, mask in label_masks.items():
    df_ratings.loc[mask, 'rating_ordinal'] = label

In [230]:
# Distinct ordinal labels, in order of first appearance in the ratings table
ratings_ordinals = pd.unique(df_ratings['rating_ordinal'])
ratings_ordinals

array(['Like', 'Really like', "Don't like"], dtype=object)

In [231]:
# Preview again to check the newly added rating_ordinal column
display(df_ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp,rating_ordinal
0,1,1,4.0,964982703,Like
1,1,3,4.0,964981247,Like
2,1,6,4.0,964982224,Like
3,1,47,5.0,964983815,Really like
4,1,50,5.0,964982931,Really like


In [232]:
# Load the movies table from the same ml-latest-small dataset folder
df_movies = pd.read_csv("ml-latest-small/movies.csv")

In [233]:
# Column dtypes and non-null counts of the movies table
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [234]:
# User x movie matrix of ordinal labels: one row per userId, one column per
# movieId, NaN wherever the user has not rated the movie
df_user_movie_ratings = pd.pivot(
    df_ratings, index='userId', columns='movieId', values='rating_ordinal'
)
df_user_movie_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Like,,Like,,,Like,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,Like,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,Like,,,,,,Like,,,,...,,,,,,,,,,
607,Like,,,,,,,,,,...,,,,,,,,,,
608,Like,Don't like,Don't like,,,,,,,Like,...,,,,,,,,,,
609,Like,,,,,,,,,Like,...,,,,,,,,,,


In [235]:
# Every movieId that received at least one rating
all_films = pd.unique(df_ratings["movieId"])

In [236]:
# Flag every (user, movie) cell of the pivot that has no rating, reshape the
# flags to long format (one row per cell) and keep only the flagged cells
missing = (
    df_user_movie_ratings.isnull()
    .stack()
    .loc[lambda is_missing: is_missing]
    .reset_index()
)
missing.columns = ['userId', 'movieId', 'is_missing']

# Keep only the identifying columns: one row per rating still to be predicted
df_ratings_missing = missing.loc[:, ['userId', 'movieId']].copy()
df_ratings_missing


Unnamed: 0,userId,movieId
0,1,2
1,1,4
2,1,5
3,1,7
4,1,8
...,...,...
5830799,610,193581
5830800,610,193583
5830801,610,193585
5830802,610,193587


In [221]:
# Iterate over every missing (user, movie) rating and score each rating category
# with a naive-Bayes style product:
#   P(r_target = c) * prod_k P(r_k = user's rating of k | r_target = c)
# NOTE: this is still a Python-level loop over every missing cell, so expect a
# very long run time on the full frame.
for row in df_ratings_missing.itertuples():
    userId = row.userId
    movieId = row.movieId

    # Ratings the user has already given (index: movieId, value: ordinal label)
    seen_ratings = df_user_movie_ratings.loc[userId].dropna()
    films_seen = seen_ratings.index

    # These two filters do not depend on the category, so compute them once per row
    df_current_movie = df_ratings[df_ratings['movieId'] == movieId]
    df_seen_movies = df_ratings[df_ratings['movieId'].isin(films_seen)]

    for category in ratings_ordinals:
        # Prior: P(r_target = category)
        df_current_movie_category = df_current_movie[df_current_movie['rating_ordinal'] == category]
        users_that_voted_current_movies_with_this_category = df_current_movie_category['userId'].unique()

        p_r31 = len(df_current_movie_category) / len(df_current_movie)

        # Ratings of the user's seen films given by users who rated the target
        # movie with this category (the population we condition on)
        df_corated = df_seen_movies[
            df_seen_movies['userId'].isin(users_that_voted_current_movies_with_this_category)
        ]

        probs = [p_r31]
        for film_seen, assigned_rating in seen_ratings.items():
            # Likelihood: P(r_seen = assigned_rating | r_target = category)
            df_seen_movie_category = df_corated[df_corated['movieId'] == film_seen]

            # Of those co-raters, count who gave the seen film the same label as the user
            n_matching = (df_seen_movie_category['rating_ordinal'] == assigned_rating).sum()

            # If no one has voted the movie with the same category, we skip it
            # (this also covers the case of no co-raters at all)
            if n_matching == 0:
                continue

            # Fix: divide by the co-raters only, not by every rating of the seen film
            probs.append(n_matching / len(df_seen_movie_category))

        # Prior times all retained conditional factors
        prob = np.prod(probs)
        # (userId, movieId) pairs are unique and the frame was reset_index()'d,
        # so the row label addresses the same single row as a full boolean scan
        df_ratings_missing.loc[row.Index, category] = prob


    

KeyboardInterrupt: 

In [None]:
# Notebook flavour of tqdm (progress bars)
from tqdm.notebook import tqdm
# Registers `progress_apply` on pandas objects; it is used at the end of this cell
tqdm.pandas()

def compute_missing_prob(row, ratings=None, user_movie=None, categories=None):
    """Score every rating category for one unrated (user, movie) pair.

    For each category ``c`` the score is a naive-Bayes style product::

        P(r_target = c) * prod_k P(r_k = user's rating of k | r_target = c)

    where ``k`` runs over the films the user has already rated.

    * The prior is the share of the target movie's ratings that equal ``c``
      (0 when the target movie has no ratings at all).
    * Each conditional factor is estimated among the users who rated the
      target movie with ``c`` AND rated film ``k``: the fraction of them who
      gave ``k`` the same label as the current user.
    * A factor whose matching count is zero is skipped rather than multiplied
      in (no smoothing is applied); this also covers films with no co-raters.

    Parameters
    ----------
    row : mapping with keys ``'userId'`` and ``'movieId'``
        The missing (user, movie) cell to score.
    ratings : DataFrame, optional
        Long-format ratings with columns ``userId``, ``movieId`` and
        ``rating_ordinal``. Defaults to the notebook-level ``df_ratings``.
    user_movie : DataFrame, optional
        User x movie pivot of ordinal labels (NaN = unrated). Defaults to the
        notebook-level ``df_user_movie_ratings``.
    categories : iterable, optional
        Labels to score. Defaults to the notebook-level ``ratings_ordinals``.

    Returns
    -------
    pandas.Series
        One score per category, indexed by category label.
    """
    # Fall back to the notebook globals so `apply(compute_missing_prob, axis=1)` keeps working
    ratings = df_ratings if ratings is None else ratings
    user_movie = df_user_movie_ratings if user_movie is None else user_movie
    categories = ratings_ordinals if categories is None else categories

    userId = row['userId']
    movieId = row['movieId']

    # Ratings the user has already given (index: movieId, value: ordinal label)
    seen_ratings = user_movie.loc[userId].dropna()

    # Both filters are independent of the category, so compute them once per row
    df_current_movie = ratings[ratings['movieId'] == movieId]
    df_seen_movies = ratings[ratings['movieId'].isin(seen_ratings.index)]
    n_current = len(df_current_movie)

    result = {}
    for category in categories:
        # Prior: P(r_target = category), guarding against an unrated target movie
        df_current_movie_category = df_current_movie[df_current_movie['rating_ordinal'] == category]
        p_r31 = len(df_current_movie_category) / n_current if n_current else 0

        # Population we condition on: users who rated the target movie with `category`
        users_voted_current = df_current_movie_category['userId'].unique()
        df_corated = df_seen_movies[df_seen_movies['userId'].isin(users_voted_current)]

        probs = [p_r31]
        for film_seen, assigned_rating in seen_ratings.items():
            df_seen_movie_category = df_corated[df_corated['movieId'] == film_seen]
            n_matching = (df_seen_movie_category['rating_ordinal'] == assigned_rating).sum()

            # No supporting evidence (or no co-raters at all): skip this factor
            if n_matching == 0:
                continue

            # Fix: the denominator is the number of co-raters, not every rating of the seen film
            probs.append(n_matching / len(df_seen_movie_category))

        result[category] = np.prod(probs)

    return pd.Series(result)

# Score every missing (user, movie) row, then store one column per rating category
category_probs = df_ratings_missing.progress_apply(compute_missing_prob, axis=1)
df_ratings_missing[ratings_ordinals] = category_probs



  0%|          | 0/5830804 [00:00<?, ?it/s]