In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import implicit
import pickle
import os
from tqdm import tqdm

In [7]:
# Load the cleaned game catalogue and the train/test review splits, in that order.
games, train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/cleaned/games_cleaned.parquet',
        '../data/cleaned/train_recommendations.parquet',
        '../data/cleaned/test_recommendations.parquet',
    )
)

In [5]:
# Display the training reviews; the notebook renders a truncated head/tail view of the frame.
train_df

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,440,2,0,2010-10-15,True,622.8,11064475,16249835
1,7760,11,0,2010-10-15,True,37.7,11294447,29871669
2,41100,0,0,2010-10-15,True,15.7,7408951,28831996
3,2100,0,0,2010-10-15,True,2.7,9263211,18760261
4,6040,0,0,2010-10-15,True,1.0,8447888,40837424
...,...,...,...,...,...,...,...,...
37063354,1313140,0,0,2022-08-13,True,29.0,13087857,22692141
37063355,1139940,0,0,2022-08-13,True,63.3,3688502,31009780
37063356,1313140,0,0,2022-08-13,True,10.2,7381355,5130045
37063357,1313140,0,0,2022-08-13,True,14.6,9647003,22521477


In [8]:
# Display the game catalogue; the notebook renders a truncated head/tail view of the frame.
games

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,median_playtime,proportion_recommended,description,tags
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,12.9,0.845789,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,0.0,0.000000,,[Action]
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,6.9,0.908541,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,8.1,0.625998,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,23.4,0.885567,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50214,2455060,Taboo Trial,2023-07-31,True,False,False,Very Positive,94,494,12.00,0.00,0.0,True,0.0,0.000000,"In the rogue action game ""Taboo Trial"", you wi...","[RPG, Indie, Nudity, Action Roguelike, Female ..."
50215,1138640,Hometopia,2023-09-27,True,False,False,Mixed,61,248,17.00,0.00,0.0,True,0.0,0.000000,"Build better, together 🏡 Hometopia is a seriou...","[Early Access, Life Sim, City Builder, Immersi..."
50216,2515460,Northgard - Kernev Clan of the Stoat,2023-08-24,True,True,True,Mixed,67,80,5.00,0.00,0.0,True,0.0,0.000000,,"[Strategy, Indie, Simulation]"
50217,1687000,Fading Afternoon,2023-09-14,True,False,False,Mostly Positive,79,358,20.00,0.00,0.0,True,0.0,0.000000,Seiji Maruyama is a middle-aged yakuza recentl...,"[Side Scroller, Beat 'em up, Sandbox, Martial ..."


In [9]:
# Step 1: build the id <-> matrix-index mappings.
# Users come from the training reviews, items from the full game catalogue.
unique_users = train_df['user_id'].unique()
unique_items = games['app_id'].unique()

user_to_idx = {uid: idx for idx, uid in enumerate(unique_users)}
item_to_idx = {iid: idx for idx, iid in enumerate(unique_items)}
idx_to_user = {idx: uid for uid, idx in user_to_idx.items()}
idx_to_item = {idx: iid for iid, idx in item_to_idx.items()}

# Step 2: build the sparse user x item interaction matrix.
user_indices = train_df['user_id'].map(user_to_idx)
item_indices = train_df['app_id'].map(item_to_idx)
# Strength is log(1 + hours) for recommended reviews and 0 for non-recommended ones.
interaction_strength = train_df['is_recommended'] * np.log1p(train_df['hours'])

# A review whose app_id is absent from `games` maps to NaN, which is not a valid
# column index. Drop such rows explicitly instead of handing NaN to coo_matrix.
has_known_item = item_indices.notna()
if not has_known_item.all():
    print(f"Dropping {int((~has_known_item).sum())} interactions whose app_id is missing from games")
    user_indices = user_indices[has_known_item]
    item_indices = item_indices[has_known_item].astype(np.int64)
    interaction_strength = interaction_strength[has_known_item]

# Pass the shape explicitly: without it the number of columns is inferred from the
# largest item index that happens to occur in training, so catalogue items at the
# tail with no training review would fall outside the matrix and the model's item
# factors would not line up with idx_to_item.
interaction_matrix = coo_matrix(
    (
        interaction_strength.to_numpy(),
        (user_indices.to_numpy(), item_indices.to_numpy()),
    ),
    shape=(len(unique_users), len(unique_items)),
).tocsr()

# Step 3: train ALS.
# A fixed random_state makes the factor initialisation, and therefore the
# recommendations and metrics computed from this model, reproducible across runs.
model = implicit.als.AlternatingLeastSquares(
    factors=20, regularization=0.1, iterations=10, num_threads=8, random_state=42
)
model.fit(interaction_matrix)

  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
# First ten app_ids, in ascending order, that user 13087857 reviewed in the training set.
train_df.loc[train_df["user_id"] == 13087857, "app_id"].sort_values().head(10).values

array([    70,    220,    380,    400,    420,    620,   3590, 206440,
       219150, 246620])

In [11]:
import numpy as np

user_id = 13087857
row_id = user_to_idx[user_id]

# Densify only this user's row of the sparse matrix into a flat 1-D array.
user_interactions_non_zero = interaction_matrix.getrow(row_id).toarray().ravel()

# Column positions holding a non-zero interaction strength.
non_zero_interactions = np.flatnonzero(user_interactions_non_zero)

# Map the column positions back to app_ids through the inverse item mapping.
articoli_interagiti = list(map(idx_to_item.__getitem__, non_zero_interactions))

# Show the app_ids the user interacted with, sorted for easy comparison.
print("Articoli con cui l'utente ha interagito:", sorted(articoli_interagiti))


Articoli con cui l'utente ha interagito: [70, 220, 380, 400, 420, 620, 3590, 206440, 219150, 246620, 261570, 268910, 274170, 282140, 286690, 324160, 374320, 409720, 418370, 420530, 434650, 447530, 460950, 483980, 501300, 504230, 537110, 557340, 570940, 588650, 632470, 635320, 668630, 683320, 746850, 753640, 774171, 774181, 860510, 885810, 953490, 966330, 977880, 1057090, 1071870, 1091500, 1092790, 1093910, 1109570, 1122750, 1139900, 1150690, 1182620, 1195290, 1196590, 1230140, 1238840, 1252330, 1288310, 1313140, 1321230, 1325890, 1330470, 1361510, 1388880, 1426210, 1451940, 1483870, 1592670, 1703340, 1708870, 1710930, 1715460]


In [12]:
# Save the trained model and the interaction matrix it was fitted on.
# NOTE(review): the id <-> index mappings are not persisted here, so the pickled
# matrix/model cannot be translated back to user_ids/app_ids outside this session.

directory = '../models'

# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the race between a separate existence check and the creation.
os.makedirs(directory, exist_ok=True)

# Build both output paths from `directory` so changing it in one place is enough.
with open(os.path.join(directory, 'als_model.pkl'), 'wb') as f:
    pickle.dump(model, f)

with open(os.path.join(directory, 'interaction_matrix.pkl'), 'wb') as f:
    pickle.dump(interaction_matrix, f)

In [13]:
def get_recommendations_for_user(user_id, N=10, filter_already_recommended=True):
    """Return the ALS recommendations for one user as (app_id, score) pairs.

    Parameters
    ----------
    user_id : hashable
        User identifier; must be a key of the notebook-level `user_to_idx` mapping.
    N : int, default 10
        Number of items requested from the model.
    filter_already_recommended : bool, default True
        When True, items the user already has an entry for in `interaction_matrix`
        are excluded. When False, the model's own filtering is disabled as well,
        so already-seen items may appear in the result.

    Returns
    -------
    list of (app_id, score)
        Sorted by score in descending order.

    Raises
    ------
    ValueError
        If `user_id` is not present in `user_to_idx`.
    """
    # The user must be known to the mappings built at training time.
    if user_id not in user_to_idx:
        raise ValueError(f"User ID {user_id} non trovato nelle mappature.")

    user_idx = user_to_idx[user_id]

    # model.recommend returns two parallel arrays (item indices, scores), not a
    # list of pairs, so both branches must zip them. The library filters the
    # user's known items by default; forward the flag so that asking for
    # unfiltered results really disables that filtering.
    item_indices, scores = model.recommend(
        user_idx,
        interaction_matrix[user_idx],
        N=N,
        filter_already_liked_items=filter_already_recommended,
    )
    candidates = list(zip(item_indices, scores))

    if filter_already_recommended:
        # Defensive second pass: keep only items with a zero interaction strength.
        candidates = [
            (idx, score)
            for idx, score in candidates
            if interaction_matrix[user_idx, idx] == 0
        ]

    # Translate matrix columns back to app_ids and order by descending score.
    recommendations = [(idx_to_item[idx], score) for idx, score in candidates]
    return sorted(recommendations, key=lambda pair: pair[1], reverse=True)

# Smoke-test the recommender on a single user and print each (app_id, score) pair.
user_id = 12584260
recommended_items = get_recommendations_for_user(user_id, N=10)
for recommendation in recommended_items:
    item, score = recommendation
    print(f"Item ID: {item}, Score: {score:.4f}")

Item ID: 431960, Score: 0.7049
Item ID: 105600, Score: 0.6640
Item ID: 262060, Score: 0.5765
Item ID: 570, Score: 0.5652
Item ID: 242760, Score: 0.5574
Item ID: 1172470, Score: 0.5185
Item ID: 367520, Score: 0.5102
Item ID: 374320, Score: 0.5035
Item ID: 814380, Score: 0.4990
Item ID: 49520, Score: 0.4566


In [14]:
# Test-set reviews of user 12584260, ordered by app_id.
is_target_user = test_df["user_id"] == 12584260
test_df.loc[is_target_user].sort_values("app_id")

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
1374130,377160,0,0,2022-10-13,True,30.5,12584260,25429168
848360,602960,37,52,2022-09-21,True,3.9,12584260,25101228
852746,1091500,0,0,2022-09-21,True,46.0,12584260,22536432
852251,1313140,10,21,2022-09-21,True,0.6,12584260,23976633


In [16]:
import numpy as np

# Precision of a recommendation list for one user
def calculate_precision(user_id, recommended_items, test_df):
    """Fraction of the recommended items that the user positively reviewed in `test_df`.

    `recommended_items` is a sequence whose elements carry the app_id at position 0.
    Relevant items are the `app_id`s of the rows of `test_df` for `user_id` where
    `is_recommended == 1`. Returns 0 when nothing was recommended.
    """
    recommended_ids = [rec[0] for rec in recommended_items]

    is_positive_for_user = (test_df['user_id'] == user_id) & (test_df['is_recommended'] == 1)
    liked_ids = set(test_df.loc[is_positive_for_user, 'app_id'].tolist())

    if not recommended_ids:
        return 0

    # Count every recommended entry that is relevant (repeats count individually).
    hits = sum(1 for app_id in recommended_ids if app_id in liked_ids)
    return hits / len(recommended_ids)

# Recall of a recommendation list for one user
def calculate_recall(user_id, recommended_items, test_df):
    """Fraction of the user's positively reviewed test items that were recommended.

    `recommended_items` is a sequence whose elements carry the app_id at position 0.
    The denominator is the number of rows of `test_df` for `user_id` with
    `is_recommended == 1`. Returns 0 when the user has no such rows.
    """
    recommended_ids = [rec[0] for rec in recommended_items]

    is_positive_for_user = (test_df['user_id'] == user_id) & (test_df['is_recommended'] == 1)
    liked_series = test_df.loc[is_positive_for_user, 'app_id']
    n_relevant = len(liked_series)

    if n_relevant == 0:
        return 0

    liked_ids = set(liked_series.tolist())
    hits = sum(1 for app_id in recommended_ids if app_id in liked_ids)
    return hits / n_relevant

# F1 score of a recommendation list for one user
def calculate_f1_score(user_id, recommended_items, test_df):
    """Harmonic mean of calculate_precision and calculate_recall; 0 when both are 0."""
    p = calculate_precision(user_id, recommended_items, test_df)
    r = calculate_recall(user_id, recommended_items, test_df)

    # Guard against dividing by zero when there is no hit at all.
    return 0 if p + r == 0 else 2 * (p * r) / (p + r)

def calculate_hit_ratio(user_id, recommended_items, test_df, N=10):
    """Return 1 if any of the first N recommendations was positively reviewed, else 0.

    `recommended_items` is a sequence whose elements carry the app_id at position 0.
    Relevant items are the `app_id`s of the rows of `test_df` for `user_id` where
    `is_recommended == 1`.
    """
    is_positive_for_user = (test_df['user_id'] == user_id) & (test_df['is_recommended'] == 1)
    liked_ids = set(test_df.loc[is_positive_for_user, 'app_id'].tolist())

    # Only the first N recommendations are considered.
    top_n_ids = [rec[0] for rec in recommended_items][:N]

    for app_id in top_n_ids:
        if app_id in liked_ids:
            return 1
    return 0

user_id = 12584260  # example user_id that appears in the test set
recommended_items = get_recommendations_for_user(user_id)

# Evaluate the same recommendation list with each metric, in a fixed order.
precision, recall, f1, hit = (
    metric(user_id, recommended_items, test_df)
    for metric in (
        calculate_precision,
        calculate_recall,
        calculate_f1_score,
        calculate_hit_ratio,
    )
)

# Report the results
print(f"Precisione per l'utente {user_id}: {precision}")
print(f"Recall per l'utente {user_id}: {recall}")
print(f"F1-Score per l'utente {user_id}: {f1}")
print(f"Hit Ratio per l'utente {user_id}: {hit}")


Precisione per l'utente 12584260: 0.0
Recall per l'utente 12584260: 0.0
F1-Score per l'utente 12584260: 0
Hit Ratio per l'utente 12584260: 0


In [17]:
# Distinct user ids of each split, stored as sets for fast membership tests and set algebra.
train_users_set = {*train_df['user_id'].unique()}
test_users_set = {*test_df['user_id'].unique()}

In [18]:
# Test users that also appear in training (known) vs. those that appear only in test (cold start).
known_users = list(test_users_set.intersection(train_users_set))
cold_start_users = list(test_users_set.difference(train_users_set))

In [19]:
# Number of test-set rows per user_id
user_counts = test_df.groupby('user_id').size()

# Keep the known users that have at least 10 test rows, preserving the order of known_users
users_with_enough_rows = set(user_counts.index[user_counts >= 10])
known_users_filtered = [uid for uid in known_users if uid in users_with_enough_rows]

# Report how many users survive the filter
print(f"Utenti noti con almeno 10 righe nel test set: {len(known_users_filtered)}")


Utenti noti con almeno 10 righe nel test set: 15029


In [20]:
# Evaluate only a prefix of the eligible users to keep the run time manageable.
EVAL_SAMPLE_SIZE = 1000
eval_users = known_users_filtered[0:EVAL_SAMPLE_SIZE]

# Every metric function filters the frame it receives by user_id, so handing it the
# whole test set means several full scans per user. Restricting the frame once to
# the evaluated users yields the same per-user rows with far less work.
test_eval_df = test_df[test_df['user_id'].isin(eval_users)]

# One (user_id, precision, recall, f1, hit) tuple per evaluated user.
known_user_metrics = []
for user_id in tqdm(eval_users):
    recommended_items = get_recommendations_for_user(user_id)
    precision = calculate_precision(user_id, recommended_items, test_eval_df)
    recall = calculate_recall(user_id, recommended_items, test_eval_df)
    f1 = calculate_f1_score(user_id, recommended_items, test_eval_df)
    hit = calculate_hit_ratio(user_id, recommended_items, test_eval_df)
    known_user_metrics.append((user_id, precision, recall, f1, hit))

100%|██████████| 1000/1000 [01:24<00:00, 11.89it/s]


In [21]:
# Average each metric over the evaluated known users.
# Tuple layout: (user_id, precision, recall, f1, hit).
if len(known_user_metrics) > 0:
    avg_precision_known = np.mean([metric[1] for metric in known_user_metrics])
    avg_recall_known = np.mean([metric[2] for metric in known_user_metrics])
    avg_f1_known = np.mean([metric[3] for metric in known_user_metrics])
    avg_hit_known = np.mean([metric[4] for metric in known_user_metrics])
else:
    # Define all four averages, including the hit ratio, so the prints below
    # never hit an undefined name when no user was evaluated.
    avg_precision_known, avg_recall_known, avg_f1_known, avg_hit_known = 0, 0, 0, 0

# Report the averages for the known users
print("Metriche per gli utenti noti:")
print(f"Precisione media: {avg_precision_known:.4f}")
print(f"Recall medio: {avg_recall_known:.4f}")
print(f"F1-Score medio: {avg_f1_known:.4f}")
print(f"Hit Ratio medio: {avg_hit_known:.4f}")

Metriche per gli utenti noti:
Precisione media: 0.0482
Recall medio: 0.0354
F1-Score medio: 0.0387
Hit Ratio medio: 0.3240
