# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from collections import defaultdict

# -----------------------------
# Load dataset
# -----------------------------
# Each line of the input file is parsed as one JSON record (lines=True).
df = pd.read_json('./combined_df.json', lines=True)

# The vectorizer consumes text, so flatten every game's genre list into a
# single space-separated string.
df['genre_str'] = df['genre'].apply(' '.join)

# -----------------------------
# Build TF-IDF + Cosine Similarity Matrix
# -----------------------------
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre_str'])

# TfidfVectorizer L2-normalises its rows by default, so the plain dot
# product returned by linear_kernel is the cosine similarity between games.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

# -----------------------------
# Recommendation Function
# -----------------------------
def get_recommendations(title, cosine_sim=cosine_sim, k=10):
    """Return the titles of the ``k`` games most similar to ``title``.

    Reads the module-level ``df`` for the ``'game'`` column. Row ``i`` of
    ``cosine_sim`` is expected to hold the similarities of the game in row
    position ``i`` of ``df`` to every other game.

    Args:
        title: Exact value to look up in ``df['game']``. If several rows
            carry the same title, the first one is used.
        cosine_sim: Pairwise similarity matrix; defaults to the module-level
            matrix that exists when this function is defined.
        k: Maximum number of recommendations to return.

    Returns:
        A list of at most ``k`` game titles, most similar first. Games with
        equal similarity keep their row order in ``df``.

    Raises:
        IndexError: If ``title`` does not occur in ``df['game']``.
    """
    titles = df['game'].tolist()
    try:
        # Row *position*, not index label, so the lookup stays aligned with
        # cosine_sim even when df does not carry a default RangeIndex.
        idx = titles.index(title)
    except ValueError:
        raise IndexError(f"game title not found in dataset: {title!r}") from None

    # Drop the query game by position instead of assuming it sorts first:
    # games with identical genres tie at the same score, and the stable sort
    # would place any earlier tied row ahead of the query game.
    candidates = (pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx)
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    # max() keeps a non-positive k from turning into a from-the-end slice.
    return [titles[position] for position, _ in ranked[:max(k, 0)]]

# -----------------------------
# Generate Proxy Ground Truth from Genre Overlap
# -----------------------------
def build_genre_based_ground_truth(df):
    """Build a proxy relevance mapping from shared genres.

    Two games count as relevant to each other when their ``'genre'``
    iterables have at least one element in common.

    Args:
        df: DataFrame with a ``'game'`` column (titles) and a ``'genre'``
            column whose cells are iterables of genre names.

    Returns:
        A dict mapping each game title to a list (in arbitrary order) of the
        other titles that share at least one genre with it. The game itself
        is never included in its own list.
    """
    rows = list(zip(df['game'], df['genre']))

    # Pass 1: inverted index, genre -> every title tagged with it.
    games_by_genre = defaultdict(set)
    for game, genres in rows:
        for genre in genres:
            games_by_genre[genre].add(game)

    # Pass 2: for every game, pool the titles found under each of its genres.
    ground_truth = {}
    for game, genres in rows:
        peers = set()
        for genre in genres:
            peers.update(games_by_genre[genre])
        peers.discard(game)  # a game is not its own recommendation
        ground_truth[game] = list(peers)

    return ground_truth

# -----------------------------
# Precision@K and Recall@K
# -----------------------------
def precision_recall_at_k(df, cosine_sim, ground_truth, k=10):
    """Compute mean Precision@k and Recall@k of the similarity ranking.

    For every title in ``ground_truth`` that also occurs in ``df['game']``,
    the ``k`` most similar other games are taken from ``cosine_sim`` and
    compared with that title's relevant games.

    Args:
        df: DataFrame with a ``'game'`` column. Row position ``i`` is
            expected to correspond to row ``i`` of ``cosine_sim``.
        cosine_sim: Pairwise similarity matrix, indexable by row position.
        ground_truth: Mapping of game title to an iterable of relevant
            titles. Titles missing from ``df`` are skipped.
        k: Number of recommendations evaluated per game; must be positive.

    Returns:
        A ``(avg_precision, avg_recall)`` tuple averaged over the evaluated
        titles. Precision divides by ``k`` even when fewer than ``k``
        candidates exist. Recall is 0 for a title with no relevant games.
        ``(0.0, 0.0)`` is returned when no title could be evaluated.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    titles = df['game'].tolist()

    # Title -> first row position, built once so the loop below does not
    # rescan the DataFrame for every ground-truth entry. setdefault keeps
    # the first occurrence when a title appears on several rows.
    position_of = {}
    for position, name in enumerate(titles):
        position_of.setdefault(name, position)

    precision_scores = []
    recall_scores = []

    for title, relevant_games in ground_truth.items():
        idx = position_of.get(title)
        if idx is None:
            continue

        # Remove the query game by position rather than slicing off the
        # first sorted entry: games with identical genres tie at the same
        # score, so the query game is not guaranteed to sort first.
        candidates = (
            pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx
        )
        ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

        recommended_set = {titles[position] for position, _ in ranked[:k]}
        relevant_set = set(relevant_games)

        true_positives = len(recommended_set & relevant_set)
        precision_scores.append(true_positives / k)
        recall_scores.append(
            true_positives / len(relevant_set) if relevant_set else 0.0
        )

    # Nothing to average (empty ground truth or no overlap with df).
    if not precision_scores:
        return 0.0, 0.0

    avg_precision = sum(precision_scores) / len(precision_scores)
    avg_recall = sum(recall_scores) / len(recall_scores)

    return avg_precision, avg_recall

# -----------------------------
# Run Everything
# -----------------------------
k = 30  # cutoff shared by both proxy metrics below
ground_truth = build_genre_based_ground_truth(df)
avg_precision, avg_recall = precision_recall_at_k(
    df=df,
    cosine_sim=cosine_sim,
    ground_truth=ground_truth,
    k=k,
)

# "Proxy" because relevance comes from genre overlap, not from user data.
print(f"Proxy Precision@{k}: {avg_precision:.4f}")
print(f"Proxy Recall@{k}: {avg_recall:.4f}")

# -----------------------------
# Example Recommendation
# -----------------------------
print("\nExample Recommendations for 'Hollow Knight':")
print(get_recommendations(title='Hollow Knight', cosine_sim=cosine_sim, k=10))
