In [126]:
import pandas as pd
import numpy as np
import altair as alt
import json
import tensorflow as tf
from transformers import pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances
from sklearn.impute import SimpleImputer


In [139]:
# Raw input tables. `games` and the wide flag tables (artists, designers,
# mechanics, themes, publishers) are joined on `BGGId` in the cells below.
games = pd.read_csv('../csv/games.csv')

artists = pd.read_csv('../csv/artists_reduced.csv')
designers = pd.read_csv('../csv/designers_reduced.csv')
publishers = pd.read_csv('../csv/publishers_reduced.csv')
mechanics = pd.read_csv('../csv/mechanics.csv')
themes = pd.read_csv('../csv/themes.csv')

# Rating tables; loaded here but not referenced by the cells visible below.
ratings = pd.read_csv('../csv/user_ratings.csv')
ratings_distr = pd.read_csv('../csv/ratings_distribution.csv')

In [77]:
# Reshape the wide one-column-per-publisher table into long
# (BGGId, Publisher) pairs, keeping only the pairs whose flag equals 1.
publisher_cols = [
    c for c in publishers.columns
    if c not in ['BGGId', 'Low-Exp Publisher']
]
df_pub = (
    publishers
    .melt(
        id_vars=["BGGId"],
        value_vars=publisher_cols,
        var_name="Publisher",
        value_name="flag",
    )
    .loc[lambda long_pairs: long_pairs["flag"] == 1]
    .drop(columns="flag")
    # Strip the "Publisher_" marker from the former column names.
    .assign(Publisher=lambda long_pairs: long_pairs["Publisher"].str.replace("Publisher_", ""))
)

In [78]:
# Pick ONE publisher per game: the one that publishes the most games in the
# dataset (ties broken alphabetically so the choice is deterministic).
#
# The count has to be taken per publisher across all games. Counting rows per
# (BGGId, Publisher) pair always yields 1 for a long table of unique pairs,
# which makes the "sort by count" step a no-op.
publisher_game_counts = df_pub.groupby("Publisher")["BGGId"].nunique()

df_pub_popular = (
    df_pub.drop_duplicates(["BGGId", "Publisher"])
    .assign(count=lambda pairs: pairs["Publisher"].map(publisher_game_counts))
    .sort_values(["BGGId", "count", "Publisher"], ascending=[True, False, True])
    .drop_duplicates("BGGId")            # keep the top-ranked publisher of each game
    .reset_index(drop=True)
)
# Resulting columns: BGGId, Publisher.
df_pub_popular = df_pub_popular.drop(columns="count")

In [79]:
# Wide artist flags -> long (BGGId, artist) rows, keeping only flags equal to 1.
non_artist_cols = ('BGGId', 'Low-Exp Artist')
cols_artistas = [col for col in artists.columns if col not in non_artist_cols]
df_artistas_long = (
    artists
    .melt(
        id_vars='BGGId',
        value_vars=cols_artistas,
        var_name='artist',
        value_name='present',
    )
    .loc[lambda long_rows: long_rows['present'] == 1]
    .drop(columns='present')
)

# Games present in `artists` that ended up with no (BGGId, artist) row at all.
todos_los_juegos = artists[['BGGId']].drop_duplicates()
juegos_con_artista = df_artistas_long['BGGId'].drop_duplicates()
tiene_artista = todos_los_juegos['BGGId'].isin(juegos_con_artista)
juegos_sin_artista = todos_los_juegos[~tiene_artista]

In [80]:
# Give every game without an artist a placeholder row, then stack those rows
# under the real (BGGId, artist) pairs and order the result by game id.
df_unknown = juegos_sin_artista.assign(artist="Unknown")
df_artistas_completo = (
    pd.concat([df_artistas_long, df_unknown], ignore_index=True)
    .sort_values(by='BGGId')
    .reset_index(drop=True)
)

In [81]:
# Order by game id, then collapse to one row per game: the first artist row
# encountered for each BGGId is the one that is kept.
df_artistas_completo = (
    df_artistas_completo
    .sort_values(by='BGGId')
    .reset_index(drop=True)
)
df_artists = (
    df_artistas_completo
    .groupby('BGGId')
    .first()
    .reset_index()
)

In [82]:
# Wide designer flags -> one designer per game (mirrors the artist cells above).
# Intermediate names are plain ASCII: the previous identifiers contained a
# mis-encoded character, which made them impossible to type or search for.
cols_designers = [col for col in designers.columns if col not in ['BGGId', 'Low-Exp Designer']]
df_long = designers.melt(
    id_vars='BGGId',
    value_vars=cols_designers,
    var_name='designer',
    value_name='present'
)
# Keep only the (BGGId, designer) pairs whose flag equals 1.
df_long = df_long[df_long['present'] == 1].drop(columns='present')

# Games listed in `designers` that have no designer flag set.
todos_los_juegos = designers[['BGGId']].drop_duplicates()
juegos_con_disenador = df_long['BGGId'].drop_duplicates()
juegos_sin_disenador = todos_los_juegos[~todos_los_juegos['BGGId'].isin(juegos_con_disenador)]

# Those games get a placeholder designer so every game keeps a row.
df_unknown = juegos_sin_disenador.copy()
df_unknown['designer'] = "Unknown"
df_designers_completo = pd.concat([df_long, df_unknown], ignore_index=True)

# One row per game: the first designer row encountered for each BGGId.
df_designer = df_designers_completo.groupby('BGGId', as_index=False).first()
df_designer = df_designer.sort_values(by='BGGId').reset_index(drop=True)


In [83]:
# Turn every "Rank:*" column of `games` into a 0/1 flag:
# 1 when a rank is present and below 3000, otherwise 0.
rank_cols = [c for c in games.columns if c.startswith("Rank:")]
for c in rank_cols:
    if games[c].dropna().isin([0, 1]).all():
        # Already converted (this cell was run before). Thresholding again
        # would turn every flag into 1 because both 0 and 1 are < 3000, so
        # keep the flags as they are and only normalise missing values/dtype.
        games[c] = games[c].fillna(0).astype("int64")
    else:
        # Comparisons against NaN are False, so missing ranks become 0.
        games[c] = (games[c].notna() & (games[c] < 3000)).astype("int64")

In [84]:
# Columns carried over from `games` into the feature table.
base_cols = [
    'BGGId', 'Name', 'Description', 'YearPublished',
    'AvgRating', 'BayesAvgRating',
    'MinPlayers', 'MaxPlayers',
    'ComAgeRec', 'LanguageEase', 'BestPlayers',
    'MfgPlaytime', 'MfgAgeRec',
    'Kickstarted', 'ImagePath',
]
# Ranking-category columns, kept in the same order as before.
final_rank_cols = [
    'Rank:boardgame', 'Rank:strategygames', 'Rank:abstracts',
    'Rank:familygames', 'Rank:thematic', 'Rank:cgs',
    'Rank:wargames', 'Rank:partygames', 'Rank:childrensgames',
]
final_data = games[base_cols + final_rank_cols]

In [86]:
# Left-join each per-game table onto the base frame, in this fixed order:
# artist, designer, mechanic flags, theme flags, publisher.
df_merged = final_data
for extra_table in (df_artists, df_designer, mechanics, themes, df_pub_popular):
    df_merged = df_merged.merge(extra_table, on='BGGId', how='left')

In [87]:
# Continuous features used for the numeric part of the similarity.
numeric_cols = [
    'YearPublished', 'AvgRating', 'BayesAvgRating',
    'MinPlayers', 'MaxPlayers', 'BestPlayers',
    'MfgPlaytime', 'ComAgeRec', 'MfgAgeRec', 'LanguageEase',
]

scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(df_merged[numeric_cols])

# Columns that must never be treated as 0/1 feature flags.
non_flag_cols = ['BGGId','Name','Description','ImagePath','artist','designer','Kickstarted']
excluded = set(numeric_cols + non_flag_cols)


def _is_flag_column(series):
    """Return True when every non-null value of ``series`` is 0 or 1."""
    return series.dropna().isin([0,1]).all()


# Every remaining column holding only 0/1 values becomes a binary feature;
# missing flags are treated as 0.
binary_cols = [
    col for col in df_merged.columns
    if col not in excluded and _is_flag_column(df_merged[col])
]
X_bin = df_merged[binary_cols].fillna(0).astype(int)


In [None]:
# Drop flags set on fewer than `min_freq` games before comparing games.
min_freq = 20
valid_cols = X_bin.columns[X_bin.sum(axis=0) >= min_freq]
X_bin_filtered = X_bin[valid_cols]

# Jaccard is a boolean metric: handing it a bool array avoids scikit-learn's
# implicit conversion (and its DataConversionWarning). The 0/1 flags map to
# False/True, so the distances are unchanged.
sim_bin = pairwise_distances(X_bin_filtered.to_numpy(dtype=bool), metric='jaccard')
# similarity = 1 - distance, computed in place so that no second
# games x games matrix has to be allocated.
np.subtract(1.0, sim_bin, out=sim_bin)




   Rank:boardgame  Rank:strategygames  Rank:abstracts  Rank:familygames  Rank:thematic  Rank:cgs  Rank:wargames  Rank:partygames  Rank:childrensgames  Alliances  Area Majority / Influence  Auction/Bidding  Dice Rolling  Hand Management  Simultaneous Action Selection  Trick-taking  Hexagon Grid  Once-Per-Game Abilities  Set Collection  Tile Placement  Action Points  Investment  Market  Square Grid  Stock Holding  Victory Points as a Resource  Enclosure  Pattern Building  Pattern Recognition  Modular Board  Network and Route Building  Point to Point Movement  Melding and Splaying  Negotiation  Trading  Push Your Luck  Income  Race  Random Production  Variable Set-up  Roll / Spin and Move  Variable Player Powers  Action Queue  Bias  Grid Movement  Lose a Turn  Programmed Movement  Scenario / Mission / Campaign Game  Voting  Events  Paper-and-Pencil  Player Elimination  Role Playing  Movement Points  Simulation  Variable Phase Order  Area Movement  Commodity Speculation  Cooperative Game  

In [None]:
# Numeric part: fill missing values with the column median, min-max scale,
# then turn cosine distance into cosine similarity.
imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler()
X_num_clean = imputer.fit_transform(df_merged[numeric_cols])
X_num_scaled = scaler.fit_transform(X_num_clean)
sim_num = 1 - pairwise_distances(X_num_scaled, metric='cosine')

# Final score: weighted blend of the flag-based and the numeric similarity.
weight_bin, weight_num = 0.7, 0.3
similarity_matrix = weight_bin * sim_bin + weight_num * sim_num

In [None]:
def get_similar_games(game_index, similarity_matrix, df, top_n=10):
    """Return the ``top_n`` games most similar to the game at ``game_index``.

    Parameters
    ----------
    game_index : int
        Positional index of the query game (row of ``similarity_matrix`` and
        ``df``).
    similarity_matrix : array-like
        Square matrix; ``similarity_matrix[i][j]`` is the similarity between
        the games at positions ``i`` and ``j``.
    df : pandas.DataFrame
        Game table with ``Name``, ``YearPublished`` and ``AvgRating`` columns,
        in the same row order as the matrix.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        ``Name``, ``YearPublished`` and ``AvgRating`` of the neighbours,
        most similar first. The query game itself is never included.
    """
    scores = np.asarray(similarity_matrix[game_index])
    # Normalise a negative index so it can be compared with argsort positions.
    self_pos = game_index if game_index >= 0 else game_index + len(scores)
    order = scores.argsort()[::-1]
    # Exclude the query game by position rather than assuming it sorts first:
    # when another game ties with it, dropping "the first element" can discard
    # a real neighbour and return the query game instead.
    neighbours = order[order != self_pos][:top_n]
    return df.iloc[neighbours][['Name', 'YearPublished', 'AvgRating']]

In [None]:
def get_top_similar(game_name, top_n=10, exclude_same_prefix=True):
    """Return the names and scores of the games most similar to ``game_name``.

    Reads the module-level ``df_merged`` and ``similarity_matrix``; row ``i``
    of the matrix is taken to describe the game at position ``i`` of
    ``df_merged``.

    Parameters
    ----------
    game_name : str
        Exact ``Name`` of the query game. If several rows share the name the
        first one is used.
    top_n : int, default 10
        Maximum number of neighbours to return.
    exclude_same_prefix : bool, default True
        Skip candidates whose name starts with ``game_name``.

    Returns
    -------
    (list, list)
        Neighbour names and their similarity scores, most similar first.

    Raises
    ------
    IndexError
        If no row of ``df_merged`` has that name.
    """
    if top_n <= 0:
        return [], []

    all_names = df_merged['Name'].to_numpy()
    # Locate the game by POSITION: the similarity matrix is positional, so an
    # index label would only be correct for a default 0..n-1 index.
    matches = np.flatnonzero((df_merged['Name'] == game_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no game named {game_name!r} in df_merged")
    idx = int(matches[0])

    scores = similarity_matrix[idx]
    names, sims = [], []
    for i in scores.argsort()[::-1]:
        # Skip the query game explicitly instead of dropping the first sorted
        # element, which is not guaranteed to be the query game on ties.
        if i == idx:
            continue
        candidate = all_names[i]
        if exclude_same_prefix and candidate.startswith(game_name):
            continue
        names.append(candidate)
        sims.append(scores[i])
        if len(names) >= top_n:
            break

    return names, sims


['Cascadia', 'Seikatsu', 'O Zoo le Mio', "Ankh'or", 'Aquaretto', 'Zooloretto', 'Sagani', 'Maharani', 'Scarabya', 'Azul: Summer Pavilion', 'Azul', 'Overboss: A Boss Monster Adventure', 'Ohanami', 'Lanterns: The Harvest Festival', 'FITS', 'Cathedral', 'Take it Easy!', 'Qwirkle', 'Kerala: The Way of the Elephant', 'Hive']
[np.float64(0.6993901196773793), np.float64(0.6720519846997945), np.float64(0.6443592269268885), np.float64(0.6150672123444176), np.float64(0.6139527860888556), np.float64(0.6136181168415833), np.float64(0.613282967157851), np.float64(0.6103694905807443), np.float64(0.6090759926802387), np.float64(0.5986906273733623), np.float64(0.5981822525445578), np.float64(0.5948839837523181), np.float64(0.5887408940086403), np.float64(0.5875623967550945), np.float64(0.5867598184248921), np.float64(0.5864650631199954), np.float64(0.586370639093668), np.float64(0.5858778060339984), np.float64(0.5855935820499264), np.float64(0.5789526658787686)]


CREAR LOS JSON FILES


In [None]:
def precalc_neighbors(df, similarity_matrix, top_n=300):
    """Precompute the ``top_n`` most similar games for every game.

    Parameters
    ----------
    df : pandas.DataFrame
        Game table with a ``Name`` column, in the same row order as
        ``similarity_matrix``.
    similarity_matrix : numpy.ndarray
        Square matrix; row ``i`` holds the similarity of game ``i`` to every
        game.
    top_n : int, default 300
        Maximum number of neighbours stored per game. Fewer are stored when
        the table has ``top_n`` games or less.

    Returns
    -------
    dict
        ``{game name: [{"name": neighbour name, "score": float}, ...]}`` with
        neighbours ordered from most to least similar. Games sharing a name
        share a key, so the last one processed wins.
    """
    names = df["Name"].to_numpy()
    n_games = len(df)
    # One extra candidate because the game itself is filtered out afterwards.
    # Capped at the number of games: np.argpartition raises ValueError when
    # asked for more elements than the row contains.
    n_candidates = min(top_n + 1, n_games)

    results = {}
    for idx in range(n_games):
        scores = similarity_matrix[idx]
        # Partial selection of the best candidates, then a full sort of just
        # those candidates (cheaper than sorting the whole row).
        top_idx = np.argpartition(scores, -n_candidates)[-n_candidates:]
        top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
        top_idx = [i for i in top_idx if i != idx][:top_n]
        results[names[idx]] = [
            {"name": names[i], "score": float(scores[i])} for i in top_idx
        ]
    return results

def build_game_info(df, mechanics, themes):
    """Build the per-game metadata mapping that is written to ``games.json``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game. Must contain ``BGGId``, ``Name``, ``ImagePath``,
        ``MinPlayers``, ``MaxPlayers``, ``MfgPlaytime``, ``MfgAgeRec``,
        ``YearPublished``, ``AvgRating``, ``designer`` and ``Publisher``.
        Every column whose name starts with ``"Rank:"`` is a ranking
        category, given either as a raw rank or as an existing 0/1 flag.
    mechanics, themes : pandas.DataFrame
        Wide tables with a ``BGGId`` column plus one flag column per tag;
        a tag applies to a game when its value equals 1. If a ``BGGId``
        appears more than once, the first row is used.

    Returns
    -------
    dict
        ``{game name: {...}}`` with the keys ``BGGId``, ``image``,
        ``minPlayers``, ``maxPlayers``, ``playtime``, ``age``, ``year``,
        ``avgRating``, ``designer``, ``publisher``, ``mechanics``,
        ``categories`` and ``rankCategories``. Games sharing a name share a
        key, so the last one processed wins.

    Notes
    -----
    ``df`` is not modified. Rank columns that already hold only 0/1 values
    are used as they are; thresholding them again would mark every game as
    ranked in every category, because both 0 and 1 are below 3000.
    """
    mechanic_cols = [c for c in mechanics.columns if c != "BGGId"]
    theme_cols = [c for c in themes.columns if c != "BGGId"]
    rank_cols = [c for c in df.columns if c.startswith("Rank:")]

    def rank_flags(column):
        """Return a 0/1 array: 1 where a rank is present and below 3000."""
        if column.dropna().isin([0, 1]).all():
            # Already a flag column: keep it, treating missing values as 0.
            return column.fillna(0).astype(int).to_numpy()
        # Comparisons against NaN are False, so missing ranks become 0.
        return (column.notna() & (column < 3000)).astype(int).to_numpy()

    def tags_by_game(flags, cols):
        """Map each BGGId to the list of columns in ``cols`` whose value is 1."""
        first_rows = flags.drop_duplicates("BGGId")
        ids = first_rows["BGGId"].to_numpy()
        values = first_rows[cols].to_numpy()
        return {
            game_id: [c for c, v in zip(cols, row_values) if v == 1]
            for game_id, row_values in zip(ids, values)
        }

    # Built once up front instead of scanning both tables for every game.
    mechanics_by_game = tags_by_game(mechanics, mechanic_cols)
    themes_by_game = tags_by_game(themes, theme_cols)
    flags_by_rank = {c: rank_flags(df[c]) for c in rank_cols}
    # Local, filled copy of the image paths; the caller's frame stays untouched.
    images = df["ImagePath"].fillna("../error.jpg").to_numpy()

    scalar_cols = [
        "BGGId", "Name", "MinPlayers", "MaxPlayers", "MfgPlaytime",
        "MfgAgeRec", "YearPublished", "AvgRating", "designer", "Publisher",
    ]
    results = {}
    for pos, (_, row) in enumerate(df[scalar_cols].iterrows()):
        bggid = row["BGGId"]
        results[row["Name"]] = {
            "BGGId": int(bggid),
            "image": images[pos],
            "minPlayers": int(row["MinPlayers"]),
            "maxPlayers": int(row["MaxPlayers"]),
            "playtime": int(row["MfgPlaytime"]),
            "age": int(row["MfgAgeRec"]),
            "year": int(row["YearPublished"]),
            "avgRating": float(row["AvgRating"]),
            "designer": str(row["designer"]) if pd.notnull(row["designer"]) else "",
            "publisher": str(row["Publisher"]) if pd.notnull(row["Publisher"]) else "",
            # Games missing from a tag table simply get no tags.
            "mechanics": mechanics_by_game.get(bggid, []),
            "categories": themes_by_game.get(bggid, []),
            "rankCategories": [
                c.replace("Rank:", "") for c in rank_cols if flags_by_rank[c][pos] == 1
            ],
        }
    return results

In [None]:
# Map each game's BGGId (as a string key) to its description; games without
# a description get an empty string.
descriptions_dict = {
    str(bggid): (text if pd.notna(text) else "")
    for bggid, text in zip(df_merged["BGGId"], df_merged["Description"])
}
with open("descriptions.json", "w", encoding="utf-8") as f:
    json.dump(descriptions_dict, f, ensure_ascii=False, indent=2)

Archivo descriptions.json generado con 21925 entradas.


In [138]:
# Compute the payload BEFORE opening the file: mode "w" truncates the file as
# soon as it is opened, so a failure during the computation would otherwise
# leave an empty neighbors.json behind.
neighbors_payload = precalc_neighbors(df_merged, similarity_matrix)
with open("neighbors.json", "w", encoding="utf-8") as f:
    json.dump(neighbors_payload, f, ensure_ascii=False, indent=2)


In [103]:

# Compute the payload BEFORE opening the file: mode "w" truncates the file as
# soon as it is opened, so a failure while building the game info would
# otherwise leave an empty games.json behind.
games_payload = build_game_info(df_merged, mechanics, themes)
with open("games.json", "w", encoding="utf-8") as f:
    json.dump(games_payload, f, ensure_ascii=False, indent=2)