In [None]:
# Install the required libraries
!pip install implicit -q

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, save_npz
from implicit.nearest_neighbours import CosineRecommender
from collections import defaultdict
from tqdm import tqdm
import os

# -----------------------
# 1. Load data and artefacts
# -----------------------
# Raw collaboration pairs plus the author <-> index lookup dictionaries.
# NOTE: read_pickle and allow_pickle=True both unpickle arbitrary objects,
# so these files must come from a trusted source.
df_pairs_unique = pd.read_pickle("data/df_pairs_unique.pkl")
author2idx = np.load("data/author_to_idx.npy", allow_pickle=True).item()
idx2author = np.load("data/idx_to_author.npy", allow_pickle=True).item()
n_authors = len(author2idx)

# Keep only authors that appear in at least two pairs so that one pair can be
# held out per author. The counts are taken on the unfiltered pairs.
author_counts = df_pairs_unique[['pair_min', 'pair_max']].stack().value_counts()
eligible_authors = set(author_counts[author_counts >= 2].index)

# A pair survives only when both of its endpoints are eligible.
min_is_eligible = df_pairs_unique['pair_min'].isin(eligible_authors)
max_is_eligible = df_pairs_unique['pair_max'].isin(eligible_authors)
df_filtered = df_pairs_unique[min_is_eligible & max_is_eligible].reset_index(drop=True)

print(f"Autores elegibles (>=2): {len(eligible_authors):,}")
print(f"Pares tras filtrado: {len(df_filtered):,}")

# -----------------------
# 2. Split LOO Triple (Train/Val/Test)
# -----------------------
def triple_loo_split(df, seed=42):
    """Split collaboration pairs into train / validation / test frames.

    Authors are visited in order of first appearance in ``df`` (``pair_min``
    before ``pair_max`` within a row). For each author only the incident
    pairs that have not already been held out are considered:

    * three or more such pairs: one random pair goes to test, another to
      validation;
    * exactly two: one random pair goes to test;
    * fewer than two: nothing is held out for that author.

    A held-out pair disappears from the pool of both of its endpoints. No
    check is made that every author keeps at least one training pair.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames, each with a fresh
        0..n-1 index. Validation and test rows follow set iteration order.
    """
    rng = np.random.default_rng(seed)

    # author -> positions of the rows (pairs) the author takes part in
    incident = defaultdict(list)
    for pos, (left, right) in enumerate(zip(df['pair_min'], df['pair_max'])):
        incident[left].append(pos)
        incident[right].append(pos)

    test_idx = set()
    val_idx = set()

    for positions in incident.values():
        # Pairs already claimed through another author are off limits.
        free = [p for p in positions if p not in test_idx and p not in val_idx]
        if len(free) < 2:
            continue
        if len(free) == 2:
            test_idx.add(rng.choice(free))
            continue
        rng.shuffle(free)
        test_idx.add(free[0])
        val_idx.add(free[1])

    held_out = test_idx | val_idx
    train_idx = np.setdiff1d(np.arange(len(df)), list(held_out))

    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[list(val_idx)].reset_index(drop=True)
    test_df = df.iloc[list(test_idx)].reset_index(drop=True)
    return (train_df, val_df, test_df)

# Build the three partitions once; the seed is spelled out so the split is
# visibly reproducible at the call site.
print("Generando splits...")
df_train, df_val, df_test = triple_loo_split(df_filtered, seed=42)

# -----------------------
# 3. Funciones de Métricas y Recomendación
# -----------------------
def build_csr(df, author2idx, n):
    """Build a symmetric author-by-author interaction matrix.

    Every row of ``df`` contributes two entries of weight 1.0, one for
    (pair_min, pair_max) and one for (pair_max, pair_min). Repeated pairs
    are not deduplicated here; scipy accumulates coincident entries.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns.
        author2idx: Mapping from author identifier to row/column index.
        n: Number of rows and columns of the resulting matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(n, n)`` and dtype float32.

    Raises:
        ValueError: If an author in ``df`` has no entry in ``author2idx``.
    """
    u_mapped = df['pair_min'].map(author2idx)
    v_mapped = df['pair_max'].map(author2idx)

    # Series.map yields NaN for unknown keys; fail early with the offending
    # authors instead of handing NaN indices to scipy.
    u_missing = u_mapped.isna()
    v_missing = v_mapped.isna()
    if u_missing.any() or v_missing.any():
        unknown = set(df.loc[u_missing, 'pair_min']) | set(df.loc[v_missing, 'pair_max'])
        preview = ", ".join(sorted(map(str, unknown))[:5])
        raise ValueError(
            f"{len(unknown)} author(s) missing from author2idx, e.g.: {preview}"
        )

    # Explicit integer dtype keeps the index arrays valid even for empty input.
    u_idx = u_mapped.to_numpy(dtype=np.int64)
    v_idx = v_mapped.to_numpy(dtype=np.int64)

    rows = np.concatenate([u_idx, v_idx])
    cols = np.concatenate([v_idx, u_idx])
    data = np.ones(len(rows), dtype=np.float32)
    return csr_matrix((data, (rows, cols)), shape=(n, n))

def build_gt(df):
    """Return the ground-truth neighbours of every author in ``df``.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns.

    Returns:
        ``defaultdict(set)`` mapping each author to the set of authors it is
        paired with; each pair is recorded in both directions.
    """
    neighbours = defaultdict(set)
    for left, right in zip(df['pair_min'], df['pair_max']):
        neighbours[left].add(right)
        neighbours[right].add(left)
    return neighbours

def recall_ndcg_at_k(recs, gt, k=20):
    """Average Recall@k and NDCG@k (binary relevance) over the users in ``gt``.

    Users whose relevant set is empty are skipped. Users without an entry in
    ``recs`` are scored as if they had received no recommendations.

    Args:
        recs: Mapping user -> ranked list of recommended items.
        gt: Mapping user -> set of relevant items.
        k: Cut-off applied to each ranked list.

    Returns:
        Tuple ``(mean_recall, mean_ndcg)``; ``(0, 0)`` when no user was scored.
    """
    recall_total = 0.0
    ndcg_total = 0.0
    n_users = 0

    for user, relevant in gt.items():
        if not relevant:
            continue

        top = recs.get(user, [])[:k]
        hit_positions = [pos for pos, item in enumerate(top) if item in relevant]

        # Recall is normalised by the full relevant set, not by k.
        recall_total += len(hit_positions) / len(relevant)

        # Only hits contribute to DCG; the ideal ranking places
        # min(|relevant|, k) hits at the top.
        dcg = sum(1 / np.log2(pos + 2) for pos in hit_positions)
        idcg = sum(1 / np.log2(pos + 2) for pos in range(min(len(relevant), k)))
        if idcg > 0:
            ndcg_total += dcg / idcg

        n_users += 1

    if n_users == 0:
        return (0, 0)
    return (recall_total / n_users, ndcg_total / n_users)

def calculate_coverage(recs, all_possible_items):
    """Fraction of the catalogue that appears in at least one recommendation.

    Only recommended items that belong to ``all_possible_items`` are counted,
    so the result always lies in [0, 1].

    Args:
        recs: Mapping user -> iterable of recommended items.
        all_possible_items: Iterable with the catalogue of recommendable items.

    Returns:
        Coverage as a float; ``0.0`` when the catalogue is empty.
    """
    catalog = set(all_possible_items)
    if not catalog:
        # Avoid ZeroDivisionError: nothing can be covered in an empty catalogue.
        return 0.0

    recommended_items = set().union(*recs.values())
    return len(recommended_items & catalog) / len(catalog)

def calculate_novelty(recs, train_df, k=20):
    """Mean self-information (in bits) of the top-k recommended items.

    Item popularity is the number of times an author appears in either column
    of ``train_df``; an item absent from training is treated as having been
    seen once. Each user's novelty is averaged over the items actually
    present in their top-k list, so short lists are not diluted by ``k``.

    Args:
        recs: Mapping user -> ranked list of recommended items.
        train_df: DataFrame with ``pair_min`` and ``pair_max`` columns.
        k: Cut-off applied to each ranked list.

    Returns:
        Average per-user novelty; ``0.0`` when there is nothing to score
        (no training interactions, or no user with a non-empty top-k list).
    """
    # Popularity from training interactions.
    item_counts = train_df[['pair_min', 'pair_max']].stack().value_counts().to_dict()
    total_interactions = sum(item_counts.values())
    if total_interactions == 0:
        # Popularity is undefined without training data.
        return 0.0

    nov_sum, count = 0.0, 0
    for ranked_list in recs.values():
        top = ranked_list[:k]
        if len(top) == 0:
            continue
        # Self-information: -log2(p(i))
        u_nov = sum(-np.log2(item_counts.get(it, 1) / total_interactions) for it in top)
        # Divide by the real list length: a user may receive fewer than k items.
        nov_sum += u_nov / len(top)
        count += 1
    return nov_sum / count if count > 0 else 0.0

def get_recommendations(model, train_matrix, target_authors, topk=20):
    """Query ``model`` for the top recommendations of each target author.

    Relies on the module-level ``author2idx`` / ``idx2author`` dictionaries to
    translate between author identifiers and matrix indices.

    Args:
        model: Fitted recommender exposing ``recommend(userid, user_items, N,
            filter_already_liked_items)``.
        train_matrix: Sparse interaction matrix; converted to float32 CSR.
        target_authors: Iterable of author identifiers to recommend for.
        topk: Number of items requested per author.

    Returns:
        Dict mapping each author to the list of recommended author
        identifiers, in the order returned by the model.
    """
    user_items = train_matrix.tocsr().astype(np.float32)
    recs = {}
    for author in tqdm(target_authors, desc="Recomendando", leave=False):
        row = author2idx[author]
        # Items the author already interacted with are excluded by the model.
        item_ids, _scores = model.recommend(
            userid=int(row),
            user_items=user_items[row],
            N=topk,
            filter_already_liked_items=True,
        )
        recs[author] = [idx2author[item] for item in item_ids]
    return recs

# -----------------------
# 4. Hyper-parameter tuning (Validation)
# -----------------------
X_train_v = build_csr(df_train, author2idx, n_authors)
gt_val = build_gt(df_val)
val_authors = list(gt_val.keys())

# Evaluate on a fixed random subset of at most 20,000 validation authors.
val_sample_size = min(20000, len(val_authors))
val_sample = list(
    np.random.default_rng(42).choice(val_authors, size=val_sample_size, replace=False)
)
# The ground truth of the sampled authors is the same for every K.
gt_val_sample = {a: gt_val[a] for a in val_sample}

best_k, best_ndcg = 0, -1
ks_to_test = [20, 50, 100, 150, 200]

print("\n--- TUNING: RECALL & NDCG ---")
for k in ks_to_test:
    model = CosineRecommender(K=k)
    model.fit(X_train_v, show_progress=False)
    recs = get_recommendations(model, X_train_v, val_sample)
    rec, ndcg = recall_ndcg_at_k(recs, gt_val_sample)
    print(f"K={k:3} | Recall@20: {rec:.4f} | NDCG@20: {ndcg:.4f}")
    # Strict comparison: on ties the earliest (smallest) K is kept.
    if ndcg > best_ndcg:
        best_ndcg, best_k = ndcg, k

# -----------------------
# 5. Final evaluation (Test)
# -----------------------
print(f"\n--- EVALUACIÓN FINAL (K={best_k}) ---")
# Validation pairs are folded back into training before scoring on test.
df_final_train = pd.concat([df_train, df_val], ignore_index=True)
X_final_train = build_csr(df_final_train, author2idx, n_authors)
gt_test = build_gt(df_test)
test_authors = list(gt_test)

final_model = CosineRecommender(K=best_k)
final_model.fit(X_final_train, show_progress=True)

# Recommendations for every author that has at least one held-out test pair.
recs_test = get_recommendations(final_model, X_final_train, test_authors)

# Final metrics; coverage is measured against the authors seen in training.
f_rec, f_ndcg = recall_ndcg_at_k(recs_test, gt_test)
train_catalog = set(df_final_train['pair_min']).union(df_final_train['pair_max'])
f_cov = calculate_coverage(recs_test, train_catalog)
f_nov = calculate_novelty(recs_test, df_final_train)

report_lines = [
    "\n" + "="*50,
    "RESULTADOS FINALES (LOO)",
    f"Muestra Test: {len(test_authors):,} autores",
    f"Recall@20:   {f_rec:.4f}",
    f"NDCG@20:     {f_ndcg:.4f}",
    f"Coverage:    {f_cov:.4f}",
    f"Novelty:     {f_nov:.4f}",
    "="*50,
]
print("\n".join(report_lines))

# -----------------------
# 6. Persist artefacts for production
# -----------------------
# Refit the selected model on every known pair (unfiltered) before exporting.
X_full = build_csr(df_pairs_unique, author2idx, n_authors)
final_model.fit(X_full, show_progress=True)

folder = "models_final"
os.makedirs(folder, exist_ok=True)

# The model and the matrix it was fitted on are stored side by side.
model_path = f"{folder}/itemknn_best.npz"
matrix_path = f"{folder}/X_full.npz"
final_model.save(model_path)
save_npz(matrix_path, X_full)
print(f"Proceso finalizado. Modelo guardado en {folder}/")