In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import CosineRecommender
from collections import defaultdict
from tqdm import tqdm
import os

# -----------------------
# 1. Cargar datos
# -----------------------
# Load the pair table and the two author/index lookup objects from disk.
# NOTE(review): pd.read_pickle and np.load(..., allow_pickle=True) unpickle
# arbitrary Python objects; only point them at files produced by a trusted step.
df_pairs_unique = pd.read_pickle("data/df_pairs_unique.pkl")
# .item() unwraps the single Python object stored in each .npy file
# (presumably a dict: it is used with Series.map and [] lookups below — confirm).
author2idx = np.load("data/author_to_idx.npy", allow_pickle=True).item()
idx2author = np.load("data/idx_to_author.npy", allow_pickle=True).item()
# Used further down as the side length of the square author-by-author matrix.
n_authors = len(author2idx)

print(f"Autores: {n_authors:,}")
print(f"Pares únicos: {len(df_pairs_unique):,}")

# -----------------------
# 2. Split LOO (Optimizado)
# -----------------------
def train_val_test_split(df, seed=42):
    """Split the rows of ``df`` into train / validation / test frames.

    Every row is registered, by position, under both its ``pair_min`` and its
    ``pair_max`` value. Authors are then visited in first-seen order and, for
    each one, only the rows not yet held out for an earlier author are
    considered:

    * 3 or more free rows: one random row goes to test and another to validation;
    * exactly 2 free rows: one random row goes to test;
    * fewer than 2: nothing is held out for that author.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns.
        seed: seed for ``np.random.default_rng``.

    Returns:
        ``(train, val, test)`` DataFrames, each with a fresh 0..n-1 index.
        ``train`` holds every row that was not held out.
    """
    generator = np.random.default_rng(seed)

    # author -> positions of the rows in which that author appears
    rows_by_author = defaultdict(list)
    for position, (first, second) in enumerate(zip(df['pair_min'], df['pair_max'])):
        rows_by_author[first].append(position)
        rows_by_author[second].append(position)

    held_out_test = set()
    held_out_val = set()
    taken = set()  # rows already held out on behalf of some author

    for positions in rows_by_author.values():
        free = [p for p in positions if p not in taken]
        n_free = len(free)
        if n_free < 2:
            continue
        if n_free == 2:
            picked = generator.choice(free)
            held_out_test.add(picked)
            taken.add(picked)
            continue
        # Three or more free rows: shuffle in place and take the first two.
        generator.shuffle(free)
        test_row, val_row = free[0], free[1]
        held_out_test.add(test_row)
        held_out_val.add(val_row)
        taken.update([test_row, val_row])

    held_out = list(held_out_test | held_out_val)
    train_positions = np.setdiff1d(np.arange(len(df)), held_out)

    train = df.iloc[train_positions].reset_index(drop=True)
    val = df.iloc[list(held_out_val)].reset_index(drop=True)
    test = df.iloc[list(held_out_test)].reset_index(drop=True)
    return train, val, test

# Split every pair once: df_train feeds the similarity matrix and df_val the
# ground truth used for the grid search below (df_test is not used in this cell).
df_train, df_val, df_test = train_val_test_split(df_pairs_unique)

# -----------------------
# 3. Construcción CSR Binaria (Corregida con Dtypes)
# -----------------------
def build_csr(df, author2idx, n):
    """Build a symmetric, binary author-by-author CSR matrix from pair rows.

    Each row of ``df`` contributes the two entries ``(u, v)`` and ``(v, u)``.
    Repeated pairs (and self-pairs) would be summed by the COO -> CSR
    conversion, so every stored value is reset to 1.0 to keep the matrix binary.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns of author ids.
        author2idx: mapping from author id to integer position in ``[0, n)``.
        n: side length of the square matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(n, n)`` with ``float32`` data.

    Raises:
        ValueError: if an author in ``df`` has no entry in ``author2idx``.
    """
    u_mapped = df['pair_min'].map(author2idx)
    v_mapped = df['pair_max'].map(author2idx)

    # Series.map yields NaN for unmapped authors; fail loudly here instead of
    # handing float/NaN indices to scipy.
    u_missing = u_mapped.isna().to_numpy()
    v_missing = v_mapped.isna().to_numpy()
    if u_missing.any() or v_missing.any():
        unknown = set(df['pair_min'].to_numpy()[u_missing]) | set(df['pair_max'].to_numpy()[v_missing])
        sample = sorted(map(str, unknown))[:5]
        raise ValueError(
            f"{len(unknown)} author(s) in df are missing from author2idx, e.g. {sample}"
        )

    u_indices = u_mapped.to_numpy(dtype=np.int64)
    v_indices = v_mapped.to_numpy(dtype=np.int64)

    # Undirected graph: store every connection in both directions.
    rows = np.concatenate([u_indices, v_indices])
    cols = np.concatenate([v_indices, u_indices])
    data = np.ones(len(rows), dtype=np.float32)

    matrix = csr_matrix((data, (rows, cols)), shape=(n, n))
    # Duplicate coordinates were summed during construction; restore 0/1 values.
    matrix.data[:] = 1.0
    return matrix

# Author-by-author matrix built from the training pairs only.
X_train = build_csr(df_train, author2idx, n_authors)

# -----------------------
# 4. Ground Truth
# -----------------------
def build_gt(df):
    """Collect, for every author in ``df``, the set of authors it is paired with.

    Each row contributes in both directions: ``pair_max`` is added to the set
    of ``pair_min`` and vice versa.

    Returns:
        ``defaultdict(set)`` keyed by author id.
    """
    partners = defaultdict(set)
    for left, right in zip(df['pair_min'], df['pair_max']):
        partners[left].add(right)
        partners[right].add(left)
    return partners

# Held-out partners per author, taken from the validation pairs.
gt_val = build_gt(df_val)

# -----------------------
# 5. Métricas
# -----------------------
def recall_ndcg_at_k(recs, gt, k=20):
    """Mean Recall@k and NDCG@k over the users listed in ``gt``.

    Args:
        recs: mapping user -> ranked sequence of recommended items. Users
            missing from ``recs`` are scored as having no recommendations.
        gt: mapping user -> collection of relevant items. Users whose
            collection is empty are skipped and do not count in the mean.
        k: cutoff applied to each ranked sequence.

    Returns:
        ``(recall, ndcg)`` averaged over the evaluated users, or
        ``(0.0, 0.0)`` when no user could be evaluated.
    """
    recall_total, ndcg_total, n_users = 0.0, 0.0, 0
    for user, relevant in gt.items():
        if not relevant:
            continue
        top_k = recs.get(user, [])[:k]
        hits = [1 if item in relevant else 0 for item in top_k]

        # Recall: fraction of the relevant items found in the top k.
        recall_total += sum(hits) / len(relevant)

        # NDCG: binary gains with a log2 discount, normalised by the best
        # DCG reachable with min(len(relevant), k) hits.
        dcg = sum(hit / np.log2(rank + 2) for rank, hit in enumerate(hits))
        idcg = sum(1 / np.log2(rank + 2) for rank in range(min(len(relevant), k)))
        ndcg_total += (dcg / idcg) if idcg > 0 else 0
        n_users += 1

    # Nothing to average (empty gt, or only empty relevance sets): avoid 0/0.
    if n_users == 0:
        return 0.0, 0.0
    return recall_total / n_users, ndcg_total / n_users

# -----------------------
# 6. Recomendación (Solución al TypeError)
# -----------------------
def get_recommendations(model, train_matrix, target_authors, author2idx, idx2author, topk=20):
    """Ask ``model`` for the top-``topk`` recommendations of each target author.

    Args:
        model: object exposing ``recommend(userid=, user_items=, N=,
            filter_already_liked_items=)`` that returns ``(ids, scores)``.
        train_matrix: sparse author-by-author matrix; row ``i`` is passed as
            the ``user_items`` of author index ``i``.
        target_authors: iterable of author ids to produce recommendations for.
        author2idx: mapping author id -> row index in ``train_matrix``.
        idx2author: mapping row index -> author id.
        topk: number of recommendations requested per author.

    Returns:
        dict mapping each target author to a list of recommended author ids,
        in the order returned by the model.
    """
    # Convert once, outside the loop: CSR layout with float32 values.
    train_matrix = train_matrix.tocsr().astype(np.float32)

    recs = {}
    for author in tqdm(target_authors, desc="Recomendando", leave=False):
        # Plain Python int, so the model never receives a NumPy scalar id
        # regardless of how author2idx was serialised.
        u_idx = int(author2idx[author])
        ids, _ = model.recommend(
            userid=u_idx,
            user_items=train_matrix[u_idx],
            N=topk,
            filter_already_liked_items=True,
        )
        recs[author] = [idx2author[x] for x in ids]
    return recs

# -----------------------
# 7. Grid Search
# -----------------------
# Evaluate on at most 20000 validation authors, drawn without replacement with
# a fixed seed, to bound the cost of each grid-search step.
sample_authors = list(gt_val.keys())
if len(sample_authors) > 20000:
    rng = np.random.default_rng(42)
    # NOTE(review): rng.choice goes through a NumPy array, so the sampled ids
    # presumably come back as NumPy scalars rather than the original objects.
    sample_authors = list(rng.choice(sample_authors, size=20000, replace=False))

# Ground truth restricted to the sampled authors.
gt_val_sample = {a: gt_val[a] for a in sample_authors}

# One (k, recall, ndcg) tuple per tested value of K.
results = []
ks_to_test = [20, 50, 100, 150, 200]

print("\nIniciando Grid Search...")
for k in ks_to_test:
    # K is the model hyperparameter under test; the metric cutoff stays at 20.
    model = CosineRecommender(K=k)
    # fit requires the matrix to be CSR
    model.fit(X_train, show_progress=False)

    recs = get_recommendations(model, X_train, sample_authors, author2idx, idx2author, topk=20)
    recall, ndcg = recall_ndcg_at_k(recs, gt_val_sample, k=20)

    results.append((k, recall, ndcg))
    print(f"K={k} | Recall@20: {recall:.4f} | NDCG@20: {ndcg:.4f}")

# -----------------------
# 8. Save
# -----------------------
# Persist the grid-search table in the current working directory.
df_results = pd.DataFrame(results, columns=["k", "recall@20", "ndcg@20"])
df_results.to_csv("knn_grid_results.csv", index=False)


In [None]:
# ============================================================
# SCRIPT ITEMKNN CON LOO CLÁSICO — Evaluación FINAL
# ============================================================

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, save_npz
from implicit.nearest_neighbours import CosineRecommender
from collections import defaultdict
from tqdm import tqdm
import os

# -----------------------
# 1. Cargar artefactos y el mejor K
# -----------------------
# Hard-coded hyperparameter for the final run.
# NOTE(review): presumably the best K from knn_grid_results.csv — confirm it
# still matches the latest grid search before re-running.
best_k = 200
# NOTE(review): pd.read_pickle and np.load(..., allow_pickle=True) unpickle
# arbitrary Python objects; only point them at files produced by a trusted step.
df_pairs_unique = pd.read_pickle("data/df_pairs_unique.pkl")
author2idx = np.load("data/author_to_idx.npy", allow_pickle=True).item()
idx2author = np.load("data/idx_to_author.npy", allow_pickle=True).item()
n_authors = len(author2idx)

print(f"Evaluación final con hiperparámetro k={best_k}")

# -----------------------
# 2. Keep authors eligible for classic LOO (>= 2 collaborations)
# -----------------------
# Count how many pair rows each author appears in, over both columns.
author_counts = df_pairs_unique[['pair_min','pair_max']].stack().value_counts()
eligible_authors = author_counts[author_counts >= 2].index

# Keep only the pairs whose two endpoints are both eligible.
# NOTE(review): the evaluation train split is later derived from this filtered
# frame, so pairs involving an author with a single collaboration never reach
# the evaluation X_train — confirm that is intended.
df_test_candidates = df_pairs_unique[
    df_pairs_unique['pair_min'].isin(eligible_authors) &
    df_pairs_unique['pair_max'].isin(eligible_authors)
]

# -----------------------
# 3. Split LOO clásico
# -----------------------
def train_test_split_LOO(df, seed=42):
    """Leave-one-out split: hold out one random row per author with >= 2 rows.

    Every row is registered, by position, under both its ``pair_min`` and its
    ``pair_max`` value. Authors are visited in first-seen order and each one
    that appears in at least two rows sends one of them, chosen at random, to
    the test set. Picks are independent per author, so two authors may pick
    the same row and a single author may end up with several rows held out.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns.
        seed: seed for ``np.random.default_rng``.

    Returns:
        ``(train, test)`` DataFrames, each with a fresh 0..n-1 index.
    """
    generator = np.random.default_rng(seed)

    # author -> positions of the rows in which that author appears
    rows_by_author = defaultdict(list)
    for position, (first, second) in enumerate(zip(df['pair_min'], df['pair_max'])):
        rows_by_author[first].append(position)
        rows_by_author[second].append(position)

    held_out = set()
    for positions in rows_by_author.values():
        if len(positions) >= 2:
            held_out.add(generator.choice(positions))

    held_out_positions = list(held_out)
    kept_positions = np.setdiff1d(np.arange(len(df)), held_out_positions)

    train = df.iloc[kept_positions].reset_index(drop=True)
    test = df.iloc[held_out_positions].reset_index(drop=True)
    return train, test

# The split runs on the filtered candidate pairs, not on the full pair table.
df_train, df_test = train_test_split_LOO(df_test_candidates)

print(f"Split final: {len(df_train)} train / {len(df_test)} test")

# -----------------------
# 4. Construcción CSR Binaria
# -----------------------
def build_csr(df, author2idx, n):
    """Build a symmetric, binary author-by-author CSR matrix from pair rows.

    Each row of ``df`` contributes the two entries ``(u, v)`` and ``(v, u)``.
    Repeated pairs (and self-pairs) would be summed by the COO -> CSR
    conversion, so every stored value is reset to 1.0 to keep the matrix binary.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns of author ids.
        author2idx: mapping from author id to integer position in ``[0, n)``.
        n: side length of the square matrix.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(n, n)`` with ``float32`` data.

    Raises:
        ValueError: if an author in ``df`` has no entry in ``author2idx``.
    """
    u_mapped = df['pair_min'].map(author2idx)
    v_mapped = df['pair_max'].map(author2idx)

    # Series.map yields NaN for unmapped authors; report them explicitly
    # rather than passing float/NaN indices on to scipy.
    u_missing = u_mapped.isna().to_numpy()
    v_missing = v_mapped.isna().to_numpy()
    if u_missing.any() or v_missing.any():
        unknown = set(df['pair_min'].to_numpy()[u_missing]) | set(df['pair_max'].to_numpy()[v_missing])
        sample = sorted(map(str, unknown))[:5]
        raise ValueError(
            f"{len(unknown)} author(s) in df are missing from author2idx, e.g. {sample}"
        )

    u_idx = u_mapped.to_numpy(dtype=np.int64)
    v_idx = v_mapped.to_numpy(dtype=np.int64)

    # Undirected graph: store every connection in both directions.
    rows = np.concatenate([u_idx, v_idx])
    cols = np.concatenate([v_idx, u_idx])
    data = np.ones(len(rows), dtype=np.float32)

    matrix = csr_matrix((data, (rows, cols)), shape=(n, n))
    # Duplicate coordinates were summed during construction; restore 0/1 values.
    matrix.data[:] = 1.0
    return matrix

# Author-by-author matrix built from the LOO training pairs only.
X_train = build_csr(df_train, author2idx, n_authors)

# -----------------------
# 5. Ground Truth Test
# -----------------------
def build_gt(df):
    """Map every author in ``df`` to the set of authors it shares a row with.

    The relation is stored symmetrically: for each row, ``pair_max`` joins the
    set of ``pair_min`` and ``pair_min`` joins the set of ``pair_max``.

    Returns:
        ``defaultdict(set)`` keyed by author id.
    """
    neighbours = defaultdict(set)
    firsts = df['pair_min']
    seconds = df['pair_max']
    for first, second in zip(firsts, seconds):
        neighbours[first].add(second)
        neighbours[second].add(first)
    return neighbours

# Held-out partners per author, taken from the test pairs.
gt_test = build_gt(df_test)

# -----------------------
# 6. Métricas Recall/NDCG
# -----------------------
def recall_ndcg_at_k(recs, gt, k=20):
    """Average Recall@k and NDCG@k over the users in ``gt``.

    Users with an empty relevance collection are ignored; users absent from
    ``recs`` are treated as having received no recommendations. Gains are
    binary and the ideal DCG assumes ``min(len(relevant), k)`` hits.

    Returns:
        ``(recall, ndcg)`` averaged over the evaluated users, or ``(0, 0)``
        when no user was evaluated.
    """
    recall_acc = 0.0
    ndcg_acc = 0.0
    evaluated = 0

    for user, relevant in gt.items():
        if relevant:
            top = recs.get(user, [])[:k]
            hits = [int(item in relevant) for item in top]

            recall_acc += sum(hits) / len(relevant)

            dcg = sum(hit / np.log2(pos + 2) for pos, hit in enumerate(hits))
            ideal_len = min(len(relevant), k)
            idcg = sum(1 / np.log2(pos + 2) for pos in range(ideal_len))
            if idcg > 0:
                ndcg_acc += dcg / idcg

            evaluated += 1

    if evaluated > 0:
        return (recall_acc / evaluated, ndcg_acc / evaluated)
    return (0, 0)

# -----------------------
# 7. Entrenamiento modelo final LOO clásico
# -----------------------
# Fit the evaluation model on the LOO training matrix (test pairs excluded).
model = CosineRecommender(K=best_k)
print("Entrenando modelo final LOO clásico...")
model.fit(X_train.astype(np.float32), show_progress=True)

# -----------------------
# 8. Generación de recomendaciones para test
# -----------------------
def get_final_recommendations(model, train_matrix, target_authors, author2idx, idx2author, topk=20):
    """Collect the model's top-``topk`` recommendations for each target author.

    Args:
        model: object exposing ``recommend(userid=, user_items=, N=,
            filter_already_liked_items=)`` that returns ``(ids, scores)``.
        train_matrix: sparse author-by-author matrix; row ``i`` is passed as
            the ``user_items`` of author index ``i``.
        target_authors: iterable of author ids to produce recommendations for.
        author2idx: mapping author id -> row index in ``train_matrix``.
        idx2author: mapping row index -> author id.
        topk: number of recommendations requested per author.

    Returns:
        dict mapping each target author to a list of recommended author ids,
        in the order returned by the model.
    """
    # Convert once up front: CSR layout with float32 values.
    csr_f32 = train_matrix.tocsr().astype(np.float32)

    def _recommended_authors(author):
        """Query the model for one author and translate indices back to ids."""
        row = author2idx[author]
        ids, _scores = model.recommend(
            userid=int(row),
            user_items=csr_f32[row],
            N=topk,
            filter_already_liked_items=True,
        )
        return [idx2author[i] for i in ids]

    progress = tqdm(target_authors, desc="Recomendando para Test")
    return {author: _recommended_authors(author) for author in progress}

# Evaluate on every author that has at least one held-out test pair.
target_authors = list(gt_test.keys())
recs = get_final_recommendations(model, X_train, target_authors, author2idx, idx2author)

recall, ndcg = recall_ndcg_at_k(recs, gt_test, k=20)
print("\n" + "="*40)
print(f"RESULTADOS FINALES LOO CLÁSICO (K={best_k})")
print(f"Muestra de evaluación: {len(target_authors):,} autores")
print(f"Recall@20 = {recall:.4f}")
print(f"NDCG@20  = {ndcg:.4f}")
print("="*40)

# -----------------------
# 9. Final training for production (all authors)
# -----------------------
# Refit the same `model` object on the full, unfiltered pair table; this
# replaces the fit used for the evaluation above.
X_train_full = build_csr(df_pairs_unique, author2idx, n_authors)
model.fit(X_train_full.astype(np.float32), show_progress=True)

# -----------------------
# 10. Save artifacts
# -----------------------
folder = "models_final"
os.makedirs(folder, exist_ok=True)

# Persist the fitted model together with the matrix it was fitted on.
model.save(f"{folder}/itemknn_model_full.npz")
save_npz(f"{folder}/X_train_final.npz", X_train_full.astype(np.float32))

print("¡Todo guardado correctamente!")
print(f"Archivos: {os.listdir(folder)}")
