In [None]:
# 1. Instalación de dependencias necesarias en Colab
!pip install implicit -q

In [None]:
import os
import time
import gc
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import torch
from collections import defaultdict
from tqdm import tqdm

# Mitigate CUDA memory fragmentation: configure PyTorch's CUDA allocator
# to use expandable segments via the PYTORCH_CUDA_ALLOC_CONF variable.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ---------------------------------------------------------------
# 1. PREPARACIÓN Y FILTRADO
# ---------------------------------------------------------------
def preprocess_data(df_pairs):
    """
    Keep only the pairs whose two endpoints each appear in at least two rows.

    Appearances are counted over both the ``pair_min`` and ``pair_max``
    columns of ``df_pairs``. The filter is applied once (it is not iterated),
    so an author kept here may end up with fewer than two rows in the result.

    Args:
        df_pairs: DataFrame with ``pair_min`` and ``pair_max`` columns.

    Returns:
        A copy of the rows of ``df_pairs`` where both endpoints are eligible.
    """
    print("üîç Filtrando autores con al menos 2 colaboraciones...")

    lower, upper = df_pairs['pair_min'], df_pairs['pair_max']

    # Number of rows each author takes part in, regardless of the column.
    appearances = pd.concat([lower, upper]).value_counts()
    eligible_authors = appearances.loc[appearances >= 2].index

    both_eligible = lower.isin(eligible_authors) & upper.isin(eligible_authors)
    df_filtered = df_pairs.loc[both_eligible].copy()

    print(f"‚úÖ Autores elegibles: {len(eligible_authors):,}")
    print(f"‚úÖ Interacciones resultantes: {len(df_filtered):,}")
    return df_filtered

# ---------------------------------------------------------------
# 2. SPLIT LOO TRIPLE
# ---------------------------------------------------------------
def triple_loo_split(df, seed=42):
    """
    Split the rows of ``df`` into train / validation / test with a
    leave-one-out scheme driven by each author's rows.

    Authors are visited in order of first appearance. For each author, the
    rows not yet held out are considered:
      * three or more rows: one random row goes to test and another to
        validation;
      * exactly two rows: one random row goes to test;
      * fewer than two rows: nothing is held out.
    Every row that was not held out forms the training set.

    Args:
        df: DataFrame with ``pair_min`` and ``pair_max`` columns.
        seed: Seed for the NumPy random generator used for the picks.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of positional slices of ``df``.
    """
    print("‚úÇÔ∏è Generando Split LOO (Train/Val/Test)...")
    rng = np.random.default_rng(seed)

    # author -> positional row numbers of every pair the author belongs to
    adj = defaultdict(list)
    for pos, row in enumerate(df.itertuples()):
        for endpoint in (row.pair_min, row.pair_max):
            adj[endpoint].append(pos)

    test_idx, val_idx = set(), set()
    assigned = set()

    for row_positions in adj.values():
        # Rows already held out through another author are not reused.
        free = [p for p in row_positions if p not in assigned]
        if len(free) < 2:
            continue

        if len(free) == 2:
            held_out = rng.choice(free)
            test_idx.add(held_out)
            assigned.add(held_out)
            continue

        rng.shuffle(free)
        test_pick, val_pick = free[0], free[1]
        test_idx.add(test_pick)
        val_idx.add(val_pick)
        assigned.update((test_pick, val_pick))

    held = list(test_idx | val_idx)
    train_idx = np.setdiff1d(np.arange(len(df)), held)

    return df.iloc[train_idx], df.iloc[list(val_idx)], df.iloc[list(test_idx)]

# ---------------------------------------------------------------
# 3. MÉTRICAS VECTORIZADAS (CON FIX OOM)
# ---------------------------------------------------------------
def calculate_metrics_batch(U, R_train, R_target, users_to_eval, item_popularity, total_authors, K=20, batch_size=5000):
    """
    Compute top-K ranking metrics for a set of users from the embedding matrix ``U``.

    For every evaluated user the score of each candidate is the dot product
    between the user's row of ``U`` and the candidate's row of ``U``. Items
    present in the user's ``R_train`` row, and the user itself, are excluded
    before taking the top ``K``.

    The score matrix is computed in batches with PyTorch, on the GPU when CUDA
    is available and on the CPU otherwise. Each batch materialises a
    ``(batch_size, U.shape[0])`` score matrix, so lower ``batch_size`` if an
    out-of-memory error occurs.

    Args:
        U: NumPy array of embeddings, one row per author.
        R_train: Sparse matrix whose row ``u`` lists the items to exclude for ``u``.
        R_target: Sparse matrix whose row ``u`` lists the held-out items of ``u``.
            Every evaluated user is expected to have at least one target.
        users_to_eval: Sequence of row indices to evaluate.
        item_popularity: Dict mapping item index to its interaction count;
            items missing from the dict are treated as having count 1.
        total_authors: Denominator used for the coverage metric.
        K: Number of recommendations per user; must not exceed ``U.shape[0]``.
        batch_size: Number of users scored per batch.

    Returns:
        Dict with keys ``"recall"``, ``"ndcg"``, ``"coverage"`` and
        ``"novelty"``. All four are ``0.0`` when ``users_to_eval`` is empty.
    """
    users = np.asarray(users_to_eval)
    n_eval = len(users)
    if n_eval == 0:
        # Nothing to evaluate: avoid the mean-of-empty / division-by-zero path.
        return {"recall": 0.0, "ndcg": 0.0, "coverage": 0.0, "novelty": 0.0}

    # Fall back to the CPU instead of crashing when no CUDA device is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    u_tensor = torch.from_numpy(U).to(device)

    recall_list, ndcg_list = [], []
    recommended_items_all = set()
    novelty_sum = 0.0

    total_train_interactions = sum(item_popularity.values())

    # Pre-compute the log2 discount of each rank position for NDCG.
    log2_ranks = np.log2(np.arange(K) + 2)

    for start in range(0, n_eval, batch_size):
        batch_idx = users[start:start + batch_size]
        batch_rows = torch.as_tensor(batch_idx, dtype=torch.long, device=device)

        with torch.inference_mode():
            # Similarity: (batch x factors) @ (factors x total)
            scores = torch.matmul(u_tensor[batch_rows], u_tensor.t())
            scores_cpu = scores.cpu().numpy()
            del scores  # release the device copy as soon as possible

        for row, u_idx in enumerate(batch_idx):
            u_scores = scores_cpu[row]

            # Exclude already-seen items and the user itself.
            seen = R_train.getrow(u_idx).indices
            u_scores[seen] = -1e10
            u_scores[u_idx] = -1e10

            # Top-K: partial selection first, then order only those K entries.
            top_unsorted = np.argpartition(-u_scores, K - 1)[:K]
            preds = top_unsorted[np.argsort(-u_scores[top_unsorted])]
            pred_items = preds.tolist()

            recommended_items_all.update(pred_items)

            # Novelty: mean self-information of the recommended items.
            u_nov = sum(
                -np.log2(item_popularity.get(p, 1) / total_train_interactions)
                for p in pred_items
            )
            novelty_sum += (u_nov / K)

            # Recall & NDCG
            target_indices = R_target.getrow(u_idx).indices
            hits_mask = np.isin(preds, target_indices)
            hits_count = np.sum(hits_mask)

            recall_list.append(hits_count / len(target_indices))

            if hits_count > 0:
                dcg = np.sum(1.0 / log2_ranks[hits_mask])
                idcg = np.sum(1.0 / log2_ranks[:min(len(target_indices), K)])
                ndcg_list.append(dcg / idcg)
            else:
                ndcg_list.append(0.0)

        if device.type == "cuda":
            torch.cuda.empty_cache()

    del u_tensor
    return {
        "recall": np.mean(recall_list),
        "ndcg": np.mean(ndcg_list),
        "coverage": len(recommended_items_all) / total_authors,
        "novelty": novelty_sum / n_eval
    }

# ---------------------------------------------------------------
# 4. ENTRENAMIENTO ALS
# ---------------------------------------------------------------
def train_mf_optimized(R_train, factors, reg, alpha, iterations=20):
    """
    Fit an implicit-feedback ALS model and return one unit-norm embedding per row.

    ``R_train`` is scaled by ``alpha`` and cast to float32 before being passed
    (transposed) to ``AlternatingLeastSquares.fit``. The model's user and item
    factors are averaged element-wise and each resulting row is L2-normalised.

    Args:
        R_train: Sparse interaction matrix.
        factors: Number of latent factors.
        reg: Regularization strength passed to the ALS model.
        alpha: Multiplier applied to ``R_train`` before fitting.
        iterations: Number of ALS iterations.

    Returns:
        float32 NumPy array with one L2-normalised row per factor row.
    """
    confidence = (R_train * alpha).astype(np.float32)

    als = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iterations,
        use_gpu=True,
        random_state=42,
    )
    als.fit(confidence.tocsr().T, show_progress=False)

    user_emb = als.user_factors
    item_emb = als.item_factors
    if not isinstance(user_emb, np.ndarray):
        # Factors that are not already NumPy arrays expose ``to_numpy()``.
        user_emb = user_emb.to_numpy()
        item_emb = item_emb.to_numpy()

    blended = (user_emb + item_emb) / 2.0

    row_norms = np.linalg.norm(blended, axis=1, keepdims=True)
    # Avoid dividing by zero for all-zero rows.
    row_norms[row_norms == 0] = 1e-10
    return (blended / row_norms).astype(np.float32)

# ---------------------------------------------------------------
# 5. PIPELINE PRINCIPAL
# ---------------------------------------------------------------
def run_experiment(df_raw, author_to_idx, seed=42):
    """
    Run the full pipeline: filter, split, tune ALS on validation, evaluate on test.

    Phase 1 trains one model per combination of factors / alpha /
    regularization on the training split and scores it on a sample of at most
    20,000 validation users; the combination with the highest NDCG wins.
    Phase 2 retrains with the winning parameters on train + validation and
    reports Recall, NDCG, coverage and novelty on the test users.

    Args:
        df_raw: DataFrame with ``pair_min`` and ``pair_max`` columns.
        author_to_idx: Mapping from author identifier to matrix index.
        seed: Seed used for the split and for the validation-user sample, so
            the whole tuning phase is reproducible.

    Returns:
        Dict with ``"best_params"`` (the selected hyper-parameters) and
        ``"metrics"`` (the test metrics dict).

    Raises:
        ValueError: If the data contains authors absent from ``author_to_idx``
            or if no user has validation interactions.
        RuntimeError: If no hyper-parameter combination yields a valid NDCG.
    """
    df_filtered = preprocess_data(df_raw)
    df_train, df_val, df_test = triple_loo_split(df_filtered, seed=seed)

    n_authors = len(author_to_idx)

    def map_endpoints(df):
        # Translate both endpoint columns to matrix indices, failing loudly on
        # unknown authors instead of letting NaN indices reach scipy.
        lo = df['pair_min'].map(author_to_idx)
        hi = df['pair_max'].map(author_to_idx)
        if lo.isna().any() or hi.isna().any():
            raise ValueError("author_to_idx is missing authors present in the pair data")
        return lo.to_numpy().astype(np.int64), hi.to_numpy().astype(np.int64)

    def to_csr(df):
        # Symmetric author-by-author matrix: each pair is stored in both directions.
        r, c = map_endpoints(df)
        data = np.ones(len(r))
        rows = np.concatenate([r, c])
        cols = np.concatenate([c, r])
        d = np.concatenate([data, data])
        return csr_matrix((d, (rows, cols)), shape=(n_authors, n_authors))

    def endpoint_counts(df):
        # Number of rows each author index takes part in (used as popularity).
        return pd.concat([df['pair_min'], df['pair_max']]).map(author_to_idx).value_counts().to_dict()

    R_train = to_csr(df_train)
    R_val = to_csr(df_val)
    R_test = to_csr(df_test)

    train_counts = endpoint_counts(df_train)

    # --- PHASE 1: GRID SEARCH (sample of up to 20,000 authors) ---
    print("\nüîç FASE 1: Tuning con muestra de 20k autores...")
    val_users = np.where(R_val.getnnz(axis=1) > 0)[0]
    if len(val_users) == 0:
        raise ValueError("no users with validation interactions; cannot tune hyper-parameters")

    # Seeded sampling keeps the tuning phase reproducible, like the split and ALS.
    rng = np.random.default_rng(seed)
    sample_val = rng.choice(val_users, size=min(20000, len(val_users)), replace=False)

    best_ndcg = -1
    best_params = {}

    for f in [128, 256]:
        for a in [10, 40]:
            for r in [0.01, 0.1, 1.0]:
                t0 = time.time()
                U = train_mf_optimized(R_train, factors=f, reg=r, alpha=a, iterations=15)
                m = calculate_metrics_batch(U, R_train, R_val, sample_val, train_counts, n_authors)

                print(f"Factors: {f} | Alpha: {a} | Reg: {r} -> NDCG: {m['ndcg']:.4f} | Recall: {m['recall']:.4f} ({time.time()-t0:.1f}s)")

                if m['ndcg'] > best_ndcg:
                    best_ndcg = m['ndcg']
                    best_params = {'factors': f, 'alpha': a, 'reg': r}

                # Strict memory clean-up after every grid point.
                del U
                gc.collect()
                torch.cuda.empty_cache()

    if not best_params:
        # Happens when every NDCG is NaN; the comparison above never succeeds.
        raise RuntimeError("grid search produced no valid NDCG; cannot select hyper-parameters")

    # --- PHASE 2: FINAL EVALUATION ON TEST ---
    print(f"\nüèÜ FASE 2: Evaluaci√≥n Final con par√°metros {best_params}...")
    R_train_full = R_train + R_val
    full_counts = endpoint_counts(pd.concat([df_train, df_val]))

    U_final = train_mf_optimized(R_train_full, **best_params, iterations=30)

    test_users = np.where(R_test.getnnz(axis=1) > 0)[0]
    print(f"üìä Evaluando {len(test_users):,} autores en Test...")

    final_m = calculate_metrics_batch(U_final, R_train_full, R_test, test_users, full_counts, n_authors)

    print("\n" + "="*50)
    print("üéØ RESULTADOS FINALES EN TEST (LOO)")
    print(f"Recall@20:   {final_m['recall']:.6f}")
    print(f"NDCG@20:     {final_m['ndcg']:.6f}")
    print(f"Coverage:    {final_m['coverage']:.6f}")
    print(f"Novelty:     {final_m['novelty']:.6f}")
    print("="*50)

    return {"best_params": best_params, "metrics": final_m}

if __name__ == "__main__":
    # Locations of the serialized pair table and the author -> index mapping.
    df_path = "data/df_pairs_unique.pkl"
    a2idx_path = "data/author_to_idx.npy"

    inputs_present = os.path.exists(df_path) and os.path.exists(a2idx_path)

    if not inputs_present:
        print("‚ùå Error: No se encontraron los archivos de datos.")
    else:
        # NOTE: both loaders unpickle their input; only use trusted files.
        df_pairs_unique = pd.read_pickle(df_path)
        # The .npy file wraps a pickled object; .item() extracts it from the
        # 0-d array returned by np.load.
        author2idx = np.load(a2idx_path, allow_pickle=True).item()
        run_experiment(df_pairs_unique, author2idx)