In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF, TruncatedSVD
from implicit.als import AlternatingLeastSquares
from sklearn.metrics import mean_squared_error, ndcg_score
from scipy.stats import spearmanr
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Alternative input kept for reference (currently disabled):
#df = pd.read_json('reviewer_topic_probs.json', lines=True)
# Read the embeddings file as JSON Lines (one JSON record per line).
df = pd.read_json('reviewer_lr_embeddings.json', lines=True)
# Bare last expression so the notebook renders the frame.
df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,ReviewerID,embedding
0,AZB4CQ9JZSUQB,"[-0.1002893806, 0.1910113383, -0.0663955192, 0..."
1,A101IGU6UDKW3X,"[-0.0683784493, 0.0418556511, -0.1108795497000..."
2,A1L2DLWYRNHKDR,"[-0.1914820764, -0.0327386724, 0.3748987079, 0..."
3,A3I4C4LS3ID7Y4,"[-0.0352246741, -0.0711002168, 0.1764542804, 0..."
4,A3ISFNQ9FJSPFP,"[-0.1741380509, 0.0417162208, 0.2620199448, -0..."
...,...,...
2273,A1VF5LN6SHFVFJ,"[-0.0930313624, 0.2948077503, 0.19831378330000..."
2274,A2PVW85TT5U7MR,"[-0.0744494782, 0.0978276403, -0.0207729830000..."
2275,A2NOLI10R74GTE,"[-0.0330551456, -0.130288697, 0.2677684896, 0...."
2276,A7QZOJZX85TIR,"[-0.0777991547, -0.1210835935, -0.184228633200..."


In [8]:
# Global extremes: take the min/max inside each embedding vector first,
# then the min/max of those per-row values.
min_val = df['embedding'].apply(np.min).min()
max_val = df['embedding'].apply(np.max).max()

# Width of the observed value range.
amplitude = max_val - min_val

print("Menor valor:", min_val)
print("Maior valor:", max_val)

Menor valor: -0.4195983579
Maior valor: 0.9656952523000001


In [5]:
def make_mask_random(M, test_frac=0.2, random_state=42):
    """Build a boolean train mask that holds out random columns in every row.

    Parameters
    ----------
    M : numpy.ndarray, 2-D
        Only the shape is used; the values are never read.
    test_frac : float, default 0.2
        Fraction of the columns held out per row. The number of held-out
        columns is ``int(round(test_frac * n_items))`` and is identical for
        every row.
    random_state : int, numpy.random.Generator or None, default 42
        Forwarded to ``np.random.default_rng``. The fixed default makes the
        split (and every metric computed from it) reproducible across runs;
        pass ``None`` to draw a fresh, non-reproducible split.

    Returns
    -------
    numpy.ndarray of bool, same shape as ``M``
        ``True`` marks a training entry, ``False`` a held-out (test) entry.
    """
    rng = np.random.default_rng(random_state)
    n_users, n_items = M.shape
    mask = np.ones_like(M, dtype=bool)

    # The hold-out size depends only on the column count, so compute it once
    # instead of once per row.
    n_test = int(round(test_frac * n_items))
    if n_test <= 0:
        return mask

    for u in range(n_users):
        # Independent draw per row, without replacement, so each row loses
        # exactly n_test distinct columns.
        test_cols = rng.choice(n_items, size=n_test, replace=False)
        mask[u, test_cols] = False
    return mask

    
def nmf_reconstruct(M, n_components=16, max_iter=500, random_state=42):
    """Return the rank-``n_components`` NMF reconstruction of ``M``.

    Fits ``NMF`` on ``M`` and multiplies the fitted row factors by the learned
    components, giving a matrix with the same shape as ``M``.

    NOTE(review): scikit-learn's NMF is documented to require non-negative
    input, while the embeddings printed in this notebook contain negative
    values -- confirm before calling this on them.
    """
    model = NMF(n_components=n_components, max_iter=max_iter, random_state=random_state)
    row_factors = model.fit_transform(M)
    # components_ already has shape (n_components, n_features), so no transpose is needed.
    return row_factors @ model.components_


def svd_reconstruct(M, n_components=16, random_state=42):
    """Return the rank-``n_components`` truncated-SVD reconstruction of ``M``.

    ``M`` is converted to a CSR sparse matrix before fitting; the projected
    rows are then mapped back through the fitted components, giving a matrix
    with the same shape as ``M``.
    """
    model = TruncatedSVD(n_components=n_components, random_state=random_state)
    projected = model.fit_transform(csr_matrix(M))
    # components_ already has shape (n_components, n_features), so no transpose is needed.
    return projected @ model.components_

def als_reconstruct(M, factors=64, regularization=0.05, alpha=2.0, apply_wight=False, random_state=42):
    """Return the ALS low-rank reconstruction of ``M``.

    Parameters
    ----------
    M : array-like, 2-D
        Matrix to factorise; converted to a float32 CSR matrix before fitting.
    factors, regularization, alpha
        Forwarded unchanged to ``AlternatingLeastSquares``.
    apply_wight : bool, default False
        When true, the matrix is re-weighted with ``bm25_weight`` before
        fitting. (The spelling is kept because it is part of the existing
        keyword interface.)
    random_state : int or None, default 42
        Forwarded to ``AlternatingLeastSquares``. Exposed so the seed can be
        controlled the same way as in ``nmf_reconstruct`` and
        ``svd_reconstruct``; the default preserves the previously hard-coded
        value.

    Returns
    -------
    The product ``user_factors @ item_factors.T`` of the fitted model.
    """
    als = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        alpha=alpha,
        random_state=random_state,
    )
    X = csr_matrix(M, dtype="float32")

    if apply_wight:
        # Convert back to CSR after weighting so fit() receives the same
        # sparse format on both code paths.
        X = bm25_weight(X).tocsr()

    als.fit(X)

    # NOTE(review): the matrix product below assumes the fitted factors are
    # array-like objects supporting `@` and `.T` -- confirm for the backend
    # in use.
    U = als.user_factors
    V = als.item_factors
    return U @ V.T

def eval_on_mask(M_true, M_pred, train_mask, K=10):
    """Compute RMSE and MAE on the held-out (non-training) entries only.

    Parameters
    ----------
    M_true, M_pred : numpy.ndarray
        Ground-truth and reconstructed matrices, indexed with the same mask.
    train_mask : array-like
        Truthy where an entry belongs to the training split. It is coerced to
        ``bool`` before being inverted: applying ``~`` to a 0/1 integer mask
        would yield -1/-2 and silently turn the selection into integer
        indexing.
    K : int, default 10
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    dict
        ``{"RMSE_test": float, "MAE_test": float}``. Both values are ``0.0``
        when the test split is empty.
    """
    test_mask = ~np.asarray(train_mask, dtype=bool)

    y_true = M_true[test_mask]
    y_pred = M_pred[test_mask]

    # Nothing held out: report zeros rather than averaging over an empty array.
    if y_true.size == 0:
        return {
            "RMSE_test": 0.0,
            "MAE_test": 0.0,
        }

    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(np.mean(np.abs(y_true - y_pred)))

    return {
        "RMSE_test": rmse,
        "MAE_test": mae,
    }
  
# Stack the per-row embedding vectors into a single 2-D array.
M = np.vstack(df['embedding'].values)

# Boolean mask: True = training entry, False = held-out entry.
train_mask = make_mask_random(M, test_frac=0.2)

# Zero out the entries that belong to the other split.
M_train = M * train_mask
M_test = M * ~train_mask

# Fit each model on the masked training matrix, then score it against the
# full matrix on the held-out entries only.
R_svd = svd_reconstruct(M_train, n_components=15, random_state=42)
svd_metrics = eval_on_mask(M, R_svd, train_mask, K=10)

R_als = als_reconstruct(M_train)
als_metrics = eval_on_mask(M, R_als, train_mask, K=10)

print("SVD:", svd_metrics)
print("ALS:", als_metrics)

  check_blas_config()
100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 57.87it/s]

SVD: {'RMSE_test': 0.13130484416945953, 'MAE_test': 0.0825799950504459}
ALS: {'RMSE_test': 0.13507967405355303, 'MAE_test': 0.08457607949452067}





In [13]:
# Express each test error as a fraction of `amplitude` (computed in an
# earlier cell) so the numbers are scale-free.

# SVD
svd_rmse_relativo = svd_metrics['RMSE_test'] / amplitude
svd_mae_relativo = svd_metrics['MAE_test'] / amplitude
print(f"(SVD) RMSE RELATIVO:{svd_rmse_relativo}, MAE RELATIVO:{svd_mae_relativo}")

# ALS
als_rmse_relativo = als_metrics['RMSE_test'] / amplitude
als_mae_relativo = als_metrics['MAE_test'] / amplitude
print(f"(ALS) RMSE RELATIVO:{als_rmse_relativo}, MAE RELATIVO:{als_mae_relativo}")

(SVD) RMSE RELATIVO:0.09478484792151935, MAE RELATIVO:0.05961190786011314
(ALS) RMSE RELATIVO:0.09750977919695382, MAE RELATIVO:0.06105281860233954
