In [24]:
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF, TruncatedSVD
from implicit.als import AlternatingLeastSquares
from sklearn.metrics import mean_squared_error, ndcg_score
from scipy.stats import spearmanr
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Load the per-reviewer topic-probability vectors; `lines=True` means the file
# is read as JSON Lines (one JSON record per line).
df = pd.read_json('reviewer_topic_probs.json', lines=True)
# Bare last expression so the notebook renders the frame below the cell.
df

Unnamed: 0,reviewerID,topic_probs_vec
0,A10175AMUHOQC4,"[0.0341578359, 0.0233554812, 0.035577717200000..."
1,A101IGU6UDKW3X,"[0.0150641061, 0.0196110288, 0.0112277638, 0.0..."
2,A103KNDW8GN92L,"[0.027369901300000003, 0.0074546828, 0.0104835..."
3,A106016KSI0YQ,"[0.050918084200000005, 0.020803142400000002, 0..."
4,A106YXO3EHVD3J,"[0.0453389835, 0.0208799938, 0.0079441582, 0.0..."
...,...,...
2123,AZC562U18BK2S,"[0.0190736573, 0.0657807276, 0.0201534128, 0.0..."
2124,AZDH08P9ZMWKJ,"[0.0111420923, 0.008560919200000001, 0.0126432..."
2125,AZDVOFC2MTIM5,"[0.0079680183, 0.006413963700000001, 0.0031580..."
2126,AZQ8EITRKV9GS,"[0.1075458745, 0.0194778105, 0.0135147544, 0.1..."


In [25]:
# Global extremes over every reviewer's topic vector: take the per-row
# min/max first, then reduce across rows.
min_val = df['topic_probs_vec'].apply(np.min).min()
max_val = df['topic_probs_vec'].apply(np.max).max()

# Spread of the observed values; later cells divide the error metrics by it.
amplitude = max_val - min_val

print("Menor valor:", min_val)
print("Maior valor:", max_val)

Menor valor: 0.00040061830000000004
Maior valor: 0.7358338819


In [26]:
def make_mask_random(M, test_frac=0.2, seed=None):
    """Build a boolean train mask that holds out random columns in every row.

    For each row of ``M``, ``round(test_frac * n_items)`` distinct column
    indices are drawn uniformly at random and flagged as held out.

    Parameters
    ----------
    M : 2-D numpy array
        Only its shape is used; the values are never read.
    test_frac : float, default 0.2
        Fraction of the columns to hold out in each row.
    seed : int, numpy.random.Generator or None, default None
        Forwarded to ``numpy.random.default_rng``. ``None`` keeps the
        historical behaviour (fresh entropy, a different split on every
        call); pass an integer to make the split reproducible.

    Returns
    -------
    numpy.ndarray of bool, same shape as ``M``
        ``True`` marks a training entry, ``False`` a held-out (test) entry.
    """
    rng = np.random.default_rng(seed)
    n_users, n_items = M.shape
    mask = np.ones_like(M, dtype=bool)

    # The hold-out size depends only on the column count, so compute it once
    # instead of once per row.
    n_test = int(round(test_frac * n_items))
    if n_test > 0:
        for u in range(n_users):
            # Sampling without replacement gives exactly n_test distinct columns.
            test_cols = rng.choice(n_items, size=n_test, replace=False)
            mask[u, test_cols] = False
    return mask

    
def nmf_reconstruct(M, n_components=16, max_iter=500, random_state=42):
    """Return the low-rank NMF reconstruction of ``M``.

    Fits scikit-learn's ``NMF`` on ``M`` and multiplies the transformed rows
    by the fitted component matrix.

    Parameters
    ----------
    M : matrix passed unchanged to ``NMF.fit_transform``.
    n_components, max_iter, random_state :
        Forwarded to the ``NMF`` constructor.

    Returns
    -------
    The product of the fitted row factors and ``components_``.
    """
    model = NMF(
        n_components=n_components,
        max_iter=max_iter,
        random_state=random_state,
    )
    row_factors = model.fit_transform(M)
    # Multiply by components_ directly rather than transposing it twice.
    return row_factors @ model.components_


def svd_reconstruct(M, n_components=16, random_state=42):
    """Return the truncated-SVD reconstruction of ``M``.

    ``M`` is converted to a SciPy CSR matrix, decomposed with scikit-learn's
    ``TruncatedSVD``, and rebuilt as the product of the transformed rows and
    the fitted component matrix.

    Parameters
    ----------
    M : matrix accepted by ``scipy.sparse.csr_matrix``.
    n_components, random_state :
        Forwarded to the ``TruncatedSVD`` constructor.

    Returns
    -------
    The product of the transformed rows and ``components_``.
    """
    model = TruncatedSVD(n_components=n_components, random_state=random_state)
    projected = model.fit_transform(csr_matrix(M))
    # Multiply by components_ directly rather than transposing it twice.
    return projected @ model.components_

def als_reconstruct(M, factors=64, regularization=0.05, alpha=2.0, apply_wight=False,
                    random_state=42):
    """Fit implicit's ALS model on ``M`` and return the factor product.

    Parameters
    ----------
    M : matrix accepted by ``scipy.sparse.csr_matrix``; it is cast to float32.
    factors, regularization, alpha :
        Forwarded to ``AlternatingLeastSquares``.
    apply_wight : bool, default False
        When true, the matrix is re-weighted with ``bm25_weight`` before
        fitting. The misspelled name is kept so existing keyword callers
        keep working.
    random_state : int, default 42
        Seed forwarded to ``AlternatingLeastSquares``. It used to be
        hard-coded; it is now exposed like in the NMF/SVD helpers, with the
        same default.

    Returns
    -------
    ``user_factors @ item_factors.T`` of the fitted model.
    """
    als = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        alpha=alpha,
        random_state=random_state,
    )
    X = csr_matrix(M, dtype="float32")

    if apply_wight:
        # bm25_weight's result is converted back to CSR before fitting.
        X = bm25_weight(X).tocsr()

    # NOTE(review): assumes fit() treats the rows of X as users, so the
    # product below has the same orientation as M -- confirm against the
    # installed implicit version.
    als.fit(X)

    return als.user_factors @ als.item_factors.T

def eval_on_mask(M_true, M_pred, train_mask, K=10):
    """Score a reconstruction on the held-out entries only.

    Parameters
    ----------
    M_true : array of ground-truth values.
    M_pred : array of predicted values, indexed with the same mask.
    train_mask : boolean array; entries that are ``False`` form the test set.
    K : accepted for call-site compatibility; not used by this function.

    Returns
    -------
    dict
        ``{"RMSE_test": float, "MAE_test": float}``; both are ``0.0`` when
        the mask leaves no held-out entries.
    """
    held_out = ~train_mask
    actual = M_true[held_out]
    predicted = M_pred[held_out]

    # Nothing held out: report zeros instead of averaging an empty array.
    if not actual.size:
        return {"RMSE_test": 0.0, "MAE_test": 0.0}

    rmse = float(np.sqrt(mean_squared_error(actual, predicted)))
    mae = float(np.mean(np.abs(actual - predicted)))
    return {"RMSE_test": rmse, "MAE_test": mae}
  
# Stack the per-reviewer vectors into one dense (reviewers x topics) matrix.
M = np.vstack(df['topic_probs_vec'].values)

# Random per-row hold-out: True = train entry, False = test entry.
train_mask = make_mask_random(M, test_frac=0.2)

# Multiplying by the boolean mask replaces the excluded entries with 0.
M_train = M * train_mask
M_test = M * (~train_mask)

# Fit each model on the training matrix and score it on the held-out entries.
R_nmf = nmf_reconstruct(M_train, n_components=15, max_iter=600, random_state=42)
nmf_metrics = eval_on_mask(M, R_nmf, train_mask, K=10)

R_svd = svd_reconstruct(M_train, n_components=15, random_state=42)
svd_metrics = eval_on_mask(M, R_svd, train_mask, K=10)

R_als = als_reconstruct(M_train)
als_metrics = eval_on_mask(M, R_als, train_mask, K=10)

print("NMF:", nmf_metrics)
print("SVD:", svd_metrics)
print("ALS:", als_metrics)

100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 58.07it/s]

NMF: {'RMSE_test': 0.025829350205826874, 'MAE_test': 0.011210732101160116}
SVD: {'RMSE_test': 0.02595217282536815, 'MAE_test': 0.011526467786804738}
ALS: {'RMSE_test': 0.02531683348292738, 'MAE_test': 0.011339201469000894}





In [27]:
# Express each model's test errors as a fraction of the data range
# (`amplitude`), grouped per model.
nmf_rmse_relativo = nmf_metrics['RMSE_test'] / amplitude
nmf_mae_relativo = nmf_metrics['MAE_test'] / amplitude

svd_rmse_relativo = svd_metrics['RMSE_test'] / amplitude
svd_mae_relativo = svd_metrics['MAE_test'] / amplitude

als_rmse_relativo = als_metrics['RMSE_test'] / amplitude
als_mae_relativo = als_metrics['MAE_test'] / amplitude

print(f"(NMF) RMSE RELATIVO:{nmf_rmse_relativo}, MAE RELATIVO:{nmf_mae_relativo}")
print(f"(SVD) RMSE RELATIVO:{svd_rmse_relativo}, MAE RELATIVO:{svd_mae_relativo}")
print(f"(ALS) RMSE RELATIVO:{als_rmse_relativo}, MAE RELATIVO:{als_mae_relativo}")

(NMF) RMSE RELATIVO:0.03512126998361524, MAE RELATIVO:0.015243710960642108
(SVD) RMSE RELATIVO:0.035288277142007905, MAE RELATIVO:0.01567303024938229
(ALS) RMSE RELATIVO:0.03442437911905102, MAE RELATIVO:0.015418396243725322


In [28]:
# Second dataset: per-reviewer topic-consumption vectors (JSON Lines file).
# The bare `df` expression that used to follow this line was removed: a
# mid-cell expression is never rendered, so it had no effect.
df = pd.read_json('reviewer_topic_consume.json', lines=True)

# Global extremes over every reviewer's vector and their spread, which the
# next cell uses to normalise the error metrics.
min_val = df['topic_consume'].apply(np.min).min()
max_val = df['topic_consume'].apply(np.max).max()
amplitude = max_val - min_val

print("Menor valor:", min_val)
print("Maior valor:", max_val)

# Stack the per-reviewer vectors into one dense matrix.
M = np.vstack(df['topic_consume'].values)

# Random per-row hold-out: True = train entry, False = test entry.
train_mask = make_mask_random(M, test_frac=0.2)

# Multiplying by the boolean mask replaces the excluded entries with 0.
M_train = M * train_mask
M_test = M * (~train_mask)

# Fit each model on the training matrix and score it on the held-out entries.
R_nmf = nmf_reconstruct(M_train, n_components=15, max_iter=600, random_state=42)
nmf_metrics = eval_on_mask(M, R_nmf, train_mask, K=10)

R_svd = svd_reconstruct(M_train, n_components=15, random_state=42)
svd_metrics = eval_on_mask(M, R_svd, train_mask, K=10)

R_als = als_reconstruct(M_train)
als_metrics = eval_on_mask(M, R_als, train_mask, K=10)

print("NMF:", nmf_metrics)
print("SVD:", svd_metrics)
print("ALS:", als_metrics)

Menor valor: 0
Maior valor: 100


100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 57.03it/s]

NMF: {'RMSE_test': 2.502423136150787, 'MAE_test': 0.8550231937175433}
SVD: {'RMSE_test': 2.5085077117385692, 'MAE_test': 0.8752901896956471}
ALS: {'RMSE_test': 2.5975123315395443, 'MAE_test': 0.8168584585023981}





In [29]:
# Express each model's test errors as a fraction of the data range
# (`amplitude`), grouped per model.
nmf_rmse_relativo = nmf_metrics['RMSE_test'] / amplitude
nmf_mae_relativo = nmf_metrics['MAE_test'] / amplitude

svd_rmse_relativo = svd_metrics['RMSE_test'] / amplitude
svd_mae_relativo = svd_metrics['MAE_test'] / amplitude

als_rmse_relativo = als_metrics['RMSE_test'] / amplitude
als_mae_relativo = als_metrics['MAE_test'] / amplitude

print(f"(NMF) RMSE RELATIVO:{nmf_rmse_relativo}, NMAE:{nmf_mae_relativo}")
print(f"(SVD) RMSE RELATIVO:{svd_rmse_relativo}, NMAE:{svd_mae_relativo}")
print(f"(ALS) RMSE RELATIVO:{als_rmse_relativo}, NMAE:{als_mae_relativo}")

(NMF) RMSE RELATIVO:0.02502423136150787, NMAE:0.008550231937175433
(SVD) RMSE RELATIVO:0.025085077117385692, NMAE:0.008752901896956471
(ALS) RMSE RELATIVO:0.02597512331539544, NMAE:0.008168584585023982
