In [2]:
import os

import pandas as pd
import numpy as np

import time

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as sf
import pyspark.sql.types as st
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, FMClassifier
from pyspark.ml import PipelineModel
from sim4rec.modules import Simulator

from replay.metrics import NDCG, Precision, RocAuc, Metric
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score
from IPython.display import clear_output

from sim4rec.utils import VectorElementExtractor
from sim4rec.modules import RealDataGenerator, SDVDataGenerator
from sim4rec.modules import EvaluateMetrics
from sim4rec.response import ParametricResponseFunction, BernoulliResponse

from replay.models import UCB, ThompsonSampling
from replay.models import RandomRec
from replay.splitters import RandomSplitter

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

spark = SparkSession.builder\
    .appName('simulator_movielens')\
    .master('local[*]')\
    .config('spark.sql.shuffle.partitions', '192')\
    .config('spark.default.parallelism', '192')\
    .config('spark.driver.extraJavaOptions', '-XX:+UseG1GC')\
    .config('spark.executor.extraJavaOptions', '-XX:+UseG1GC')\
    .config('spark.sql.autoBroadcastJoinThreshold', '-1')\
    .config('spark.driver.memory', '256g')\
    .config('spark.driver.maxResultSize', '256g')\
    .getOrCreate()
spark.sparkContext.setLogLevel('ERROR')

def calc_metric(response_df):
    return response_df.groupBy("user_idx").agg(sf.sum("response").alias("num_positive")).select(sf.mean("num_positive")).collect()[0][0]

24/05/26 14:42:56 WARN Utils: Your hostname, ecs-syudosaev-big resolves to a loopback address: 127.0.1.1; using 10.11.12.124 instead (on interface eth0)
24/05/26 14:42:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/05/26 14:42:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/26 14:42:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
users_df = spark.read.parquet('preprocessed/users.parquet')
items_df = spark.read.parquet('preprocessed/items.parquet')
log_df   = spark.read.parquet('preprocessed/rating.parquet')

items_df = items_df.dropDuplicates(subset=['item_idx'])
users_df = users_df.dropDuplicates(subset=['user_idx'])
log_df = log_df.dropDuplicates(subset=['user_idx', 'item_idx'])

items_df = items_df.withColumn('item_idx', sf.col('item_idx').cast('int'))
users_df = users_df.withColumn('user_idx', sf.col('user_idx').cast('int'))
log_df = log_df.withColumn('item_idx', sf.col('item_idx').cast('int'))
log_df = log_df.withColumn('user_idx', sf.col('user_idx').cast('int'))

log_df = log_df.join(users_df, log_df['user_idx'] == users_df['user_idx'], 'leftsemi')
log_df = log_df.join(items_df, log_df['item_idx'] == items_df['item_idx'], 'leftsemi')

for c in users_df.columns[1:]:
    users_df = users_df.withColumnRenamed(c, 'user_' + c)

for c in items_df.columns[1:]:
    items_df = items_df.withColumnRenamed(c, 'item_' + c)

users_df = users_df.cache()
items_df = items_df.cache()
log_df = log_df.cache()

In [3]:
full_dataframe = log_df.join(items_df, on='item_idx', how='inner').join(users_df, on='user_idx', how='inner').drop('timestamp')
avg_item_ratings = full_dataframe.select('item_idx', 'relevance').groupBy('item_idx').agg(sf.mean("relevance").alias('item_rating_avg'))
full_dataframe = full_dataframe.join(avg_item_ratings, on='item_idx', how='inner')
train_df, test_df = RandomSplitter(test_size=0.2, seed=9, drop_cold_items=True, drop_cold_users=True).split(full_dataframe)

train_df = train_df.withColumn('relevance', sf.when(sf.col('relevance') >= 1, 1).otherwise(0))
test_df = test_df.withColumn('relevance', sf.when(sf.col('relevance') >= 1, 1).otherwise(0))

                                                                                

In [4]:
# item_svd = spark.read.csv('item_svd.csv', header=True, inferSchema=True)
# user_svd = spark.read.csv('user_svd.csv', header=True, inferSchema=True)

# train_df = train_df.join(item_svd, on='item_idx', how='inner')
# train_df = train_df.join(user_svd, on='user_idx', how='inner')

# test_df = test_df.join(item_svd, on='item_idx', how='inner')
# test_df = test_df.join(user_svd, on='user_idx', how='inner')

In [7]:
print('train statistics')
print(train_df.count())
print(train_df.select('user_idx').distinct().count())
print(train_df.select('item_idx').distinct().count())
print(train_df.count() / (train_df.select('user_idx').distinct().count() * train_df.select('item_idx').distinct().count()))
print()

print('test statistics')
print(test_df.count())
print(test_df.select('user_idx').distinct().count())
print(test_df.select('item_idx').distinct().count())
print(test_df.count() / (test_df.select('user_idx').distinct().count() * test_df.select('item_idx').distinct().count()))

train statistics
639
128


                                                                                

85
0.05873161764705882

test statistics
116
71
64
0.025528169014084508


In [4]:
va = VectorAssembler(
    inputCols=items_df.columns[1:] + users_df.columns[1:],
    outputCol='features'
)

lr = LogisticRegression(
    featuresCol='features',
    labelCol='relevance',
    probabilityCol='proba'
)
fm = FMClassifier(
    featuresCol='features',
    labelCol='relevance',
    probabilityCol='proba'
)
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='relevance',
    probabilityCol='proba'
)
lr_model = lr.fit(va.transform(train_df))
rf_model = rf.fit(va.transform(train_df))
fm_model = fm.fit(va.transform(train_df))

vee = VectorElementExtractor(inputCol='proba', outputCol='scores', index=1)
mc = ParametricResponseFunction(inputCols=['scores'], outputCol='__pr', weights=[0.25])
br = BernoulliResponse(inputCol='__pr', outputCol='response', seed=1234)
pipeline_lr = PipelineModel(stages=[va, lr_model, vee, mc, br])
pipeline_rf = PipelineModel(stages=[va, rf_model, vee, mc, br])
pipeline_fm = PipelineModel(stages=[va, fm_model, vee, mc, br])

                                                                                

In [5]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score

def get_baseline(pred_df):
    pred_df = pred_df.select('item_rating_avg', 'relevance').toPandas()
    pred_df['baseline'] = (pred_df['item_rating_avg']-pred_df['item_rating_avg'].min())/(pred_df['item_rating_avg'].max()-pred_df['item_rating_avg'].min())
    pred_df['baseline_bin'] = np.where(pred_df['baseline'] > 0.5, 1, 0) 
    print(f"ROC AUC (baseline): {roc_auc_score(pred_df.relevance, pred_df.baseline)}")
    print(f"Precision (baseline): {precision_score(pred_df.relevance, pred_df.baseline_bin)}")
    print(f"Recall (baseline): {recall_score(pred_df.relevance, pred_df.baseline_bin)}")
    print(f"Accuracy (baseline): {accuracy_score(pred_df.relevance, pred_df.baseline_bin)}")
    print()

def assess_models(model, test_df):
    pred_df = model.transform(test_df).select("relevance", "scores").toPandas()
    pred_df['response_bin'] = np.where(pred_df['scores'] > 0.5, 1, 0)
    print(f"ROC AUC (classificator): {roc_auc_score(pred_df.relevance, pred_df.scores)}")
    print(f"Precision (classificator): {precision_score(pred_df.relevance, pred_df.response_bin)}")
    print(f"Recall (classificator): {recall_score(pred_df.relevance, pred_df.response_bin)}")
    print(f"Accuracy (classificator): {accuracy_score(pred_df.relevance, pred_df.response_bin)}")
    print()

get_baseline(test_df)
assess_models(pipeline_lr, test_df)
assess_models(pipeline_rf, test_df)
assess_models(pipeline_fm, test_df)

# 0.7355

ROC AUC (baseline): 0.7902494331065759
Precision (baseline): 0.9433962264150944
Recall (baseline): 0.5102040816326531
Accuracy (baseline): 0.5603448275862069



                                                                                

ROC AUC (classificator): 0.8106575963718821
Precision (classificator): 0.9354838709677419
Recall (classificator): 0.8877551020408163
Accuracy (classificator): 0.853448275862069



                                                                                

ROC AUC (classificator): 0.778344671201814
Precision (classificator): 0.8448275862068966
Recall (classificator): 1.0
Accuracy (classificator): 0.8448275862068966





ROC AUC (classificator): 0.6043083900226757
Precision (classificator): 0.8777777777777778
Recall (classificator): 0.8061224489795918
Accuracy (classificator): 0.7413793103448276



                                                                                

In [6]:
ts_lr = ThompsonSampling(sample = True)
ts_lr.fit(log=train_df.drop('response').limit(1))

ts_rf = ThompsonSampling(sample = True)
ts_rf.fit(log=train_df.drop('response').limit(1))

ts_fm = ThompsonSampling(sample = True)
ts_fm.fit(log=train_df.drop('response').limit(1))

random_uni = RandomRec(distribution="uniform")
random_uni.fit(log=train_df.drop('response').limit(1))

In [10]:
evaluator = EvaluateMetrics(
    userKeyCol='user_idx',
    itemKeyCol='item_idx',
    predictionCol='relevance',
    labelCol='response',
    replay_label_filter=1.0,
    replay_metrics={NDCG() : 1, Precision() : 1, RocAuc(): 1}
)

users_generator = SDVDataGenerator(
    label='synth',
    id_column_name='user_id',
    model_name='gaussiancopula',
    parallelization_level=4,
    device_name='cpu',
    seed=1234
)

items_generator = RealDataGenerator(label='items_real', seed=1234)
users_generator.fit(users_df)
items_generator.fit(items_df)
real_users = users_df.sample(1.0)
syn_users = users_generator.generate(10000)

                                                                                

In [14]:
from sim4rec.modules.evaluation import evaluate_synthetic

gen_score = evaluate_synthetic(
    syn_users.drop('user_idx').drop('user_id'),
    real_users.drop('user_idx')
)
gen_score

In [16]:
def do_a_cycle(simul, model, pipeline, iteration, metrics):
    users = simul.sample_users(1.0).cache()
    log = simul.get_log(user_df=users)
    log = train_df.drop('response').limit(1) if log is None else log

    item_ids = items_df.select("item_idx").sample(0.2).cache()
    recs = model.predict(
        log,
        k=1,
        users=users.select("user_idx"),
        items=item_ids,
        filter_seen_items = False
    )
    resp = simul.sample_responses(
        recs_df=recs, 
        user_features=users,
        item_features=items_df,
        action_models=pipeline,
    ).select('user_idx', 'item_idx', 'relevance', 'response').cache()
    simul.update_log(resp, iteration=iteration)
    met = calc_metric(resp)
    ev = evaluator(resp)
    ev['CR'] = met
    metrics.append(ev)

    model._clear_cache()
    train_log = simul.log.cache()
    model.fit(train_log.select('user_idx', 'item_idx', 'response').withColumnRenamed('response', 'relevance'))

    log.unpersist()
    users.unpersist()
    recs.unpersist()
    resp.unpersist()
    train_log.unpersist()

sim_lr = Simulator(users_generator, items_generator, f'checkpoints/lr', None, 'user_idx', 'item_idx', spark)
sim_rf = Simulator(users_generator, items_generator, f'checkpoints/rf', None, 'user_idx', 'item_idx', spark)
sim_fm = Simulator(users_generator, items_generator, f'checkpoints/fm', None, 'user_idx', 'item_idx', spark)
sim_rand = Simulator(users_generator, items_generator, f'checkpoints/rand', None, 'user_idx', 'item_idx', spark)

lr_metrics = []
rf_metrics = []
fm_metrics = []
rnd_metrics = []

for i in range(50):
    print(f'------------------------Stage {i}------------------------')
    start_iter = time.time()

    do_a_cycle(sim_lr, ts_lr, pipeline_lr, i, lr_metrics)
    do_a_cycle(sim_rf, ts_rf, pipeline_rf, i, rf_metrics)
    do_a_cycle(sim_fm, ts_fm, pipeline_fm, i, fm_metrics)
    do_a_cycle(sim_rand, random_uni, pipeline_lr, i, rnd_metrics)
    # plot_metric(lr_metrics)
    clear_output(wait=True)
    end_iter = time.time()
    print(f"Time of {i+1} iteration: ")
    print(end_iter - start_iter)

Time of 50 iteration: 
35.52098202705383


In [11]:
import pickle
with open('lr_metrics.pickle', 'wb') as f:
    pickle.dump(lr_metrics, f)
with open('rf_metrics.pickle', 'wb') as f:
    pickle.dump(rf_metrics, f)
with open('fm_metrics.pickle', 'wb') as f:
    pickle.dump(fm_metrics, f)
with open('rand_metrics.pickle', 'wb') as f:
    pickle.dump(rnd_metrics, f)