In [1]:
import pandas as pd
import numpy as np

from simulator.utils import pandas_to_spark

# Synthetic demo data: random user/item attribute tables plus a random
# interaction history. All sizes live in one place so the id ranges used in
# the history can never drift from the actual table sizes.
SEED = 42                  # fixed seed -> identical data on every Restart & Run All
N_USERS, N_USER_ATTRS = 1000, 35
N_ITEMS, N_ITEM_ATTRS = 200, 20
N_INTERACTIONS = 3000      # raw draws; fewer rows remain after de-duplication
N_RATING_LEVELS = 5        # ratings are integers in [0, N_RATING_LEVELS)

# Local generator: reproducible without touching NumPy's global random state.
rng = np.random.default_rng(SEED)

# User attributes ~ N(0, 1), item attributes ~ N(1, 1).
users_df = pd.DataFrame(
    data=rng.normal(0, 1, size=(N_USERS, N_USER_ATTRS)),
    columns=[f'user_attr_{i}' for i in range(N_USER_ATTRS)],
)
items_df = pd.DataFrame(
    data=rng.normal(1, 1, size=(N_ITEMS, N_ITEM_ATTRS)),
    columns=[f'item_attr_{i}' for i in range(N_ITEM_ATTRS)],
)

# Dense integer keys 0..N-1, appended as the last column of each table.
users_df['user_idx'] = np.arange(N_USERS)
items_df['item_idx'] = np.arange(N_ITEMS)

# Random (user, item, rating) triples with a constant timestamp of 0.
# A (user, item) pair may be drawn more than once; only the first occurrence
# is kept and the index is renumbered from 0.
history_df = pd.DataFrame({
    'user_idx': rng.integers(0, N_USERS, size=N_INTERACTIONS),
    'item_idx': rng.integers(0, N_ITEMS, size=N_INTERACTIONS),
    'rating': rng.integers(0, N_RATING_LEVELS, size=N_INTERACTIONS),
    'timestamp': 0,
}).drop_duplicates(subset=['user_idx', 'item_idx'], ignore_index=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from simulator.modules import Simulator
from simulator.utils import load
from pyspark.sql import SparkSession

# Spark session used by the simulator, started with the 'local[1]' master.
spark = (
    SparkSession.builder
    .appName('simulator')
    .master('local[1]')
    .getOrCreate()
)

# Alternative kept for reference: point the session at a standalone cluster
# instead of the local master.
# spark = (
#     SparkSession.builder
#     .appName('simulator')
#     .master('spark://10.32.1.50:7077')
#     .getOrCreate()
# )

# Load the three saved demo artefacts, in this order: user generator,
# item generator, response function.
user_gen, item_gen, resp_func = map(load, (
    'models/demo_user_gen.m',
    'models/demo_item_gen.m',
    'models/demo_response.m',
))

# Two named user pools; note that both are backed by the same generator object.
sim = Simulator(
    spark_session=spark,
    item_generator=item_gen,
    user_generators=dict(main=user_gen, cluster_1=user_gen),
)

# Hand the pandas tables to the simulator as Spark DataFrames. The keys of
# num_users match the pool names given to the Simulator above.
sim.init(
    user_df=pandas_to_spark(users_df),
    item_df=pandas_to_spark(items_df),
    history_df=pandas_to_spark(history_df),
    user_key_col='user_idx',
    item_key_col='item_idx',
    num_users=dict(main=100, cluster_1=50),
    num_items=10,
)

22/06/09 13:50:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/09 13:50:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2022-06-09 13:50:49.673770: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-09 13:51:03.632669: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13658 MB memory:  -> device: 0, name: A100-SXM4-40GB, pc

In [3]:
from replay.models import RandomRec

# Training population: 4 synthetic users from each pool plus 4 real users.
train_users = sim.sample_users(
    num_real_users=4,
    num_synth_users=dict(main=4, cluster_1=4),
)

# Build the training log over synthetic and real items. The response callback
# converts both of its arguments to pandas before delegating to resp_func.
# NOTE(review): the meaning of the [0.0, 0.0, 1.0] argument is not visible
# here - confirm against resp_func.
# train_users is rebound to the user frame returned alongside the log.
train_log, train_users, train_items = sim.get_train_log(
    user_df=train_users,
    response_func=lambda x, y: resp_func(x.toPandas(), y.toPandas(), [0.0, 0.0, 1.0]),
    use_real_items=True,
    use_synth_items=True,
)

# Fit RePlay's RandomRec on the generated log.
model = RandomRec()
model.fit(log=train_log)

                                                                                

In [5]:
from simulator.modules import sample_response

# One simulation round: sample users, fetch their candidate items, recommend,
# then sample responses to the recommendations. The names users, log, items,
# recs and true_resp are rebound on every pass, so after the loop they hold
# the values from the final iteration.
for i in range(1):
    # Synthetic users only: 10 from 'main', 5 from 'cluster_1'.
    users = sim.sample_users(num_synth_users=dict(main=10, cluster_1=5))

    # NOTE(review): the meaning of the positional 4 is not visible here -
    # confirm against Simulator.get_user_items.
    log, users, items = sim.get_user_items(users, 4)

    # Top-2 recommendations per user, restricted to the sampled users/items.
    recs = model.predict(
        log=log,
        users=users.select('user_idx'),
        items=items.select('item_idx'),
        k=2,
    )

    # One response model per action name; both convert their two arguments
    # to pandas before calling the underlying function.
    action_models = {
        'rated': lambda x, y: sample_response(x.toPandas(), y.toPandas(), theta=0.8),
        'relevance': lambda x, y: resp_func(x.toPandas(), y.toPandas(), [0.0, 0.0, 1.0]),
    }

    true_resp = sim.sample_responses(
        recommendations_df=recs,
        action_models=action_models,
        save_history=False,
    )



06-09 16:16:51 I deeptables.m.deeptable.py 685 - Perform prediction...
06-09 16:16:51 I deeptables.m.preprocessor.py 242 - Transform [X]...
06-09 16:16:51 I deeptables.m.preprocessor.py 249 - transform_X taken 0.07953262329101562s
06-09 16:16:51 I deeptables.m.deepmodel.py 130 - Performing predictions...
06-09 16:16:51 I deeptables.u.dataset_generator.py 240 - create dataset generator with _TFDGForPandas, batch_size=128, shuffle=False, drop_remainder=False


                                                                                

06-09 16:16:53 I deeptables.m.deeptable.py 559 - predict_proba taken 1.8718376159667969s
06-09 16:16:53 I deeptables.m.deeptable.py 594 - Reverse indicators to labels.


2022-06-09 16:16:53.477552: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
                                                                                

In [7]:
# Collect the simulator's history_df to pandas so the notebook renders it as a table.
# The preceding cell called sample_responses with save_history=False, so those
# responses are presumably not part of this history - TODO confirm.
sim.history_df.toPandas()

                                                                                

Unnamed: 0,user_idx,item_idx,relevance,timestamp
0,5,3,3.0,0
1,48,5,3.0,0
2,67,9,3.0,0
3,29,5,3.0,0


In [8]:
# Show the responses sampled in the last loop iteration as a pandas table.
true_resp.toPandas()

                                                                                

Unnamed: 0,user_idx,item_idx,rated,relevance
0,48,5,1,3.0
1,37,5,0,3.0
2,92,9,0,3.0
3,30,9,0,3.0
4,95,1,0,3.0
5,30,5,0,3.0
6,19,1,0,2.0
7,40,2,0,3.0
8,67,9,1,3.0
9,18,9,0,2.0


In [9]:
# Show the recommendations (k=2 per user) from the last loop iteration as a pandas table.
recs.toPandas()

                                                                                ]

Unnamed: 0,user_idx,item_idx,relevance
0,18,5,1.0
1,18,1,0.5
2,19,5,1.0
3,19,2,0.5
4,40,1,1.0
5,40,5,0.5
6,37,9,1.0
7,37,1,0.5
8,48,9,1.0
9,48,1,0.5
