## Synthetic users, items, history datasets

In [1]:
import pandas as pd
import numpy as np

from simulator.utils import pandas_to_spark

users_df = pd.DataFrame(data=np.random.normal(0, 1, size=(1000, 35)), columns=[f'user_attr_{i}' for i in range(35)])
items_df = pd.DataFrame(data=np.random.normal(1, 1, size=(200, 20)), columns=[f'item_attr_{i}' for i in range(20)])
users_df['user_id'] = np.arange(len(users_df))
items_df['item_id'] = np.arange(len(items_df))
history_df = pd.DataFrame()
history_df['user_id'] = np.random.randint(0, 1000, size=3000)
history_df['item_id'] = np.random.randint(0, 200, size=3000)
history_df['rating'] = np.random.randint(0, 5, size=3000)
history_df['timestamp'] = 0
history_df = history_df.drop_duplicates(subset=['user_id', 'item_id'], ignore_index=True)

  from .autonotebook import tqdm as notebook_tqdm


## Load generator and init simulator

In [2]:
from simulator.modules import Simulator
from simulator.utils import load
from pyspark.sql import SparkSession

spark = SparkSession.builder\
    .appName('simulator')\
    .master('local[1]')\
    .getOrCreate()

# spark = SparkSession.builder\
#     .appName('simulator')\
#     .master('spark://10.32.1.50:7077')\
#     .getOrCreate()

user_gen = load('models/demo_user_gen.m')
item_gen = load('models/demo_item_gen.m')
resp_func = load('models/demo_response.m')

sim = Simulator(
    user_generators={'main' : user_gen, 'cluster_1' : user_gen},
    item_generator=item_gen,
    spark_session=spark
)

sim.init(
    num_users={'main' : 100, 'cluster_1' : 50},
    num_items=10,
    user_key_col='user_id',
    item_key_col='item_id',
    user_df=pandas_to_spark(users_df),
    item_df=pandas_to_spark(items_df),
    history_df=pandas_to_spark(history_df)
)

22/06/09 11:01:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/09 11:01:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2022-06-09 11:01:35.194014: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-09 11:01:49.337964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 12145 MB memory:  -> device: 0, name: A100-SXM4-40GB, pc

In [3]:
sim.user_df.toPandas()

Unnamed: 0,user_attr_0,user_attr_1,user_attr_2,user_attr_3,user_attr_4,user_attr_5,user_attr_6,user_attr_7,user_attr_8,user_attr_9,...,user_attr_28,user_attr_29,user_attr_30,user_attr_31,user_attr_32,user_attr_33,user_attr_34,user_id,__is_synth,__cluster
0,-0.273051,-0.680168,-0.463974,0.127935,0.727117,0.490688,-0.130140,-2.600446,1.215670,-0.773716,...,0.547755,-1.042929,-0.033798,0.369477,-0.649508,1.452533,1.879369,0,0,0
1,0.830051,0.207804,-0.153530,-0.293692,0.108648,0.296974,-1.779094,0.173350,-1.316083,0.079337,...,0.456091,1.195296,1.373276,0.735152,-0.017028,0.166922,0.066420,1,0,0
2,0.322885,0.052076,-1.135067,1.062646,0.390178,-0.457007,0.198216,-0.842334,0.838178,-0.961891,...,1.113671,1.581227,0.252404,-1.596583,-0.268708,-1.692902,-0.453871,2,0,0
3,-0.521697,-1.135976,0.301597,-0.786486,-0.553351,1.015656,1.308761,0.448624,0.596315,0.600223,...,1.367203,1.046799,-1.147266,0.206719,2.429459,0.889963,0.423263,3,0,0
4,0.388453,0.732960,0.033508,-0.272537,-1.524282,-1.927247,1.277155,-0.111202,-0.617550,1.420765,...,-1.261282,0.408507,-0.805613,1.437582,0.332817,-0.564912,0.276097,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,-1.067828,-1.041181,-2.650446,-0.164911,-0.339210,-0.863141,-0.996411,-0.226604,0.537983,0.082044,...,-0.029209,-0.872534,-0.256067,-1.848091,-1.801611,-0.389125,-0.220398,1145,1,2
1146,0.007177,-0.192577,0.219897,1.255042,0.132826,-0.152801,1.074970,0.906011,2.252596,0.796913,...,0.630221,0.390196,-0.041469,1.750810,0.024758,1.743661,-0.257998,1146,1,2
1147,1.060553,0.774230,1.016231,0.611955,0.267393,-2.056729,0.825820,-0.789796,-0.482546,1.690116,...,-0.432918,1.729396,-1.613139,-2.047158,-1.260664,1.530624,-0.359631,1147,1,2
1148,-0.285332,0.169134,0.379773,-0.333600,-1.571462,-1.141036,-0.491212,-2.757202,-0.948843,-1.422348,...,-0.249316,-2.683379,0.581483,1.094286,-0.721721,-2.166030,-0.206976,1148,1,2


In [4]:
sim.item_df.toPandas()

Unnamed: 0,item_attr_0,item_attr_1,item_attr_2,item_attr_3,item_attr_4,item_attr_5,item_attr_6,item_attr_7,item_attr_8,item_attr_9,...,item_attr_12,item_attr_13,item_attr_14,item_attr_15,item_attr_16,item_attr_17,item_attr_18,item_attr_19,item_id,__is_synth
0,2.021847,0.239035,1.955474,0.888514,1.404724,0.348102,2.124406,1.425743,-1.983185,-0.021195,...,1.083519,0.925560,-0.494204,0.443225,2.079651,0.424961,1.968796,1.846050,0,0
1,0.826881,2.460224,2.729878,-0.468284,3.255827,2.292463,0.448499,1.319410,1.547617,1.006506,...,-0.011313,2.131609,0.650405,0.194700,0.556251,0.907084,0.522955,1.237403,1,0
2,-1.562596,-0.269226,1.021703,1.058269,0.155833,0.161673,1.390483,2.170699,1.278449,0.939903,...,3.652105,1.965947,-0.280615,1.077489,2.270997,2.237803,-0.457164,1.971611,2,0
3,-0.350692,2.408884,0.166032,1.354679,0.151427,0.948171,1.377985,1.996022,0.222306,2.272332,...,0.931636,1.152436,-0.323896,3.126340,1.351620,1.988612,1.024545,2.125795,3,0
4,1.521410,0.987352,1.198165,2.087700,2.051713,0.521160,2.033033,0.520930,2.410654,0.278572,...,0.011636,1.528674,1.976626,2.231582,1.227067,1.849940,0.154210,-0.529980,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0.356827,0.964256,0.719241,2.018484,1.376385,1.095376,1.875401,2.045814,1.041324,-0.831530,...,0.655697,2.416604,0.626258,2.549130,1.738445,2.767843,1.511490,2.029167,205,1
206,2.210901,0.978809,0.772584,0.828511,1.305122,0.626367,0.554083,0.210552,-0.240647,-0.275725,...,0.511994,1.004345,0.735656,-1.230223,0.376265,1.259024,0.376349,1.497307,206,1
207,3.011103,1.864067,1.662320,-0.220442,0.416764,2.000214,-0.583644,-0.429603,0.399261,1.018885,...,1.331519,0.227746,0.026967,1.550672,1.047793,2.031570,-0.845538,1.363456,207,1
208,-0.381559,1.842475,1.189865,0.481703,0.953021,-0.370344,1.003059,0.141639,1.006888,2.021386,...,2.458814,1.367089,-0.496305,1.697690,0.162865,0.973330,-1.937208,0.498442,208,1


In [5]:
sim.history_df.toPandas()

Unnamed: 0,user_id,item_id,rating,timestamp
0,633,167,3,0
1,994,106,2,0
2,600,141,0,0
3,338,171,2,0
4,850,162,0,0
...,...,...,...,...
2978,449,23,2,0
2979,708,157,2,0
2980,3,92,0,0
2981,64,134,4,0


In [11]:
users = sim.sample_users({'main' : 15, 'cluster_1' : 5}, 10)
users.toPandas()

Unnamed: 0,user_attr_0,user_attr_1,user_attr_2,user_attr_3,user_attr_4,user_attr_5,user_attr_6,user_attr_7,user_attr_8,user_attr_9,...,user_attr_26,user_attr_27,user_attr_28,user_attr_29,user_attr_30,user_attr_31,user_attr_32,user_attr_33,user_attr_34,user_id
0,1.227369,0.037482,-2.032466,0.885311,-0.411332,1.582784,-0.014773,-0.440624,-0.159077,1.583254,...,-0.428228,0.983802,0.698727,0.75363,-0.061698,1.694005,0.030347,-0.62773,-1.868268,24
1,1.300726,-0.445246,-1.308633,0.524766,0.0434,0.866265,0.279017,0.358744,2.02879,1.334234,...,-0.443807,0.462376,0.872,-0.370292,-0.685748,-1.569713,0.57103,-0.172435,-0.192718,883
2,-0.458706,0.722623,1.085536,1.61661,1.058035,-0.785637,-0.146129,-1.268639,-1.131657,0.652395,...,0.295465,0.842428,-0.313398,-1.382648,0.472162,0.756836,0.186923,-1.768644,0.19573,366
3,1.867464,-0.336633,0.161604,-1.335668,-2.948499,0.503522,-0.167375,-0.604227,-0.003459,1.821329,...,0.54057,-1.217519,1.037127,-0.061586,-1.403053,0.913325,0.355901,-0.386382,-0.575032,353
4,-0.331436,-0.695699,-0.445008,2.082642,0.571009,0.524468,0.406672,-0.360444,0.294329,0.41052,...,-0.27118,-0.34215,-0.684856,-0.831756,-0.194748,1.661263,-1.626846,-0.448058,-0.085603,271
5,-0.653365,-1.549763,-0.243127,-0.7053,0.05419,-0.318513,1.531341,-0.489834,1.257899,-0.216209,...,-1.746945,1.157319,0.197899,-0.490091,0.609163,-0.46787,-0.410355,0.572716,0.903562,38
6,-1.1918,-0.865603,-0.172631,-0.679667,-2.051394,0.500272,-1.080388,0.003229,-0.114574,0.150714,...,-0.442343,0.855296,1.041636,-0.684376,0.374603,-0.123994,-0.019998,-1.02013,-0.56534,330
7,-1.083769,0.997298,1.673615,0.885867,0.486124,-0.94649,-1.674245,0.737417,-0.649792,0.764398,...,0.500943,-1.306086,-1.372001,0.200597,0.850655,1.640866,-0.4066,0.722988,-0.625987,805
8,0.027483,0.479047,-0.141682,-0.700036,-1.176686,0.985773,-1.787594,-0.821154,0.813915,0.451587,...,0.446161,0.457479,1.26522,0.874001,-0.564837,0.767023,0.5143,1.266486,1.972709,101
9,1.194054,-1.510738,-0.761742,0.927599,-1.051338,2.069954,-0.421901,-1.162043,0.151696,-0.153908,...,0.730506,-0.860944,-0.439631,0.809904,-0.930686,0.933133,-0.707849,-2.02723,-0.816366,875


## Getting log to train model. If no real log was presented -- generate it based on some heuristic

In [12]:
log, user_matrix, item_matrix = sim.get_train_log(
    user_df=users,
    response_func=lambda x,y : resp_func(x.toPandas(), y.toPandas(), [0.0, 0.0, 1.0]),
    use_synth_items=True,
    use_real_items=True
)

print('----------------------Log----------------------')
print(log.count())
print(log.printSchema())

print('---------------------Users---------------------')
print(user_matrix.count())
print(user_matrix.printSchema())

print('---------------------Items---------------------')
print(item_matrix.count())
print(item_matrix.printSchema())

                                                                                

----------------------Log----------------------


                                                                                

42
root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: long (nullable = true)

None
---------------------Users---------------------
30
root
 |-- user_attr_0: double (nullable = true)
 |-- user_attr_1: double (nullable = true)
 |-- user_attr_2: double (nullable = true)
 |-- user_attr_3: double (nullable = true)
 |-- user_attr_4: double (nullable = true)
 |-- user_attr_5: double (nullable = true)
 |-- user_attr_6: double (nullable = true)
 |-- user_attr_7: double (nullable = true)
 |-- user_attr_8: double (nullable = true)
 |-- user_attr_9: double (nullable = true)
 |-- user_attr_10: double (nullable = true)
 |-- user_attr_11: double (nullable = true)
 |-- user_attr_12: double (nullable = true)
 |-- user_attr_13: double (nullable = true)
 |-- user_attr_14: double (nullable = true)
 |-- user_attr_15: double (nullable = true)
 |-- user_attr_16: double (nullable = true)
 |-- user_attr_17: double (nullable = true)

## Log, users and items for prediction

In [13]:
inf_log, inf_user_features, inf_item_features = sim.get_user_items(users, 2, 2)

print('----------------------Log----------------------')
print(inf_log.count())
print(inf_log.printSchema())

print('---------------------Users---------------------')
print(inf_user_features.count())
print(inf_user_features.printSchema())

print('---------------------Items---------------------')
print(inf_item_features.count())
print(inf_item_features.printSchema())

----------------------Log----------------------
2983
root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: long (nullable = true)

None
---------------------Users---------------------
30
root
 |-- user_attr_0: double (nullable = true)
 |-- user_attr_1: double (nullable = true)
 |-- user_attr_2: double (nullable = true)
 |-- user_attr_3: double (nullable = true)
 |-- user_attr_4: double (nullable = true)
 |-- user_attr_5: double (nullable = true)
 |-- user_attr_6: double (nullable = true)
 |-- user_attr_7: double (nullable = true)
 |-- user_attr_8: double (nullable = true)
 |-- user_attr_9: double (nullable = true)
 |-- user_attr_10: double (nullable = true)
 |-- user_attr_11: double (nullable = true)
 |-- user_attr_12: double (nullable = true)
 |-- user_attr_13: double (nullable = true)
 |-- user_attr_14: double (nullable = true)
 |-- user_attr_15: double (nullable = true)
 |-- user_attr_16: double (nullable =

## Evaluate actions for users on recommended items

In [14]:
import numpy as np
from simulator.modules import sample_response

recommendations = inf_user_features.select('user_id')\
                    .crossJoin(inf_item_features.select('item_id'))

sim.sample_responses(
    recommendations_df=recommendations,
    action_models={
        'rated' : lambda x,y : sample_response(x.toPandas(), y.toPandas(), 0.8),
        'relevance' : lambda x,y : resp_func(x.toPandas(), y.toPandas(), [0.0, 0.0, 1.0])
    },
).toPandas()

06-09 11:06:50 I deeptables.m.deeptable.py 685 - Perform prediction...
06-09 11:06:50 I deeptables.m.preprocessor.py 242 - Transform [X]...
06-09 11:06:50 I deeptables.m.preprocessor.py 249 - transform_X taken 0.0747230052947998s
06-09 11:06:50 I deeptables.m.deepmodel.py 130 - Performing predictions...
06-09 11:06:50 I deeptables.u.dataset_generator.py 240 - create dataset generator with _TFDGForPandas, batch_size=128, shuffle=False, drop_remainder=False
06-09 11:06:52 I deeptables.m.deeptable.py 559 - predict_proba taken 2.007401704788208s
06-09 11:06:52 I deeptables.m.deeptable.py 594 - Reverse indicators to labels.


2022-06-09 11:06:52.368573: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
                                                                                

Unnamed: 0,user_id,item_id,rated,relevance
0,330,66,0,4.0
1,805,206,1,4.0
2,1053,206,0,3.0
3,271,92,0,4.0
4,1014,66,0,3.0
...,...,...,...,...
115,1083,92,1,4.0
116,1039,206,0,3.0
117,1039,92,0,3.0
118,1014,206,0,3.0
