# Play with SOFA: the Simulator for OFfline leArning and evaluation.
## Load SOFA first

In [1]:
# Set up the configuration and load SOFA
import sys
sys.path.append('../src/')
import numpy as np
from time import time, localtime, strftime
import configparser, random

from env.env import SOFAEnv, simulated_data

def _resolve_rating_file(conf):
    """Pick the rating-matrix file matching the configured dataset and mode.

    Args:
        conf: Mapping of option names to string values, as read from the
            ``default`` section of the properties file.

    Returns:
        The path (``data.input.path`` followed by the file name) of the
        ``*_ratingM.ascii`` file to load.

    Raises:
        ValueError: If evaluation is enabled for a dataset other than
            sim4, sim5, yahoo or coat.
    """
    dataset = conf["data.input.dataset"]
    prefix = conf["data.input.path"] + dataset

    if conf['evaluation'].lower() == 'false':
        debiasing = conf["data.debiasing"]
        if debiasing != 'GT':
            return prefix + '_' + conf["data.gen_model"] + '_' + debiasing + "_ratingM.ascii"
        # NOTE: the dataset name is matched case-sensitively here, whereas the
        # evaluation branch below lower-cases it first.
        if dataset in ('sim4', 'sim5'):
            return prefix + "_GT_ratingM.ascii"
        # No ground truth for non-simulated data: fall back to the pseudo GT.
        rating_file = prefix + "_pseudoGT_ratingM.ascii"
        print("we use a pseudo GT for yahoo, which is generated by MF on unbiased testset:", rating_file)
        return rating_file

    dataset_key = dataset.lower()
    if dataset_key in ('sim4', 'sim5'):
        print('now evaluation process only for simulated dataset which has the groundTruth')
        return prefix + "_GT_ratingM.ascii"
    if dataset_key in ("yahoo", "coat"):
        # This simulator is not used for evaluation directly, but for several
        # interactions to generate states; it relies on the pseudo GT.
        rating_file = prefix + "_pseudoGT_ratingM.ascii"
        print("we use a pseudo GT for yahoo, which is generated by MF on unbiased testset:", rating_file)
        return rating_file
    # Previously this case only printed "check data" and then crashed on an
    # unassigned variable; fail with an explicit message instead.
    raise ValueError(
        "check data: no rating file is defined for evaluation on dataset %r" % dataset
    )


def _get_conf(conf_name):
    """Load ``../conf/<conf_name>.properties`` and prepare the SOFA settings.

    Reads the ``default`` section into a dict, seeds ``numpy.random`` and
    ``random`` when a ``seed`` option other than ``none`` is present, loads
    the rating matrix selected by the dataset/evaluation options and
    normalises the ``mode`` option.

    Args:
        conf_name: Base name of the properties file (without directory or
            ``.properties`` suffix).

    Returns:
        The configuration dict, extended with ``RATING_TYPE``, ``RATINGS``
        (integer ratings rounded and clipped to the range 1..5) and
        ``EPISODE_LENGTH``; ``mode`` is upper-cased, except that
        ``DOUBLEDQN`` is rewritten as ``DoubleDQN``.

    Raises:
        ValueError: If evaluation is enabled for an unsupported dataset.
    """
    config = configparser.ConfigParser()
    config.read("../conf/" + conf_name + ".properties")
    conf = dict(config.items("default"))

    if ('seed' in conf) and (conf['seed'].lower() != 'none'):
        seed = int(conf['seed'])
        np.random.seed(seed)
        random.seed(seed)

    rating_file = _resolve_rating_file(conf)

    conf["RATING_TYPE"] = conf["rating_type"]
    conf["RATINGS"] = np.clip(np.round(np.loadtxt(rating_file)).astype('int'), 1, 5)
    conf["EPISODE_LENGTH"] = conf["episode_length"]
    conf['mode'] = conf['mode'].upper()
    if conf['mode'] == 'DOUBLEDQN':
        conf['mode'] = 'DoubleDQN'
    return conf

# Read ../conf/yahoo.properties and hand the resulting settings to the SOFA environment.
conf = _get_conf('yahoo')
sofa = SOFAEnv(conf)

## Details in SOFA
The number of users and items.

In [2]:
# The action space is the item count reported by the environment.
action_space, num_users = sofa.num_items, sofa.num_users
size_report = "The number of users and items are %d and %d" % (num_users, action_space)
print(size_report)

The number of users and items are 300 and 300


## Basic operation of SOFA
### Reset 
Load a new user and set an empty state.
### Step: 
Receive an action from the recommendation policy and return the state, the reward, and `done` (a signal indicating whether the user has left the system).


In [3]:
# A user enters the system: reset SOFA for that user.
user = 1
sofa.reset(user)

# Run a 10-turn recommendation session, drawing each item at random.
round_report = "Round-%d, recommend item %d to user %d, and we can observe reward %d"
for i in range(10):
    action = np.random.randint(action_space)
    state, reward, done = sofa.step(action)
    print(round_report % (i, action, user, reward))

print("The last state is", state)

Round-0, recommend item 0 to user 1, and we can observe reward 0
Round-1, recommend item 83 to user 1, and we can observe reward 0
Round-2, recommend item 235 to user 1, and we can observe reward 0
Round-3, recommend item 282 to user 1, and we can observe reward 0
Round-4, recommend item 74 to user 1, and we can observe reward 1
Round-5, recommend item 250 to user 1, and we can observe reward 0
Round-6, recommend item 56 to user 1, and we can observe reward 0
Round-7, recommend item 131 to user 1, and we can observe reward 0
Round-8, recommend item 79 to user 1, and we can observe reward 0
Round-9, recommend item 156 to user 1, and we can observe reward 1
The last state is [[0, 83, 235, 282, 74, 250, 56, 131, 79, 156], [0, 0, 0, 0, 1, 0, 0, 0, 0, 1]]


In [5]:
# Recommending the same item repeatedly: the simulated user does not click it twice.
# A user enters the system: reset SOFA for that user.
user = 1
sofa.reset(user)

# Run a 10-turn session that always recommends the same item.
action = 74
round_report = "Round-%d, recommend item %d to user %d, and we can observe reward %d"
for i in range(10):
    state, reward, done = sofa.step(action)
    print(round_report % (i, action, user, reward))

print("The last state is", state)

Round-0, recommend item 74 to user 1, and we can observe reward 1
Round-1, recommend item 74 to user 1, and we can observe reward 0
Round-2, recommend item 74 to user 1, and we can observe reward 0
Round-3, recommend item 74 to user 1, and we can observe reward 0
Round-4, recommend item 74 to user 1, and we can observe reward 0
Round-5, recommend item 74 to user 1, and we can observe reward 0
Round-6, recommend item 74 to user 1, and we can observe reward 0
Round-7, recommend item 74 to user 1, and we can observe reward 0
Round-8, recommend item 74 to user 1, and we can observe reward 0
Round-9, recommend item 74 to user 1, and we can observe reward 0
The last state is [[74, 74, 74, 74, 74, 74, 74, 74, 74, 74], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
