In [1]:
from recommender import Recommender
import pandas as pd 
import numpy as np
import plotly.io as pio
import plotly.offline as pyo
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split

# Initialise Plotly's offline notebook mode; the figure below is displayed with pyo.iplot.
# NOTE(review): per the Plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding it in the notebook (needs network access) — confirm for
# the installed Plotly version.
pyo.init_notebook_mode(connected=True)

In [2]:
# load the three raw datasets; each file is read as line-delimited JSON records
portfolio, profile, transcript = (
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        'example_data/portfolio.json',
        'example_data/profile.json',
        'example_data/transcript.json',
    )
)

In [None]:
# instantiate a recommender object
rec = Recommender()

# fit the recommender to the raw profile, portfolio and transcript frames
# NOTE(review): fit() presumably also cleans its inputs, since the clean_*
# attributes are read from `rec` in the next cell — confirm in recommender.py
rec.fit(profile, portfolio, transcript)

In [None]:
# pull the cleaned frames off the fitted recommender
clean_portfolio, clean_profile, clean_transcript = (
    rec.clean_portfolio,
    rec.clean_profile,
    rec.clean_transcript,
)

In [None]:
# positional (unshuffled) split of the cleaned transcript:
# the leading share of rows is used for training, the remaining rows for testing
train_percentage = 0.8
train_length = round(train_percentage * len(clean_transcript))
test_length = len(clean_transcript) - train_length

# rows [0, train_length) -> train, rows [train_length, end) -> test
train_transcript = clean_transcript.iloc[:train_length]
test_transcript = clean_transcript.iloc[train_length:]

In [None]:
# build one user-item matrix per split from the same cleaned profile and portfolio
train_user_item, test_user_item = (
    rec._user_item(clean_profile, clean_portfolio, split_transcript)
    for split_transcript in (train_transcript, test_transcript)
)

# row labels (users) and column labels (offers) present in both matrices
common_users = set(train_user_item.index).intersection(test_user_item.index)
common_offers = set(train_user_item.columns).intersection(test_user_item.columns)

In [None]:
# define the parameter space for the grid search
latent_feature_space = [2, 5, 10, 20, 30, 40, 50, 100]
learning_rate_space = [1e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2]

# one error value per parameter setting:
# rows follow latent_feature_space, columns follow learning_rate_space.
# Despite the name, each entry is the squared error averaged over the compared cells.
summed_squared_error = np.zeros((len(latent_feature_space), len(learning_rate_space)))

# the held-out reference does not depend on the hyper-parameters, so slice it once
test_common = test_user_item.loc[test_user_item.index.isin(common_users),
                                 test_user_item.columns.isin(common_offers)]

for ii, latent_features in enumerate(latent_feature_space):
    for jj, learning_rate in enumerate(learning_rate_space):
        # fill the train user item matrix
        train_full_user_item = rec._matrix_factorization(train_user_item, iters=100,
                                                         latent_features=latent_features,
                                                         learning_rate=learning_rate)

        # restrict the filled matrix to the users/offers shared with the test matrix
        train_common = train_full_user_item.loc[train_full_user_item.index.isin(common_users),
                                                train_full_user_item.columns.isin(common_offers)]

        # squared difference per cell; pandas aligns both frames on their labels
        squared_error = (train_common - test_common) ** 2

        # average over the cells that actually hold a value.
        # (The previous version divided by the number of *missing* cells.)
        observed = squared_error.notnull().values
        N = observed.sum()

        # no comparable cell at all -> record NaN instead of dividing by zero
        summed_squared_error[ii, jj] = squared_error.values[observed].sum() / N if N else np.nan

In [None]:
# one trace per learning rate: error as a function of the number of latent features
data = []  # start from an empty trace list so the cell also runs on a fresh kernel
for jj, learning_rate in enumerate(learning_rate_space):
    data.append(
        go.Scatter(
            x=latent_feature_space,
            # column jj holds the error of every latent-feature setting for this
            # learning rate (rows = latent features, columns = learning rates)
            y=summed_squared_error[:, jj],
            opacity=1,
            # raw string: in a normal literal "\a" is the BEL control character
            name=r"$\alpha$ = " + str(learning_rate),
        )
    )

layout = go.Layout(
    autosize=False,
    width=500,
    height=500,
    title="",
    font=dict(
        size=18,
        color='rgb(0, 0, 0)',
    ),
    xaxis=dict(
        title='Latent features',
    ),
    yaxis=dict(
        # the plotted quantity is the grid-search error, not a frequency
        title='Squared error',
    ),
)

fig_opti_scatter = go.Figure(data=data, layout=layout)

pyo.iplot(fig_opti_scatter)

In [None]:
# export the grid-search figure as a PNG; the figure is bound to `fig_opti_scatter`
# in the plotting cell (`fig_optimize_scatter` is never defined and raised NameError)
pio.write_image(fig_opti_scatter, 'exports/fig_opti_scatter.png')