In [7]:
from recommender import Recommender
import pandas as pd 
import numpy as np
import plotly.io as pio
import plotly.offline as pyo
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from time import time

# Initialise plotly's offline notebook mode so the pyo.iplot(...) calls further
# down can render figures inline. Per the plotly docs, connected=True loads
# plotly.js from a CDN instead of embedding it in the notebook (needs network
# access when the notebook is viewed).
pyo.init_notebook_mode(connected=True)

In [2]:
# Load the three raw, line-delimited JSON exports (one record per line),
# in the order portfolio -> profile -> transcript.
portfolio, profile, transcript = (
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        'example_data/portfolio.json',
        'example_data/profile.json',
        'example_data/transcript.json',
    )
)

In [3]:
# instantiate a recommender object
# (Recommender is the project-local class imported from the `recommender` module)
rec = Recommender()

# fit the recommender to the data
# Arguments are positional, in the order profile, portfolio, transcript.
# The following cell reads rec.clean_portfolio / rec.clean_profile /
# rec.clean_transcript, so fit() presumably populates those attributes --
# TODO confirm against recommender.py.
rec.fit(profile, portfolio, transcript)

In [4]:
# Pull the cleaned frames off the fitted recommender so later cells can use
# them without reaching into `rec` each time.
clean_portfolio, clean_profile, clean_transcript = (
    rec.clean_portfolio,
    rec.clean_profile,
    rec.clean_transcript,
)

In [5]:
# Split the cleaned transcript positionally: the first 80% of its rows become
# the training set and the remaining rows the test set (no shuffling).
train_percentage = 0.8
train_length = round(len(clean_transcript) * train_percentage)
test_length = len(clean_transcript) - train_length

train_transcript = clean_transcript.head(train_length)
test_transcript = clean_transcript.tail(test_length)

In [6]:
# Build one user-item matrix per split from the same cleaned profile/portfolio.
train_user_item = rec._user_item(clean_profile, clean_portfolio, train_transcript)
test_user_item = rec._user_item(clean_profile, clean_portfolio, test_transcript)

# Only users (rows) and offers (columns) present in BOTH matrices can be
# compared when scoring predictions against the test split.
common_users = set(train_user_item.index).intersection(test_user_item.index)
common_offers = set(train_user_item.columns).intersection(test_user_item.columns)

In [56]:
# define the parameter space for the grid search
latent_feature_space = [5, 10, 20, 30, 40, 50, 100]
learning_rate_space = [1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3]

# Error grid: rows follow latent_feature_space, columns follow learning_rate_space.
# NOTE: despite the name, each entry is the squared error *averaged* over the
# comparable (non-null) entries, i.e. sum / count, not a raw sum.
summed_squared_error = np.zeros((len(latent_feature_space), len(learning_rate_space)))

# The test slice restricted to the shared users/offers does not depend on the
# hyperparameters, so build it once instead of once per grid point.
test_common = test_user_item.loc[test_user_item.index.isin(common_users),
                                 test_user_item.columns.isin(common_offers)]

# perform parameter grid search
for ii, latent_features in enumerate(latent_feature_space):
    for jj, learning_rate in enumerate(learning_rate_space):
        # get the start time of the current loop
        start_time = time()

        # fill the train user item matrix
        train_full_user_item = rec._matrix_factorization(train_user_item, iters=20,
                                                         latent_features=latent_features,
                                                         learning_rate=learning_rate)

        # squared difference between predictions and held-out values on the
        # shared users/offers; pandas aligns both frames on their labels
        train_common = train_full_user_item.loc[train_full_user_item.index.isin(common_users),
                                                train_full_user_item.columns.isin(common_offers)]
        squared_error = (train_common - test_common) ** 2

        # Average over the entries that actually have a value. The divisor must
        # be the number of NON-null entries; dividing by the null count (as this
        # cell previously did) scales the error by an unrelated quantity.
        observed = squared_error.notnull().values
        N = observed.sum()
        # No comparable entry at all -> the error is undefined, not zero.
        summed_squared_error[ii, jj] = squared_error.values[observed].sum() / N if N else np.nan

        # display loop information
        print("Latent features: {0}".format(latent_features))
        print("Learning rate: {0}".format(learning_rate))
        print("Loop time: {0:.2f}".format(time() - start_time))
        print("--------------------")

Latent features: 5
Learning rate: 1e-05
Loop time: 8.19
--------------------
Latent features: 5
Learning rate: 2e-05
Loop time: 8.21
--------------------
Latent features: 5
Learning rate: 5e-05
Loop time: 7.95
--------------------
Latent features: 5
Learning rate: 0.0001
Loop time: 7.92
--------------------
Latent features: 5
Learning rate: 0.0002
Loop time: 7.97
--------------------
Latent features: 5
Learning rate: 0.0005
Loop time: 7.85
--------------------



overflow encountered in double_scalars


overflow encountered in double_scalars


overflow encountered in double_scalars



Latent features: 5
Learning rate: 0.001
Loop time: 7.13
--------------------
Latent features: 10
Learning rate: 1e-05
Loop time: 11.53
--------------------
Latent features: 10
Learning rate: 2e-05
Loop time: 11.59
--------------------
Latent features: 10
Learning rate: 5e-05
Loop time: 13.60
--------------------
Latent features: 10
Learning rate: 0.0001
Loop time: 18.78
--------------------
Latent features: 10
Learning rate: 0.0002
Loop time: 12.45
--------------------
Latent features: 10
Learning rate: 0.0005
Loop time: 11.58
--------------------
Latent features: 10
Learning rate: 0.001
Loop time: 10.97
--------------------
Latent features: 20
Learning rate: 1e-05
Loop time: 18.62
--------------------
Latent features: 20
Learning rate: 2e-05
Loop time: 20.02
--------------------
Latent features: 20
Learning rate: 5e-05
Loop time: 19.47
--------------------
Latent features: 20
Learning rate: 0.0001
Loop time: 19.37
--------------------
Latent features: 20
Learning rate: 0.0002
Loop tim

In [57]:
# Plot the grid-search error against the number of latent features, with one
# trace per learning rate.
#
# The grid-search cell stores the error as [latent_feature, learning_rate], so
# the curve for learning rate jj is COLUMN jj. (Reading row jj instead would
# pair a latent-feature slice with a learning-rate legend label.)

# Guard against stale kernel state: the iteration-sweep cell reuses the name
# `summed_squared_error` for a differently shaped array.
expected_shape = (len(latent_feature_space), len(learning_rate_space))
if summed_squared_error.shape != expected_shape:
    raise ValueError(
        "summed_squared_error has shape {0}, expected {1}; "
        "re-run the grid search cell before plotting".format(
            summed_squared_error.shape, expected_shape))

data = []

for jj, learning_rate in enumerate(learning_rate_space):
    data.append(
        go.Scatter(
            x=latent_feature_space,
            y=summed_squared_error[:, jj],
            opacity=1,
            name="$\\alpha = {0}$".format(learning_rate)
        )
    )

layout = go.Layout(
    autosize=False,
    width=600,
    height=500,
    title="",
    font=dict(
        size=18,
        color='rgb(0, 0, 0)',
    ),
    xaxis=dict(
        title='Latent features',
    ),
    yaxis=dict(
        title='SSE',
    ),
    legend=dict(
        font=dict(size=14)
    )
)

fig_grid_search_scatter = go.Figure(data=data, layout=layout)

pyo.iplot(fig_grid_search_scatter)

In [38]:
# define the range of iterations
iters_space = [10, 20, 50, 100, 200, 500]

# One error value per iteration count, kept as a column vector.
# NOTE: this reuses (and overwrites) the name `summed_squared_error` from the
# grid-search cell; despite the name the stored value is sum / count, i.e. the
# squared error averaged over the comparable entries.
summed_squared_error = np.zeros((len(iters_space), 1))

# The test slice restricted to the shared users/offers does not depend on the
# iteration count, so build it once.
test_common = test_user_item.loc[test_user_item.index.isin(common_users),
                                 test_user_item.columns.isin(common_offers)]

# loop through the different numbers of iterations
for ii, iters in enumerate(iters_space):
    # get the start time of the current loop
    start_time = time()

    # fill the train user item matrix (latent features and learning rate are
    # held fixed while only the iteration count varies)
    train_full_user_item = rec._matrix_factorization(train_user_item, iters=iters,
                                                     latent_features=35,
                                                     learning_rate=2e-4)

    # squared difference between predictions and held-out values on the
    # shared users/offers; pandas aligns both frames on their labels
    train_common = train_full_user_item.loc[train_full_user_item.index.isin(common_users),
                                            train_full_user_item.columns.isin(common_offers)]
    squared_error = (train_common - test_common) ** 2

    # Average over the entries that actually have a value. The divisor must be
    # the number of NON-null entries; dividing by the null count (as this cell
    # previously did) scales the error by an unrelated quantity.
    observed = squared_error.notnull().values
    N = observed.sum()
    # No comparable entry at all -> the error is undefined, not zero.
    summed_squared_error[ii] = squared_error.values[observed].sum() / N if N else np.nan

    # display loop information
    print("Iterations: {0}".format(iters))
    print("Loop time: {0:.2f}".format(time() - start_time))
    print("--------------------")

Iterations: 10
Loop time: 14.70
--------------------
Iterations: 20
Loop time: 28.97
--------------------
Iterations: 50
Loop time: 74.11
--------------------
Iterations: 100
Loop time: 162.00
--------------------
Iterations: 200
Loop time: 323.86
--------------------
Iterations: 500
Loop time: 757.57
--------------------


In [54]:
# Figure chrome: fixed 600x500 canvas, black 18pt font, labelled axes, no title.
layout = go.Layout(
    autosize=False,
    width=600,
    height=500,
    title="",
    font={'size': 18, 'color': 'rgb(0, 0, 0)'},
    xaxis={'title': 'Iterations'},
    yaxis={'title': 'SSE'},
)

# Single line: error per iteration count. The error array is a column vector,
# so it is flattened to 1-D for plotting.
trace = go.Scatter(x=iters_space, y=summed_squared_error.flatten(), opacity=1)
data = [trace]

fig_iters_scatter = go.Figure(data=data, layout=layout)

pyo.iplot(fig_iters_scatter)

In [58]:
# Export both figures as static PNG files under exports/
# (iteration sweep first, then the grid search).
pio.write_image(fig=fig_iters_scatter, file='exports/fig_iters_scatter.png')
pio.write_image(fig=fig_grid_search_scatter, file='exports/fig_grid_search_scatter.png')