script from [here](https://getstream.io/blog/factorization-recommendation-systems/)

In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
from collections import Counter 

from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


  from ._conv import register_converters as _register_converters


In [2]:
buys = open('yoochoose-buys.dat')
clicks = open('yoochoose-clicks.dat')

initial_buys_df = pd.read_csv(buys, 
                              names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
                              dtype={
                                  'Session ID': 'int32',
                                  'Timestamp': 'str',
                                  'Item ID': 'int32',
                                  'Category': 'category'
                              })
initial_buys_df.set_index('Session ID', inplace=True)


initial_clicks_df = pd.read_csv(clicks, 
                                names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
                                dtype={'Category': 'category'})
initial_clicks_df.set_index('Session ID', inplace=True)

initial_buys_df = initial_buys_df.drop('Timestamp', 1)
initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)

In [5]:
initial_buys_df.dtypes

Item ID        int32
Category    category
Quantity       int64
dtype: object

In [6]:
initial_buys_df.head()

Unnamed: 0_level_0,Item ID,Category,Quantity
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
420374,214537888,12462,1
420374,214537850,10471,1
281626,214535653,1883,1
420368,214530572,6073,1
420368,214835025,2617,1


In [7]:
# mask = initial_buys_df.Quantity==0
# initial_buys_df[~mask]

In [8]:
initial_clicks_df.dtypes

Item ID        int64
Category    category
dtype: object

In [9]:
initial_clicks_df.head()

Unnamed: 0_level_0,Item ID,Category
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,214536502,0
1,214536500,0
1,214536506,0
1,214577561,0
2,214662742,0


pick most common session (top 10000 in `initial_buy_df`), reconstruct smaller df

In [10]:
x = Counter(initial_buys_df.index).most_common(10000)
top_k = dict(x).keys()

In [11]:
top_k_idx = initial_buys_df[['Item ID']].groupby(initial_buys_df.index).count().rename(columns={'Item ID':'cnt'}).sort_values(by='cnt',ascending=False).index[:10000]

In [12]:
temp = set(top_k_idx ) and set(top_k)
assert len(temp)==10000, 'oops, not the same '

In [13]:
initial_buys_df   = initial_buys_df.loc[top_k_idx]
initial_clicks_df = initial_clicks_df.loc[top_k_idx]

In [14]:
initial_buys_df.shape

(106956, 3)

In [15]:
initial_clicks_df.shape

(207783, 2)

In [16]:
initial_buys_df['_Session ID'] = initial_buys_df.index

 we can introduce historical engagement data into our FM model. 

In [17]:
transformed_buys = pd.get_dummies(initial_buys_df)
transformed_clicks = pd.get_dummies(initial_clicks_df)

In [18]:
filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")

In [19]:
filtered_buys.groupby(filtered_buys.index)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000082BAE35D30>

In [21]:
tmp = filtered_buys.loc[932,].filter(regex='Category.*')
tmp.sum()[tmp.sum()!=0]

Category_1360    2
Category_187     2
Category_523     2
Category_680     2
Category_732     2
dtype: int64

In [22]:
tmp.loc[:,['Category_1360','Category_187','Category_523']]

Unnamed: 0_level_0,Category_1360,Category_187,Category_523
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
932,1,0,0
932,0,0,1
932,0,0,0
932,0,0,0
932,0,1,0
932,0,0,0
932,1,0,0
932,0,0,1
932,0,0,0
932,0,1,0


In [23]:
historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)

historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)

In [24]:
print('shape of history buy:',historical_buy_data.shape)
print('shape of filtered buy:',filtered_buys.shape)

print('shape of history click:',historical_click_data.shape)

shape of history buy: (10000, 736)
shape of filtered buy: (106956, 736)
shape of history click: (10000, 340)


In [25]:
historical_buy_data.head()

Unnamed: 0_level_0,buy history:Item ID,buy history:Category_0,buy history:Category_10052,buy history:Category_1015,buy history:Category_10157,buy history:Category_1024,buy history:Category_10261,buy history:Category_1036,buy history:Category_10366,buy history:Category_10367,...,buy history:Category_61,buy history:Category_69010,buy history:Category_75,buy history:Category_750,buy history:Category_771,buy history:Category_787,buy history:Category_83,buy history:Category_868,buy history:Category_915,buy history:Category_973
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
932,2148020000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3302,1716957000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3687,1717444000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3889,2148006000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4451,1718581000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
tmp = historical_buy_data.loc[932,].filter(regex='Category.*')
tmp.sum()[tmp.sum()!=0]



array([10.])

In [27]:
tmp[tmp!=0]

buy history:Category_1360    2.0
buy history:Category_187     2.0
buy history:Category_523     2.0
buy history:Category_680     2.0
buy history:Category_732     2.0
Name: 932, dtype: float64

In [28]:
merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)

merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'], 1, inplace=True)

In [29]:
merged2.columns[1500:]

Index(['click history:Category_6', 'click history:Category_7',
       'click history:Category_9', 'click history:Category_S',
       'click history:Category_2088894828',
       'click history:Category_2088900589',
       'click history:Category_2088901091',
       'click history:Category_2088903330',
       'click history:Category_2088918545',
       'click history:Category_2088918717',
       ...
       'click history:Category_2089251022',
       'click history:Category_2089084275',
       'click history:Category_2089759631',
       'click history:Category_2089314197',
       'click history:Category_2088970021', 'click history:Category_8',
       'click history:Category_2089580928', 'click history:Category_193469695',
       'click history:Category_2089573539',
       'click history:Category_2089314263'],
      dtype='object', length=310)

In [30]:
merged2.head()

Unnamed: 0_level_0,Quantity,Category_0,Category_10052,Category_1015,Category_10157,Category_1024,Category_10261,Category_1036,Category_10366,Category_10367,...,click history:Category_2089251022,click history:Category_2089084275,click history:Category_2089759631,click history:Category_2089314197,click history:Category_2088970021,click history:Category_8,click history:Category_2089580928,click history:Category_193469695,click history:Category_2089573539,click history:Category_2089314263
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
932,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
932,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


tffm

In [31]:
model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)

In [32]:
X = np.array(merged2.iloc[:,1:])
X = np.nan_to_num(X)
y = np.array(merged2['Quantity'].values)

In [33]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)

In [34]:
merged2.shape

(106956, 1810)

In [35]:
X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5)

In [36]:
cold_start = pd.DataFrame(X_te_cs, columns=merged2.columns[1:])

In [37]:
for column in cold_start.columns:
    if ('buy' in column or 'click' in column) and ('Category' not in column):
        cold_start[column] = 0

In [51]:
# for column in cold_start.columns:
#     if ('buy' in column or 'click' in column): #and ('Category' not in column):
#         print(column)

In [49]:
cold_start.columns

Index(['Category_0', 'Category_10052', 'Category_1015', 'Category_10157',
       'Category_1024', 'Category_10261', 'Category_1036', 'Category_10366',
       'Category_10367', 'Category_1037',
       ...
       'click history:Category_2089251022',
       'click history:Category_2089084275',
       'click history:Category_2089759631',
       'click history:Category_2089314197',
       'click history:Category_2088970021', 'click history:Category_8',
       'click history:Category_2089580928', 'click history:Category_193469695',
       'click history:Category_2089573539',
       'click history:Category_2089314263'],
      dtype='object', length=1809)

In [38]:
X_tr.shape

(85564, 1809)

In [39]:
y_tr.shape

(85564,)

In [40]:
X_tr.shape[0] == y_tr.shape[0]

True

In [41]:
model.fit(X_tr, y_tr, show_progress=True)



100%|█████████████████████████████████████| 100/100 [02:04<00:00,  1.25s/epoch]


In [42]:
# model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)

cold_start_predictions = model.predict(X_te_cs)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))
print('Cold-start MSE: {}'.format(mean_squared_error(y_te_cs, cold_start_predictions)))
# model.destroy()

MSE: 1.1836070140113997
Cold-start MSE: 1.197469593890877
