# Use LightFM for all data and the most popular items for cold-start recommendations; clean the data before training


In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
NUM_CORE = 6 
from lightfm import LightFM

In [2]:
import sys
sys.path.append('..')
from metrics import apk, mapk, precision_at_k, recall_at_k
from utils import get_train_test_data_split, pd_to_scr_matrix
from collections import defaultdict

In [3]:
# Read the interaction log and convert its 'date' column to datetimes.
data = pd.read_csv('../../data/recsys_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [4]:
# Split the log into train and test with the project helper; a start timestamp
# and test_days=1 are passed in (exact split semantics are defined in utils —
# presumably everything before val_start is train; confirm against the helper).
train, test  = get_train_test_data_split(data, val_start = pd.Timestamp('2020-06-27 00:00:00'), test_days= 1 )

In [5]:
# The three surveys with the most status == 1 rows in train; later used as
# the fallback recommendation for users without a model prediction.
most_popular_items = (
    train.loc[train['status'] == 1, 'survey']
    .value_counts()
    .index[:3]
    .tolist()
)

In [6]:
most_popular_items

[100000513, 100000017, 100000040]

In [7]:
# Ground truth for evaluation: for every test user, the list of surveys with
# status == 1. Selecting the 'survey' column before apply keeps the grouping
# column out of the applied function, and the result column is named directly
# instead of being renamed by position afterwards.
test_actual_results = (
    test.loc[test['status'] == 1]
    .groupby('user')['survey']
    .apply(list)
    .reset_index(name='true')
)



In [8]:
test_actual_results

Unnamed: 0,user,true
0,57141,[100002475]
1,61299,[100000427]
2,64059,"[100006888, 100010236]"
3,65613,"[100006556, 100003291, 100001173]"
4,73206,"[100016937, 100009234]"
...,...,...
25129,62072229,[100001246]
25130,62072244,[100001246]
25131,62072286,[100022773]
25132,62072334,[100005041]


In [9]:
# Delete users that have a lot of unsuccessful surveys, and surveys that no one can complete

In [10]:
# Mean and row count of `status` for every user in train.
data_user_status = train.groupby('user')['status'].agg(
    mean_status='mean',
    count_status='count',
)

In [15]:
# Users with more than 100 train rows whose mean status is exactly 0.
bad_users = data_user_status[
    (data_user_status['count_status'] > 100)
    & (data_user_status['mean_status'] == 0)
].index.tolist()

In [16]:
# Mean and row count of `status` for every survey.
# NOTE(review): this is computed on the full `data` frame, whereas the per-user
# statistics are computed on `train` only — confirm that including the test
# period here is intended.
data_survey_status = data.groupby('survey')['status'].agg(
    mean_status='mean',
    count_status='count',
)

In [17]:
data_survey_status

Unnamed: 0_level_0,mean_status,count_status
survey,Unnamed: 1_level_1,Unnamed: 2_level_1
100000000,0.000000,12
100000001,0.000000,18
100000002,0.112245,98
100000003,0.125000,152
100000004,0.247892,593
...,...,...
100042440,0.000000,1
100042441,0.000000,1
100042442,0.000000,1
100042443,1.000000,1


In [19]:
# Surveys with more than 100 rows whose mean status is exactly 0.
bad_surveys = data_survey_status[
    (data_survey_status['count_status'] > 100)
    & (data_survey_status['mean_status'] == 0)
].index.tolist()

In [21]:
# Keep only the rows whose user is not in bad_users.
train = train[~train['user'].isin(bad_users)]

In [22]:
# Keep only the rows whose survey is not in bad_surveys.
train = train[~train['survey'].isin(bad_surveys)]

In [24]:
#train after filtration
train

Unnamed: 0,user,survey,status,date
0,46120029,100000000,0,2020-06-25 16:08:06
1,61685181,100000001,0,2020-06-24 12:29:43
2,61900560,100000002,0,2020-06-24 20:35:37
3,26620554,100000003,0,2020-06-24 22:04:23
4,61973223,100000004,1,2020-06-25 15:15:02
...,...,...,...,...
873900,62031414,100002347,0,2020-06-26 23:42:15
873901,35385993,100000026,0,2020-06-26 23:15:44
873903,43229514,100000206,0,2020-06-26 22:26:04
873904,62010381,100002446,1,2020-06-26 23:24:56


In [25]:
# Map each user to the list of surveys they have in train. This is built
# before train is reduced to status == 1 rows, so it covers every status.
previously_seen_items = (
    train.groupby('user')['survey']
    .apply(list)
    .to_dict()
)

In [26]:
# From here on train holds only the status == 1 interactions.
train = train[train['status'].eq(1)]

In [27]:
# Turn the filtered train frame into a sparse interaction matrix via the
# project helper, which also returns the categorical dtypes for users and
# surveys (presumably their category order matches the matrix rows/columns —
# confirm against utils.pd_to_scr_matrix).
train_matrix, user_id_cats, survey_id_cats= pd_to_scr_matrix(train)

In [28]:
train_matrix

<38275x13166 sparse matrix of type '<class 'numpy.int64'>'
	with 138362 stored elements in Compressed Sparse Row format>

In [29]:
user_id_cats

CategoricalDtype(categories=[   57141,    61299,    63513,    64407,    65613,    66771,
                     73206,    73368,    73734,    83121,
                  ...
                  62031813, 62031843, 62031858, 62031876, 62031939, 62032119,
                  62032227, 62032293, 62032341, 62032347],
, ordered=True)

In [30]:
survey_id_cats

CategoricalDtype(categories=[100000002, 100000003, 100000004, 100000005, 100000006,
                  100000007, 100000008, 100000009, 100000010, 100000011,
                  ...
                  100042350, 100042351, 100042354, 100042363, 100042364,
                  100042371, 100042374, 100042380, 100042381, 100042425],
, ordered=True)

In [31]:
# Create a LightFM model that uses the 'warp' loss; all other
# hyper-parameters are left at the library defaults.
model = LightFM(loss='warp')

In [32]:
# Train on the interaction matrix for 30 epochs.
model.fit(train_matrix, epochs=30)

<lightfm.lightfm.LightFM at 0x7f7baddaf2b0>

In [33]:
# Matrix dimensions: rows are users, columns are items (surveys).
n_users, n_items = train_matrix.shape

In [34]:
# Show how many users and items the model was trained on.
print(n_users, n_items)

38275 13166


In [35]:
def mp_predict_with_seen(data, k=3):
    """
    Recommend the top ``k`` surveys for one user, skipping already seen ones.

    The function takes a single packed argument so that it can be mapped over
    a ``multiprocessing`` pool. It relies on the module-level ``model``,
    ``n_items`` and ``survey_id_cats``.

    Parameters
    ----------
    data : tuple
        ``(user_id, previous_items)`` where ``user_id`` is the integer user
        index passed to ``model.predict`` and ``previous_items`` is an
        iterable of survey ids that must not be recommended again.
    k : int, default 3
        Maximum number of surveys to return.

    Returns
    -------
    list
        Up to ``k`` survey ids ordered from the highest to the lowest score.
    """
    user_id, previous_items = data
    # A set keeps the membership test O(1) for each of the n_items candidates.
    seen = set(previous_items)
    scores = model.predict(user_id, np.arange(n_items))
    # argsort on the negated scores ranks the items from best to worst.
    ranked_items = list(survey_id_cats.categories[np.argsort(-scores)])
    top_items = [item for item in ranked_items if item not in seen]
    return top_items[:k]


In [36]:
# Pair every internal user index with the surveys that user already has in
# train; users without an entry fall back to an empty list.
user_id_previously_seen_items = defaultdict(
    list,
    {
        idx: previously_seen_items.get(uid, [])
        for idx, uid in enumerate(user_id_cats.categories)
    },
)
x = [(idx, user_id_previously_seen_items[idx]) for idx in range(n_users)]

In [37]:
# Make predictions for all users in the train dataset in parallel.
# The context manager shuts the pool down even if a worker raises, so no
# worker processes are left behind; pool.map blocks until every result is in.
with mp.Pool(NUM_CORE) as pool:
    list_of_results_seen = pool.map(mp_predict_with_seen, x)

In [38]:
list_of_results_seen[0:5]

[[100000482, 100000408, 100000630],
 [100000009, 100000616, 100000442],
 [100001892, 100000513, 100001727],
 [100001892, 100000385, 100000079],
 [100000513, 100000017, 100000135]]

In [39]:
# One row per training user: the original user id and its list of recommendations.
predictions = pd.DataFrame(
    {
        'user': user_id_cats.categories,
        'predicted': list_of_results_seen,
    }
)

In [40]:
predictions

Unnamed: 0,user,predicted
0,57141,"[100000482, 100000408, 100000630]"
1,61299,"[100000009, 100000616, 100000442]"
2,63513,"[100001892, 100000513, 100001727]"
3,64407,"[100001892, 100000385, 100000079]"
4,65613,"[100000513, 100000017, 100000135]"
...,...,...
38270,62032119,"[100000307, 100000114, 100000513]"
38271,62032227,"[100000513, 100001524, 100000690]"
38272,62032293,"[100000017, 100000726, 100000513]"
38273,62032341,"[100002534, 100003193, 100000017]"


In [41]:
# Left join keeps every test user; users without a model prediction get NaN.
test_actual_results = test_actual_results.merge(
    predictions,
    on='user',
    how='left',
)

In [42]:
test_actual_results

Unnamed: 0,user,true,predicted
0,57141,[100002475],"[100000482, 100000408, 100000630]"
1,61299,[100000427],"[100000009, 100000616, 100000442]"
2,64059,"[100006888, 100010236]",
3,65613,"[100006556, 100003291, 100001173]","[100000513, 100000017, 100000135]"
4,73206,"[100016937, 100009234]","[100000482, 100000408, 100002534]"
...,...,...,...
25129,62072229,[100001246],
25130,62072244,[100001246],
25131,62072286,[100022773],
25132,62072334,[100005041],


### Look at metrics for non-cold-start predictions

In [43]:
# Evaluate only the test users that received a model prediction
# (non-null 'predicted' after the left merge).
has_prediction = test_actual_results['predicted'].notna()
warm_results = test_actual_results.loc[has_prediction, :]

only_lfm_mapk = mapk(warm_results['true'], warm_results['predicted'], k=3)

only_lfm_precision_at_k = np.mean(
    warm_results.apply(
        lambda row: precision_at_k(row['true'], row['predicted'], k=3),
        axis=1,
    )
)

only_lfm_recall_at_k = np.mean(
    warm_results.apply(
        lambda row: recall_at_k(row['true'], row['predicted'], k=3),
        axis=1,
    )
)

In [44]:
# Report the metrics for users with a model prediction, to five decimals.
print(f'only lightfm map@k = {only_lfm_mapk:.5f} ')
print(f'only lightfm precision@k = {only_lfm_precision_at_k:.5f}')
print(f'only lightfm recall@k = {only_lfm_recall_at_k:.5f}')

only lightfm map@k = 0.00448 
only lightfm precision@k = 0.00423
only lightfm recall@k = 0.00582


In [45]:
test_actual_results['predicted'].isna().mean()

0.37363730405029044

## We have no train data and no predictions for 37% of test users, so we use the most-popular baseline for them

In [46]:
# Cold-start fallback: users without a model prediction get the most popular
# items. A valid prediction is always a list, so every other cell (the NaN
# left by the merge) is replaced. NaN never compares equal to anything,
# including itself, so the check uses isinstance rather than `== np.nan`.
test_actual_results['predicted'] = test_actual_results['predicted'].apply(
    lambda pred: pred if isinstance(pred, list) else most_popular_items
)


In [47]:
# Metrics over all test users: model predictions plus the cold-start fallback.
lfm_mapk = mapk(
    test_actual_results['true'],
    test_actual_results['predicted'],
    k=3,
)

lfm_precision_at_k = np.mean(
    test_actual_results.apply(
        lambda row: precision_at_k(row['true'], row['predicted'], k=3),
        axis=1,
    )
)

lfm_recall_at_k = np.mean(
    test_actual_results.apply(
        lambda row: recall_at_k(row['true'], row['predicted'], k=3),
        axis=1,
    )
)

In [48]:
# Report the metrics over all test users, to five decimals.
print(f'lightfm map@k = {lfm_mapk:.5f} ')
print(f'lightfm precision@k = {lfm_precision_at_k:.5f}')
print(f'lightfm recall@k = {lfm_recall_at_k:.5f}')

lightfm map@k = 0.00288 
lightfm precision@k = 0.00276
lightfm recall@k = 0.00382


# We significantly improve on the baseline:
* baseline map@k = 0.00019 
* baseline precision@k = 0.00024
* baseline recall@k = 0.00040