In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/risedata/dataset/user.csv
/kaggle/input/risedata/dataset/relationship.csv
/kaggle/input/risedata/dataset/sample_submission.json
/kaggle/input/risedata/dataset/test.csv
/kaggle/input/risedata/dataset/content.csv


In [2]:
!pip install category_encoders



In [3]:
import pandas_profiling
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn import preprocessing
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,recall_at_k
from lightfm.evaluation import auc_score
from scipy.sparse import csr_matrix
from lightfm.data import Dataset
import category_encoders as ce


In [4]:
# All competition CSVs sit in one read-only input folder; name it once so the
# location only has to be changed in a single place.
DATA_DIR = '../input/risedata/dataset'

content = pd.read_csv(f'{DATA_DIR}/content.csv')            # item table (content_id, rating, duration, ...)
relationship = pd.read_csv(f'{DATA_DIR}/relationship.csv')  # watch log (user_id, content_id, duration, ...)
user = pd.read_csv(f'{DATA_DIR}/user.csv')                  # user table (user_id, user_age, gender, location)
test = pd.read_csv(f'{DATA_DIR}/test.csv')                  # users to produce recommendations for (user_id)

In [5]:
def pre_process_data(content, relationship, user, random_state=None):
    """Build the feature matrices and the interaction table used for modelling.

    Parameters
    ----------
    content : pandas.DataFrame
        Item table. Needs ``content_id``, ``release_date``, ``rating``,
        ``duration`` and the categorical columns ``content_type``,
        ``language`` and ``genre``. Every remaining column must be numeric
        after encoding, because the frame is converted to a sparse matrix.
    relationship : pandas.DataFrame
        Watch log. Needs ``user_id``, ``content_id`` and ``duration``.
        The frame is NOT modified (no helper column is written back to it).
    user : pandas.DataFrame
        User table. Needs ``user_id``, ``user_age``, ``gender`` and
        ``location``.
    random_state : int or None, optional
        Seed forwarded to the two row shuffles. ``None`` (default) keeps the
        shuffles unseeded; pass an int to make the shuffled row order
        repeatable between runs.

    Returns
    -------
    tuple
        ``(content_df, content_features_csr, interaction_df, user_df,
        user_features_csr)`` where

        * ``content_df`` is the shuffled, encoded content table plus a
          ``content_duration_hrs`` column,
        * ``content_features_csr`` is ``content_df`` without ``content_id``
          and ``release_date`` as a CSR matrix,
        * ``interaction_df`` has the columns ``user_id``, ``content_id`` and
          ``watch_duration_hrs`` (mean per user/content pair),
        * ``user_df`` is the shuffled, encoded user table,
        * ``user_features_csr`` is ``user_df`` without ``user_id`` as a CSR
          matrix.
    """
    # Durations are divided by 1000*60*60 and then wrapped modulo 24.
    # NOTE(review): this presumes the raw durations are in milliseconds - confirm.
    ms_per_hour = 1000 * 60 * 60

    # ---- content features -------------------------------------------------
    # Rows are shuffled before encoding; ``rating`` is the encoding target.
    content = content.sample(frac=1, random_state=random_state)
    content_encoder = ce.CatBoostEncoder(cols=['content_type', 'language', 'genre'])
    content_df = content_encoder.fit_transform(content, content['rating'])
    content_df['content_duration_hrs'] = (content['duration'] / ms_per_hour) % 24
    # ``columns=`` instead of a positional axis: the positional form is
    # deprecated and rejected by pandas >= 2.0.
    content_features_csr = csr_matrix(
        content_df.drop(columns=['content_id', 'release_date']).values
    )

    # ---- user features ----------------------------------------------------
    # ``user_age`` is the encoding target for the categorical user columns.
    user_df = user[['user_id', 'user_age', 'gender', 'location']].sample(
        frac=1, random_state=random_state
    )
    user_encoder = ce.CatBoostEncoder(cols=['gender', 'location'])
    user_df = user_encoder.fit_transform(user_df, user_df['user_age'])
    user_features_csr = csr_matrix(user_df.drop(columns=['user_id']).values)

    # ---- interactions -----------------------------------------------------
    # Work on a derived frame so the caller's ``relationship`` stays untouched.
    watch_df = relationship[['user_id', 'content_id']].assign(
        watch_duration_hrs=(relationship['duration'] / ms_per_hour) % 24
    )
    relationship_df = (
        watch_df.groupby(['user_id', 'content_id'])['watch_duration_hrs']
        .mean()
        .reset_index()
    )

    # The outer merge keeps users that never appear in the watch log. Their
    # missing content_id / duration are filled with 0, so each such user
    # contributes one row with the placeholder content_id 0 and 0 hours.
    # NOTE(review): that placeholder becomes a real column when this table is
    # pivoted by content_id - confirm this is intended.
    interaction_df = relationship_df.merge(user_df[['user_id']], how='outer', on='user_id')
    interaction_df = interaction_df[['user_id', 'content_id', 'watch_duration_hrs']].fillna(0)

    return content_df, content_features_csr, interaction_df, user_df, user_features_csr

In [6]:
# Run the preprocessing once and unpack its five outputs.
(
    content_df,
    content_features_csr,
    interaction_df,
    user_df,
    user_features_csr,
) = pre_process_data(content, relationship, user)

  import sys
  del sys.path[0]


In [7]:
# Wide table: one row per user_id, one column per content_id, cells hold
# watch_duration_hrs; combinations that never occur are set to 0.
user_content_interaction = interaction_df.pivot_table(
    index='user_id',
    columns='content_id',
    values='watch_duration_hrs',
).fillna(0)
user_content_interaction.head(10)

content_id,0,cont_1000_1_10,cont_1000_1_12,cont_1000_1_16,cont_1000_1_25,cont_1000_2_1,cont_1000_2_10,cont_1000_2_13,cont_1000_2_18,cont_1000_3_1,...,cont_994_1_5,cont_996_1_5,cont_996_1_6,cont_996_1_7,cont_998_1_4,cont_999_1_6,cont_99_1_6,cont_99_1_7,cont_9_1_4,cont_9_1_8
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_10013@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10034@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10036@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10042@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10052@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10069@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10070@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10076@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10082@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10086@domain.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Sparse (CSR) version of the dense user x content table, same row/column order.
user_content_interaction_csr = csr_matrix(user_content_interaction.to_numpy())

In [9]:
# LightFM model with WARP loss and 100 latent components, seeded for repeatability.
model = LightFM(
    no_components=100,
    loss='warp',
    learning_schedule='adadelta',
    learning_rate=0.01,
    random_state=2016,
)

# Train on the interaction matrix only; no user/item feature matrices are passed.
model = model.fit(
    user_content_interaction_csr,
    epochs=150,
    num_threads=16,
    verbose=True,
)

Epoch: 100%|██████████| 150/150 [08:46<00:00,  3.51s/it]


In [10]:
# Map every user label in the pivot index to its 0-based row position,
# i.e. the row of that user in the matrix built from the same table.
user_id = list(user_content_interaction.index)
user_dict = {label: position for position, label in enumerate(user_id)}

In [11]:
# Spot-check: row position assigned to one example user.
user_dict['user_88625@domain.com']

12100

In [12]:
def sample_recommendation_user(model, interactions, user_id, user_dict, threshold=0, nrec_items=10, show=True):
    """Return the top-ranked items for one user, skipping items already consumed.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_index, item_indices)`` that
        returns one score per requested item index.
    interactions : pandas.DataFrame
        User x item table (index = user labels, columns = item labels). The
        column order must match the item indices the model was trained on.
    user_id : hashable
        Label of the user to recommend for.
    user_dict : dict
        Maps user labels to the integer row index expected by ``model``.
    threshold : number, optional
        An item counts as already known when the user's interaction value is
        strictly greater than this.
    nrec_items : int, optional
        Maximum number of item labels to return.
    show : bool, optional
        Accepted for backward compatibility; not used.

    Returns
    -------
    list
        Up to ``nrec_items`` item labels, best score first.
    """
    n_items = interactions.shape[1]

    # Best-effort fallback kept from the original: a user missing from
    # ``user_dict`` is scored with row index 0 instead of failing.
    user_x = user_dict.get(user_id, 0)

    scores = pd.Series(model.predict(user_x, np.arange(n_items)), index=interactions.columns)
    ranked_items = scores.sort_values(ascending=False).index

    # A user without a row in ``interactions`` has no history to filter out;
    # previously this case raised KeyError right after the fallback above.
    if user_id in interactions.index:
        history = interactions.loc[user_id]
        known_items = set(history[history > threshold].index)
    else:
        known_items = set()

    # Set membership keeps the filtering linear in the number of items.
    recommendations = [item for item in ranked_items if item not in known_items]
    return recommendations[:nrec_items]

In [13]:
# Example: recommendations for one user, using the default nrec_items (10).
sample_recommendation_user(model, user_content_interaction, 'user_88625@domain.com', user_dict)

['cont_2853_2_8',
 'cont_4499_5_10',
 'cont_2069_1_18',
 'cont_4434_4_12',
 'cont_475_15_7',
 'cont_303_1_1',
 'cont_2307_7_13',
 'cont_1287_23_19',
 'cont_1077_3_23',
 'cont_1972_3_17']

In [14]:
# One recommendation list per user listed in the test file, keyed by user_id.
results = {
    uid: sample_recommendation_user(model, user_content_interaction, uid, user_dict)
    for uid in test['user_id']
}

In [15]:
import json
# Serialise the recommendations as indented JSON with keys in sorted order,
# then write the text to the submission file.
submission_text = json.dumps(results, sort_keys=True, indent=4)
with open('submission_1.json', 'w') as fp:
    fp.write(submission_text)