In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("default")
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder

import pickle
from functions import *

# Import

In [8]:
%%time

path = "C:/riiid-test-answer-prediction/reduced_riiid_train.pkl.gzip"
col = ['timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'part', 'answered_correctly']
train = pd.read_pickle(path)[col]


train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21733593 entries, 0 to 21733592
Data columns (total 7 columns):
 #   Column              Dtype
---  ------              -----
 0   timestamp           int64
 1   user_id             int32
 2   content_id          int16
 3   content_type_id     int8 
 4   task_container_id   int16
 5   part                int64
 6   answered_correctly  int8 
dtypes: int16(2), int32(1), int64(2), int8(2)
memory usage: 704.7 MB
Wall time: 1.3 s


# Features engineering

## Niveau d'usage des cours

In [9]:
help_usage = train[train.answered_correctly == -1].groupby('user_id')['content_id'].count()
help_usage = help_usage.reset_index().rename(columns={'content_id': 'help_usage'})
help_usage.to_csv('help.csv', index=False)

In [10]:
train_tmp = pd.merge(train, help_usage, on='user_id', how='left').fillna(0)
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage
0,0,115,5692,0,1,5,1,0.0
1,436401051,246496,5692,0,127,5,1,2.0
2,406345961,408119,5692,0,44,5,1,0.0
3,1515513043,637773,5692,0,136,5,1,2.0
4,2420045564,999788,5692,0,80,5,1,0.0
...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0
21733589,487996565,2124260567,8051,0,115,7,1,0.0
21733590,487996565,2124260567,8052,0,115,7,1,0.0
21733591,487996565,2124260567,8053,0,115,7,0,0.0


In [11]:
train_tmp['binned_help_usage'] = pd.cut(train_tmp['help_usage'], bins=[-np.inf, 0, 1, 3, np.inf], labels=[0, 1, 2, 3])
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage
0,0,115,5692,0,1,5,1,0.0,0
1,436401051,246496,5692,0,127,5,1,2.0,2
2,406345961,408119,5692,0,44,5,1,0.0,0
3,1515513043,637773,5692,0,136,5,1,2.0,2
4,2420045564,999788,5692,0,80,5,1,0.0,0
...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0


In [12]:
# Je n'ai plus besoin des cours, je les enlève
train_tmp = train_tmp[train_tmp.content_type_id == 0]
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage
0,0,115,5692,0,1,5,1,0.0,0
1,436401051,246496,5692,0,127,5,1,2.0,2
2,406345961,408119,5692,0,44,5,1,0.0,0
3,1515513043,637773,5692,0,136,5,1,2.0,2
4,2420045564,999788,5692,0,80,5,1,0.0,0
...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0


## Niveau de difficulté des questions

In [13]:
# Listening (0) or reading (1)
train_tmp['L | R'] = pd.cut(train_tmp['part'], bins=[-np.inf, 4, np.inf], labels=[0, 1])
train_tmp.sample(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage,L | R
14608818,1988872378,1945769580,481,0,63,2,1,0.0,0,0
5078251,12532750965,283622187,8284,0,51,5,1,1.0,1,1
14411977,2670579,777536352,5632,0,42,5,1,1.0,1,1
6846515,411670,749701121,589,0,12,2,0,0.0,0,0
8835931,2176245642,2008265241,4981,0,313,5,1,6.0,3,1
18185992,1266295402,1582157312,5206,0,126,5,0,4.0,3,1
799460,253776,346631792,3365,0,5,4,1,0.0,0,0
6657368,10247143,958226808,5600,0,70,5,1,0.0,0,1
17261677,12611105,874829112,9501,0,76,5,1,1.0,1,1
1596887,1923124875,504534239,294,0,45,2,1,1.0,1,0


In [14]:
train_tmp['Difficulty_level'] = 0
train_tmp['Difficulty_level'][train_tmp['L | R'] == 0] = train_tmp['part'][train_tmp['L | R'] == 0]
train_tmp['Difficulty_level'][train_tmp['L | R'] == 1] = train_tmp['part'][train_tmp['L | R'] == 1] - 4

train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage,L | R,Difficulty_level
0,0,115,5692,0,1,5,1,0.0,0,1,1
1,436401051,246496,5692,0,127,5,1,2.0,2,1,1
2,406345961,408119,5692,0,44,5,1,0.0,0,1,1
3,1515513043,637773,5692,0,136,5,1,2.0,2,1,1
4,2420045564,999788,5692,0,80,5,1,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0,1,3
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0,1,3
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0,1,3
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0,1,3


In [15]:
col = ['timestamp', 'user_id', 'content_id',
       'task_container_id', 'answered_correctly',
       'binned_help_usage', 'L | R', 'Difficulty_level']
train_tmp = train_tmp[col]
train_tmp

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,binned_help_usage,L | R,Difficulty_level
0,0,115,5692,1,1,0,1,1
1,436401051,246496,5692,127,1,2,1,1
2,406345961,408119,5692,44,1,0,1,1
3,1515513043,637773,5692,136,1,2,1,1
4,2420045564,999788,5692,80,1,0,1,1
...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,115,1,0,1,3
21733589,487996565,2124260567,8051,115,1,0,1,3
21733590,487996565,2124260567,8052,115,1,0,1,3
21733591,487996565,2124260567,8053,115,0,0,1,3


In [16]:
train_tmp.dtypes

timestamp                int64
user_id                  int32
content_id               int16
task_container_id        int16
answered_correctly        int8
binned_help_usage     category
L | R                 category
Difficulty_level         int64
dtype: object

In [17]:
train_tmp.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level'],
      dtype='object')

In [18]:
train_tmp[['content_id', 'L | R', 'Difficulty_level']].groupby('content_id').max().to_csv('level.csv')

In [19]:
train_tmp = train_tmp.astype({'user_id': 'object',
                              'content_id': 'object',
                              'task_container_id': 'object'})
train_tmp.dtypes

timestamp                int64
user_id                 object
content_id              object
task_container_id       object
answered_correctly        int8
binned_help_usage     category
L | R                 category
Difficulty_level         int64
dtype: object

In [20]:
X_train, X_test = train_test_split(train_tmp, test_size=0.20, random_state=123)

In [21]:
encoder_user = TargetEncoder()
encoder_content_id = TargetEncoder()
encoder_task_container_id = TargetEncoder()

encoder_user.fit(X_train['user_id'], X_train['answered_correctly'])
encoder_content_id.fit(X_train['content_id'], X_train['answered_correctly'])
encoder_task_container_id.fit(X_train['task_container_id'], X_train['answered_correctly'])

TargetEncoder(cols=['task_container_id'])

In [22]:
X_train['user_id_enc'] = encoder_user.transform(X_train['user_id'])
X_test['user_id_enc'] = encoder_user.transform(X_test['user_id'])

X_train['content_id_enc'] = encoder_content_id.transform(X_train['content_id'])
X_test['content_id_enc'] = encoder_content_id.transform(X_test['content_id'])

X_train['task_container_id_enc'] = encoder_task_container_id.transform(X_train['task_container_id'])
X_test['task_container_id_enc'] = encoder_task_container_id.transform(X_test['task_container_id'])


In [23]:
# A réserver. Je ne sais pas comment transfere cela en soumission
# Car je n'ai pas la cible dans le fichier de test

# X_train['user_id_enc'], X_test['user_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='user_id', alpha=5)
# X_train['content_id_enc'], X_test['content_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='content_id', alpha=5)
# X_train['task_container_id_enc'], X_test['task_container_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='task_container_id', alpha=5)

---

## Training

In [24]:
X_train.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc'],
      dtype='object')

In [25]:
feat = ['timestamp',  'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc']
target = 'answered_correctly'

y_train = X_train[target]
y_test = X_test[target]

X_train = X_train[feat]
X_test = X_test[feat]

In [26]:
X_train.dtypes

timestamp                   int64
binned_help_usage        category
L | R                    category
Difficulty_level            int64
user_id_enc               float64
content_id_enc            float64
task_container_id_enc     float64
dtype: object

---

In [21]:
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : 'roc', 
            "eval_set" : [(X_test, y_test)],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto', # that's actually the default
            'categorical_feature': 'auto'
           }

In [22]:
import lightgbm as lgb
#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 1000 define only the absolute maximum
clf = lgb.LGBMClassifier(num_leaves= 15, max_depth=-1, 
                         random_state=314, 
                         silent=True, 
                         metric='None', 
                         n_jobs=4, 
                         n_estimators=1000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.1)

In [23]:
#force larger number of max trees and smaller learning rate
clf.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.9, metric='None', n_estimators=1000, n_jobs=4,
               num_leaves=15, random_state=314, subsample=0.9)

In [24]:
y_pred = clf.predict(X_test)

In [30]:
round(roc_auc_score(y_test, y_pred), 3)

0.656

In [28]:
pickle.dump(clf, open('model.sav', 'wb'))
pickle.dump(encoder_user, open('encoder_user.sav', 'wb'))
pickle.dump(encoder_content_id, open('encoder_content_id.sav', 'wb'))
pickle.dump(encoder_task_container_id, open('encoder_task_container_id.sav', 'wb'))