In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("default")
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler

import pickle
from functions import *

# Import

In [2]:
%%time

path = "C:/riiid-test-answer-prediction/reduced_riiid_train.pkl.gzip"
col = ['timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'part', 'answered_correctly']
train = pd.read_pickle(path)[col]


train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21733593 entries, 0 to 21733592
Data columns (total 7 columns):
 #   Column              Dtype
---  ------              -----
 0   timestamp           int64
 1   user_id             int32
 2   content_id          int16
 3   content_type_id     int8 
 4   task_container_id   int16
 5   part                int64
 6   answered_correctly  int8 
dtypes: int16(2), int32(1), int64(2), int8(2)
memory usage: 704.7 MB
Wall time: 2.16 s


# Features engineering

## Niveau d'usage des cours

In [3]:
help_usage = train[train.answered_correctly == -1].groupby('user_id')['content_id'].count()
help_usage = help_usage.reset_index().rename(columns={'content_id': 'help_usage'})
help_usage.to_csv('help.csv', index=False)

In [4]:
train_tmp = pd.merge(train, help_usage, on='user_id', how='left').fillna(0)
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage
0,0,115,5692,0,1,5,1,0.0
1,436401051,246496,5692,0,127,5,1,2.0
2,406345961,408119,5692,0,44,5,1,0.0
3,1515513043,637773,5692,0,136,5,1,2.0
4,2420045564,999788,5692,0,80,5,1,0.0
...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0
21733589,487996565,2124260567,8051,0,115,7,1,0.0
21733590,487996565,2124260567,8052,0,115,7,1,0.0
21733591,487996565,2124260567,8053,0,115,7,0,0.0


In [5]:
train_tmp['binned_help_usage'] = pd.cut(train_tmp['help_usage'], bins=[-np.inf, 0, 1, 3, np.inf], labels=[0, 1, 2, 3])
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage
0,0,115,5692,0,1,5,1,0.0,0
1,436401051,246496,5692,0,127,5,1,2.0,2
2,406345961,408119,5692,0,44,5,1,0.0,0
3,1515513043,637773,5692,0,136,5,1,2.0,2
4,2420045564,999788,5692,0,80,5,1,0.0,0
...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0


In [6]:
# Je n'ai plus besoin des cours, je les enlève
train_tmp = train_tmp[train_tmp.content_type_id == 0]
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage
0,0,115,5692,0,1,5,1,0.0,0
1,436401051,246496,5692,0,127,5,1,2.0,2
2,406345961,408119,5692,0,44,5,1,0.0,0
3,1515513043,637773,5692,0,136,5,1,2.0,2
4,2420045564,999788,5692,0,80,5,1,0.0,0
...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0


## Niveau de difficulté des questions

In [7]:
# Listening (0) or reading (1)
train_tmp['L | R'] = pd.cut(train_tmp['part'], bins=[-np.inf, 4, np.inf], labels=[0, 1])
train_tmp.sample(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage,L | R
16402560,321239022,1735032996,10711,0,101,4,1,3.0,2,0
12805854,1500804810,1693868903,242,0,343,2,1,5.0,3,0
6178913,692965,579542929,3854,0,13,5,1,2.0,2,1
16103787,218785250,215093192,11504,0,63,5,1,0.0,0,1
17049552,584390,104151926,8753,0,14,5,1,0.0,0,1
8430819,96608,1867188193,4535,0,3,5,0,0.0,0,1
15816234,114006539,1586794912,522,0,6,2,1,0.0,0,0
5617186,652031123,1926012083,4476,0,169,5,0,5.0,3,1
19081318,154586370,665835668,551,0,104,2,1,4.0,3,0
8385190,613552655,1803879512,915,0,266,2,1,2.0,2,0


In [8]:
train_tmp['Difficulty_level'] = 0
train_tmp['Difficulty_level'][train_tmp['L | R'] == 0] = train_tmp['part'][train_tmp['L | R'] == 0]
train_tmp['Difficulty_level'][train_tmp['L | R'] == 1] = train_tmp['part'][train_tmp['L | R'] == 1] - 4

train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage,L | R,Difficulty_level
0,0,115,5692,0,1,5,1,0.0,0,1,1
1,436401051,246496,5692,0,127,5,1,2.0,2,1,1
2,406345961,408119,5692,0,44,5,1,0.0,0,1,1
3,1515513043,637773,5692,0,136,5,1,2.0,2,1,1
4,2420045564,999788,5692,0,80,5,1,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0,1,3
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0,1,3
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0,1,3
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0,1,3


In [9]:
col = ['timestamp', 'user_id', 'content_id',
       'task_container_id', 'answered_correctly',
       'binned_help_usage', 'L | R', 'Difficulty_level']
train_tmp = train_tmp[col]
train_tmp

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,binned_help_usage,L | R,Difficulty_level
0,0,115,5692,1,1,0,1,1
1,436401051,246496,5692,127,1,2,1,1
2,406345961,408119,5692,44,1,0,1,1
3,1515513043,637773,5692,136,1,2,1,1
4,2420045564,999788,5692,80,1,0,1,1
...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,115,1,0,1,3
21733589,487996565,2124260567,8051,115,1,0,1,3
21733590,487996565,2124260567,8052,115,1,0,1,3
21733591,487996565,2124260567,8053,115,0,0,1,3


In [10]:
train_tmp.dtypes

timestamp                int64
user_id                  int32
content_id               int16
task_container_id        int16
answered_correctly        int8
binned_help_usage     category
L | R                 category
Difficulty_level         int64
dtype: object

In [11]:
train_tmp.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level'],
      dtype='object')

In [12]:
train_tmp[['content_id', 'L | R', 'Difficulty_level']].groupby('content_id').max().to_csv('level.csv')

In [13]:
train_tmp = train_tmp.astype({'user_id': 'object',
                              'content_id': 'object',
                              'task_container_id': 'object'})
train_tmp.dtypes

timestamp                int64
user_id                 object
content_id              object
task_container_id       object
answered_correctly        int8
binned_help_usage     category
L | R                 category
Difficulty_level         int64
dtype: object

In [14]:
X_train, X_test = train_test_split(train_tmp, test_size=0.20, random_state=123)

In [15]:
encoder_user = TargetEncoder()
encoder_content_id = TargetEncoder()
encoder_task_container_id = TargetEncoder()

encoder_user.fit(X_train['user_id'], X_train['answered_correctly'])
encoder_content_id.fit(X_train['content_id'], X_train['answered_correctly'])
encoder_task_container_id.fit(X_train['task_container_id'], X_train['answered_correctly'])

TargetEncoder(cols=['task_container_id'])

In [16]:
X_train['user_id_enc'] = encoder_user.transform(X_train['user_id'])
X_test['user_id_enc'] = encoder_user.transform(X_test['user_id'])

X_train['content_id_enc'] = encoder_content_id.transform(X_train['content_id'])
X_test['content_id_enc'] = encoder_content_id.transform(X_test['content_id'])

X_train['task_container_id_enc'] = encoder_task_container_id.transform(X_train['task_container_id'])
X_test['task_container_id_enc'] = encoder_task_container_id.transform(X_test['task_container_id'])


In [17]:
# A réserver. Je ne sais pas comment transfere cela en soumission
# Car je n'ai pas la cible dans le fichier de test

# X_train['user_id_enc'], X_test['user_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='user_id', alpha=5)
# X_train['content_id_enc'], X_test['content_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='content_id', alpha=5)
# X_train['task_container_id_enc'], X_test['task_container_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='task_container_id', alpha=5)

---

## Training

In [18]:
X_train.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc'],
      dtype='object')

In [19]:
feat = ['timestamp',  'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc']
target = 'answered_correctly'

y_train = X_train[target]
y_test = X_test[target]

X_train = X_train[feat]
X_test = X_test[feat]

In [20]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

---

In [71]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [73]:
xg_cl_param_grid = {'learning_rate': np.arange(0.05,1.05,.05),
                'n_estimators': [200],
                'subsample': np.arange(0.05,1.05,.05)}

In [74]:
xg_cl = XGBClassifier(objective='binary:logistic', seed=123)

In [77]:
randomized = RandomizedSearchCV(estimator=xg_cl,
                                param_distributions=xg_cl_param_grid,
                                n_iter=25,
                                scoring='roc_auc',
                                cv=4,
                                verbose=1,
                                n_jobs=4)

In [78]:
randomized.fit(X_train, y_train)

Fitting 4 folds for each of 25 candidates, totalling 100 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
exception calling callback for <Future at 0x15d4941cd60 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Miniconda\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Miniconda\lib\site-packages\joblib\parallel.py", line 347, in __call__
    self.parallel.dispatch_next()
  File "C:\Miniconda\lib\site-packages\joblib\parallel.py", line 780, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Miniconda\lib\site-packages\joblib\parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Miniconda\lib\site-packages\joblib\parallel.py", line 765, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Miniconda\lib\site-packages\joblib\_parallel_backends.py", line 531, in a

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
display(randomized_mse.best_params_)
display(randomized_mse.best_score_)

In [None]:
y_pred = randomized.predict(X_test)

In [None]:
round(roc_auc_score(y_test, y_pred), 3)

In [None]:
pickle.dump(clf, open('model.sav', 'wb'))
pickle.dump(encoder_user, open('encoder_user.sav', 'wb'))
pickle.dump(encoder_content_id, open('encoder_content_id.sav', 'wb'))
pickle.dump(encoder_task_container_id, open('encoder_task_container_id.sav', 'wb'))