In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("default")
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler

import pickle
from functions import *

# Import

In [10]:
%%time

path = "C:/riiid-test-answer-prediction/reduced_riiid_train.pkl.gzip"
col = ['timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'part', 'answered_correctly']
train = pd.read_pickle(path)[col]


train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21733593 entries, 0 to 21733592
Data columns (total 7 columns):
 #   Column              Dtype
---  ------              -----
 0   timestamp           int64
 1   user_id             int32
 2   content_id          int16
 3   content_type_id     int8 
 4   task_container_id   int16
 5   part                int64
 6   answered_correctly  int8 
dtypes: int16(2), int32(1), int64(2), int8(2)
memory usage: 704.7 MB
Wall time: 7.57 s


# Features engineering

## Niveau d'usage des cours

In [11]:
help_usage = train[train.answered_correctly == -1].groupby('user_id')['content_id'].count()
help_usage = help_usage.reset_index().rename(columns={'content_id': 'help_usage'})
help_usage.to_csv('help.csv', index=False)

In [12]:
train_tmp = pd.merge(train, help_usage, on='user_id', how='left').fillna(0)
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage
0,0,115,5692,0,1,5,1,0.0
1,436401051,246496,5692,0,127,5,1,2.0
2,406345961,408119,5692,0,44,5,1,0.0
3,1515513043,637773,5692,0,136,5,1,2.0
4,2420045564,999788,5692,0,80,5,1,0.0
...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0
21733589,487996565,2124260567,8051,0,115,7,1,0.0
21733590,487996565,2124260567,8052,0,115,7,1,0.0
21733591,487996565,2124260567,8053,0,115,7,0,0.0


In [13]:
train_tmp['binned_help_usage'] = pd.cut(train_tmp['help_usage'], bins=[-np.inf, 0, 1, 3, np.inf], labels=[0, 1, 2, 3])
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage
0,0,115,5692,0,1,5,1,0.0,0
1,436401051,246496,5692,0,127,5,1,2.0,2
2,406345961,408119,5692,0,44,5,1,0.0,0
3,1515513043,637773,5692,0,136,5,1,2.0,2
4,2420045564,999788,5692,0,80,5,1,0.0,0
...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0


In [14]:
# Je n'ai plus besoin des cours, je les enlève
train_tmp = train_tmp[train_tmp.content_type_id == 0]
train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage
0,0,115,5692,0,1,5,1,0.0,0
1,436401051,246496,5692,0,127,5,1,2.0,2
2,406345961,408119,5692,0,44,5,1,0.0,0
3,1515513043,637773,5692,0,136,5,1,2.0,2
4,2420045564,999788,5692,0,80,5,1,0.0,0
...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0


## Niveau de difficulté des questions

In [15]:
# Listening (0) or reading (1)
train_tmp['L | R'] = pd.cut(train_tmp['part'], bins=[-np.inf, 4, np.inf], labels=[0, 1])
train_tmp.sample(10)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage,L | R
13533553,12550432518,109403587,2203,0,355,3,1,4.0,3,0
6958859,77321584,1422784609,795,0,224,2,1,8.0,3,0
9851374,2247526991,1472620026,4982,0,73,5,1,0.0,0,1
1312142,3030060985,1852081680,10684,0,264,2,1,3.0,2,0
12882003,1899593670,429903868,5185,0,71,5,0,1.0,1,1
8077709,15653555627,2122682884,7737,0,144,7,1,4.0,3,1
857037,7200816047,575804382,3364,0,63,4,0,0.0,0,0
1394072,515486974,1908290179,10685,0,41,2,1,0.0,0,0
1157998,1731639154,410556996,3878,0,27,5,0,0.0,0,1
11330421,13806169,1071609605,4710,0,31,5,1,7.0,3,1


In [16]:
train_tmp['Difficulty_level'] = 0
train_tmp['Difficulty_level'][train_tmp['L | R'] == 0] = train_tmp['part'][train_tmp['L | R'] == 0]
train_tmp['Difficulty_level'][train_tmp['L | R'] == 1] = train_tmp['part'][train_tmp['L | R'] == 1] - 4

train_tmp

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,part,answered_correctly,help_usage,binned_help_usage,L | R,Difficulty_level
0,0,115,5692,0,1,5,1,0.0,0,1,1
1,436401051,246496,5692,0,127,5,1,2.0,2,1,1
2,406345961,408119,5692,0,44,5,1,0.0,0,1,1
3,1515513043,637773,5692,0,136,5,1,2.0,2,1,1
4,2420045564,999788,5692,0,80,5,1,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,0,115,7,1,0.0,0,1,3
21733589,487996565,2124260567,8051,0,115,7,1,0.0,0,1,3
21733590,487996565,2124260567,8052,0,115,7,1,0.0,0,1,3
21733591,487996565,2124260567,8053,0,115,7,0,0.0,0,1,3


In [17]:
col = ['timestamp', 'user_id', 'content_id',
       'task_container_id', 'answered_correctly',
       'binned_help_usage', 'L | R', 'Difficulty_level']
train_tmp = train_tmp[col]
train_tmp

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,binned_help_usage,L | R,Difficulty_level
0,0,115,5692,1,1,0,1,1
1,436401051,246496,5692,127,1,2,1,1
2,406345961,408119,5692,44,1,0,1,1
3,1515513043,637773,5692,136,1,2,1,1
4,2420045564,999788,5692,80,1,0,1,1
...,...,...,...,...,...,...,...,...
21733588,487996565,2124260567,8050,115,1,0,1,3
21733589,487996565,2124260567,8051,115,1,0,1,3
21733590,487996565,2124260567,8052,115,1,0,1,3
21733591,487996565,2124260567,8053,115,0,0,1,3


In [18]:
train_tmp.dtypes

timestamp                int64
user_id                  int32
content_id               int16
task_container_id        int16
answered_correctly        int8
binned_help_usage     category
L | R                 category
Difficulty_level         int64
dtype: object

In [19]:
train_tmp.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level'],
      dtype='object')

In [20]:
train_tmp[['content_id', 'L | R', 'Difficulty_level']].groupby('content_id').max().to_csv('level.csv')

In [21]:
train_tmp = train_tmp.astype({'user_id': 'object',
                              'content_id': 'object',
                              'task_container_id': 'object'})
train_tmp.dtypes

timestamp                int64
user_id                 object
content_id              object
task_container_id       object
answered_correctly        int8
binned_help_usage     category
L | R                 category
Difficulty_level         int64
dtype: object

In [22]:
X_train, X_test = train_test_split(train_tmp, test_size=0.20, random_state=123)

In [23]:
encoder_user = TargetEncoder()
encoder_content_id = TargetEncoder()
encoder_task_container_id = TargetEncoder()

encoder_user.fit(X_train['user_id'], X_train['answered_correctly'])
encoder_content_id.fit(X_train['content_id'], X_train['answered_correctly'])
encoder_task_container_id.fit(X_train['task_container_id'], X_train['answered_correctly'])

TargetEncoder(cols=['task_container_id'])

In [24]:
X_train['user_id_enc'] = encoder_user.transform(X_train['user_id'])
X_test['user_id_enc'] = encoder_user.transform(X_test['user_id'])

X_train['content_id_enc'] = encoder_content_id.transform(X_train['content_id'])
X_test['content_id_enc'] = encoder_content_id.transform(X_test['content_id'])

X_train['task_container_id_enc'] = encoder_task_container_id.transform(X_train['task_container_id'])
X_test['task_container_id_enc'] = encoder_task_container_id.transform(X_test['task_container_id'])


In [25]:
# A réserver. Je ne sais pas comment transfere cela en soumission
# Car je n'ai pas la cible dans le fichier de test

# X_train['user_id_enc'], X_test['user_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='user_id', alpha=5)
# X_train['content_id_enc'], X_test['content_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='content_id', alpha=5)
# X_train['task_container_id_enc'], X_test['task_container_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='task_container_id', alpha=5)

---

## Training

In [26]:
X_train.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc'],
      dtype='object')

In [27]:
feat = ['timestamp',  'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc']
target = 'answered_correctly'

y_train = X_train[target]
y_test = X_test[target]

X_train = X_train[feat]
X_test = X_test[feat]

In [28]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

---

In [29]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [30]:
xg_cl_param_grid = {'learning_rate': np.arange(0.1,1,.2),
                    'n_estimators': [100, 200, 300]}

In [31]:
xg_cl = XGBClassifier(objective='binary:logistic', seed=123)

In [32]:
randomized = RandomizedSearchCV(estimator=xg_cl,
                                param_distributions=xg_cl_param_grid,
                                n_iter=25,
                                scoring='roc_auc',
                                cv=4,
                                verbose=1,
                                n_jobs=2)

In [33]:
randomized.fit(X_train, y_train)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  9.2min
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed: 29.6min finished


RandomizedSearchCV(cv=4,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                          

In [34]:
display(randomized.best_params_)
display(randomized.best_score_)

{'n_estimators': 100, 'learning_rate': 0.1}

nan

In [35]:
y_pred = randomized.predict(X_test)

In [36]:
round(roc_auc_score(y_test, y_pred), 3)

0.656

In [37]:
pickle.dump(clf, open('model.sav', 'wb'))
pickle.dump(encoder_user, open('encoder_user.sav', 'wb'))
pickle.dump(encoder_content_id, open('encoder_content_id.sav', 'wb'))
pickle.dump(encoder_task_container_id, open('encoder_task_container_id.sav', 'wb'))

NameError: name 'clf' is not defined