In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("default")
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder

import pickle
from functions import *

# Import

In [8]:
%%time

path = "C:/riiid-test-answer-prediction/reduced_riiid_train.pkl.gzip"
col = ['timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'part', 'answered_correctly']
train = pd.read_pickle(path)


train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21733593 entries, 0 to 21733592
Data columns (total 7 columns):
 #   Column              Dtype
---  ------              -----
 0   timestamp           int64
 1   user_id             int32
 2   content_id          int16
 3   content_type_id     int8 
 4   task_container_id   int16
 5   part                int64
 6   answered_correctly  int8 
dtypes: int16(2), int32(1), int64(2), int8(2)
memory usage: 704.7 MB
Wall time: 1.3 s


# Features engineering

In [21]:
encoder_user = TargetEncoder()
encoder_content_id = TargetEncoder()
encoder_task_container_id = TargetEncoder()

encoder_user.fit(X_train['user_id'], X_train['answered_correctly'])
encoder_content_id.fit(X_train['content_id'], X_train['answered_correctly'])
encoder_task_container_id.fit(X_train['task_container_id'], X_train['answered_correctly'])

TargetEncoder(cols=['task_container_id'])

In [22]:
X_train['user_id_enc'] = encoder_user.transform(X_train['user_id'])
X_test['user_id_enc'] = encoder_user.transform(X_test['user_id'])

X_train['content_id_enc'] = encoder_content_id.transform(X_train['content_id'])
X_test['content_id_enc'] = encoder_content_id.transform(X_test['content_id'])

X_train['task_container_id_enc'] = encoder_task_container_id.transform(X_train['task_container_id'])
X_test['task_container_id_enc'] = encoder_task_container_id.transform(X_test['task_container_id'])


In [23]:
# A réserver. Je ne sais pas comment transfere cela en soumission
# Car je n'ai pas la cible dans le fichier de test

# X_train['user_id_enc'], X_test['user_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='user_id', alpha=5)
# X_train['content_id_enc'], X_test['content_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='content_id', alpha=5)
# X_train['task_container_id_enc'], X_test['task_container_id_enc'] = mean_target_encoding(X_train, X_test, target='answered_correctly', categorical='task_container_id', alpha=5)

---

## Training

In [24]:
X_train.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc'],
      dtype='object')

In [25]:
feat = ['timestamp',  'binned_help_usage', 'L | R', 'Difficulty_level',
       'user_id_enc', 'content_id_enc', 'task_container_id_enc']
target = 'answered_correctly'

y_train = X_train[target]
y_test = X_test[target]

X_train = X_train[feat]
X_test = X_test[feat]

In [26]:
X_train.dtypes

timestamp                   int64
binned_help_usage        category
L | R                    category
Difficulty_level            int64
user_id_enc               float64
content_id_enc            float64
task_container_id_enc     float64
dtype: object

---

In [21]:
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : 'roc', 
            "eval_set" : [(X_test, y_test)],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto', # that's actually the default
            'categorical_feature': 'auto'
           }

In [22]:
import lightgbm as lgb
#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 1000 define only the absolute maximum
clf = lgb.LGBMClassifier(num_leaves= 15, max_depth=-1, 
                         random_state=314, 
                         silent=True, 
                         metric='None', 
                         n_jobs=4, 
                         n_estimators=1000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.1)

In [23]:
#force larger number of max trees and smaller learning rate
clf.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.9, metric='None', n_estimators=1000, n_jobs=4,
               num_leaves=15, random_state=314, subsample=0.9)

In [24]:
y_pred = clf.predict(X_test)

In [30]:
round(roc_auc_score(y_test, y_pred), 3)

0.656

In [28]:
pickle.dump(clf, open('model.sav', 'wb'))
pickle.dump(encoder_user, open('encoder_user.sav', 'wb'))
pickle.dump(encoder_content_id, open('encoder_content_id.sav', 'wb'))
pickle.dump(encoder_task_container_id, open('encoder_task_container_id.sav', 'wb'))