In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plots
import seaborn as sns # plots
import gc
import riiideducation
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/answer-correctness-rapids-xgb-lgbm/user_answers.parquet
/kaggle/input/answer-correctness-rapids-xgb-lgbm/lectures.parquet
/kaggle/input/answer-correctness-rapids-xgb-lgbm/new_train.parquet
/kaggle/input/answer-correctness-rapids-xgb-lgbm/__results__.html
/kaggle/input/answer-correctness-rapids-xgb-lgbm/content_answers.parquet
/kaggle/input/answer-correctness-rapids-xgb-lgbm/submission.csv
/kaggle/input/answer-correctness-rapids-xgb-lgbm/baseline_model.pickle.dat
/kaggle/input/answer-correctness-rapids-xgb-lgbm/__notebook__.ipynb
/kaggle/input/answer-correctness-rapids-xgb-lgbm/__output__.json
/kaggle/input/answer-correctness-rapids-xgb-lgbm/questions.parquet
/kaggle/input/answer-correctness-rapids-xgb-lgbm/custom.css
/kaggle/input/answer-correctness-rapids-xgb-lgbm/dask-worker-space/dask-worker-space/worker-ad2ucw3q.dirlock
/kaggle/input/answer-correctness-rapids-xgb-lgbm/dask-worker-space/dask-worker-space/global.lock
/kaggle/input/answer-correctness-rapids-xgb-lgbm/dask

# Riiid answer prediction - XGBoost

## Steps
1. Load
2. Process
3. Model
4. Evaluate
5. Predict and Submit

## 1. Load

### Issue    : Data volume  
### Solution : RAPIDS library & Kaggle GPU (39H/week)

In [2]:
# Rapids Imports
import cudf
import cupy # CuPy is an open-source array library accelerated with NVIDIA CUDA.

### Data : *train.csv*

In [3]:
%%time

# Column types used when parsing train.csv (narrow integer/float types are
# requested explicitly instead of letting the reader infer them)
dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int8",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="int8",
)

# Load the training table with cuDF
train = cudf.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    dtype=dtypes,
)

CPU times: user 2.91 s, sys: 2.89 s, total: 5.8 s
Wall time: 1min 11s


In [4]:
# Missing values in the two "prior question" columns are encoded as -1
for column in ("prior_question_elapsed_time", "prior_question_had_explanation"):
    train[column] = train[column].fillna(-1)

___

# 2. Process

In [5]:
def aggregations(frame, target):
    """
    Add per-group statistics of ``target`` to ``frame``.

    For every column listed in ``cols`` (currently only ``content_id``) the
    variance and mean of ``target`` are computed per group, stored in columns
    named ``<first letter of the group column><aggregation>`` (``cvar`` and
    ``cmean`` for ``content_id``) and merged back onto ``frame``.

    Parameters
    ----------
    frame : DataFrame
        Input rows; must contain the grouping column(s) and ``target``.
    target : str
        Name of the column to aggregate.

    Returns
    -------
    tuple
        ``(frame_with_stats, stats)`` where ``stats`` is the aggregated table
        of the first grouping column (one row per group, the group key stored
        as a regular column), which can be merged onto new data later.
    """
    cols = ['content_id']  # Columns to aggregate

    aggs = ['var', 'mean']  # List of aggregation functions

    aggs_dfs = []  # One aggregated DataFrame per grouping column

    for col in cols:  # Loop over the columns to aggregate

        stats = frame.groupby(col).agg({target: aggs})

        # Flatten the (target, aggregation) column index into e.g. 'cvar', 'cmean'
        stats.columns = [col[0] + name for name in stats.columns.droplevel()]

        # Expose the group key as a plain column only. Keeping it as the index
        # name *and* as a column makes the merge key ambiguous.
        stats = stats.reset_index()

        frame = frame.merge(stats, on=col)  # Merge based on the same column

        aggs_dfs.append(stats)

    return frame, aggs_dfs[0]

def preprocess_frame(frame, features, target):
    """
    Run the preprocessing / feature-engineering step on a DataFrame.

    The frame is first restricted to the ``features`` columns, then handed to
    ``aggregations`` together with ``target``; the result of that call is
    returned unchanged.
    """
    selected = frame[features]  # Keep only the requested columns

    return aggregations(selected, target)  # Append the aggregated statistics

# 3. Model

In [6]:
# Switch cuDF's memory allocator to "managed" mode.
# NOTE(review): presumably this enables CUDA managed (unified) memory so that
# allocations can exceed the physical GPU memory - confirm against the cuDF
# version in use, and whether it should run before the data is loaded.
cudf.set_allocator("managed")

In [7]:
# %%time

# Keep only rows that are neither flagged content_type_id == 1 nor labelled -1
is_not_type_one = train['content_type_id'] != 1
has_label = train['answered_correctly'] != -1
train = train[is_not_type_one & has_label].reset_index(drop=True)

In [8]:
%%time

# RAPIDS roc_auc_score is 16x faster than sklearn. - cdeotte
from cuml.metrics import roc_auc_score
from cuml.preprocessing.model_selection import train_test_split
import xgboost
import pickle

CPU times: user 555 ms, sys: 53.9 ms, total: 609 ms
Wall time: 1.33 s


In [9]:
def train_xgb_model(X_train, X_test, y_train, y_test, params, prints=True):
    '''Fit an XGBoost booster and score it on the hold-out set.

    The booster is trained on (X_train, y_train) with the given ``params``
    (no number of boosting rounds is passed, so the library default applies),
    then used to predict X_test. The ROC AUC of those predictions against
    y_test (cast to int32) is computed, optionally printed, and returned
    together with the booster as ``(model, roc)``.
    '''
    # DMatrix is XGBoost's own container for features + labels
    dtrain = xgboost.DMatrix(data=X_train, label=y_train)

    # Fit the booster
    booster = xgboost.train(params, dtrain=dtrain)

    # Score the hold-out rows
    scores = booster.predict(xgboost.DMatrix(X_test))
    auc = roc_auc_score(y_test.astype('int32'), scores)

    if prints:
        print("ROC: {:.5}".format(auc))

    return booster, auc


def param_tuning_graph(param_values, roc_values, palette=None):
    '''Bar chart of the ROC results obtained for each value of a tuned parameter.

    Parameters
    ----------
    param_values : sequence
        Parameter values that were tried (x axis).
    roc_values : sequence
        ROC score obtained for each parameter value (bar height).
    palette : optional
        Seaborn palette for the bars. When omitted, a module-level
        ``custom_colors`` is used if one has been defined; otherwise seaborn's
        default colours are used.

    Each bar is annotated with its height formatted as a percentage.
    '''
    if palette is None:
        # ``custom_colors`` is not defined in this notebook; look it up lazily
        # so the function no longer fails with a NameError when it is missing.
        palette = globals().get('custom_colors')

    plt.figure(figsize=(18, 3))
    ax = sns.barplot(x=param_values, y=roc_values, palette=palette)

    for bar in ax.patches:
        width = bar.get_width()
        height = bar.get_height()
        x, y = bar.get_xy()
        # Centre the label horizontally, slightly above the top of the bar
        ax.annotate(f'{height:.5%}', (x + width/2, y + height*1.02), ha='center')

In [10]:
%%time

# Name of the label column
target = 'answered_correctly'

# Run the preprocessing on every column of the training table; keep the
# aggregated statistics (c_aggs) so they can be attached to the test data later
all_columns = list(train.columns)
train_proc, c_aggs = preprocess_frame(train, all_columns, target)

# Candidate feature names = every column of the processed table
features = list(train_proc.columns)

CPU times: user 1.88 s, sys: 338 ms, total: 2.22 s
Wall time: 3.52 s


In [11]:
# Exclude the label itself and the student's raw answer from the model inputs.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# execution no longer raises ValueError because the names are already gone.
non_feature_columns = {'answered_correctly', 'user_answer'}
features = [name for name in features if name not in non_feature_columns]

In [12]:
%%time

# Model inputs and label
X = train_proc[features]
y = train_proc[target]

# Hold out 35% of the rows for evaluation, without shuffling.
# NOTE(review): `stratify` is passed together with `shuffle=False`; confirm how
# cuml's train_test_split treats this combination.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    shuffle=False,
    stratify=y,
)

CPU times: user 7.29 ms, sys: 548 µs, total: 7.83 ms
Wall time: 9.01 ms


In [13]:
# XGBoost training parameters for the baseline model
params1 = dict(
    max_depth=4,
    max_leaves=2**4,
    tree_method='gpu_hist',
    objective='reg:logistic',
    grow_policy='lossguide',
)

In [14]:
# Name under which the trained booster is written to the working directory
version = "xgb_v6"

# Train on the split defined above, print the hold-out ROC, then persist the model
model, roc = train_xgb_model(X_train, X_test, y_train, y_test, params1, prints=True)
model.save_model(version)

ROC: 0.72266


In [15]:
# To reuse a previously saved booster instead of retraining, uncomment:
# model = xgboost.Booster(model_file=version)

In [16]:
def predict_from(model, Xs, threshold=0.6):
    """
    Turn the model's scores for ``Xs`` into hard 0/1 predictions.

    ``Xs`` is wrapped in an ``xgboost.DMatrix`` (the input format the booster
    expects), scored with ``model.predict`` and every score strictly greater
    than ``threshold`` is mapped to 1, all others to 0.
    """
    scores = model.predict(xgboost.DMatrix(Xs))  # Raw model outputs

    above_threshold = scores > threshold  # Boolean mask

    return above_threshold.astype(int)  # 1 for True, 0 for False

In [17]:
def link_to_aggs(Xs, aggs, col):
    """
    Attach pre-computed aggregates to a pandas DataFrame.

    Parameters
    ----------
    Xs : pandas.DataFrame
        Rows to enrich; must contain the key column ``col``.
    aggs : cudf.DataFrame
        Aggregated statistics holding ``col`` plus the statistic columns.
    col : str
        Name of the key column used for the left merge.

    Returns
    -------
    pandas.DataFrame
        ``Xs`` with the columns of ``aggs`` appended, in the same row order as
        ``Xs`` and with a fresh 0..n-1 index. Keys missing from ``aggs`` get
        null statistics (left merge).
    """
    order_col = '__row_order__'  # Temporary helper column, removed before returning

    # Remember the original position of every row (assign works on a copy, the
    # caller's frame is left untouched)
    tagged = Xs.assign(**{order_col: np.arange(len(Xs))})

    merged = cudf.from_pandas(tagged).merge(aggs, how='left', on=col)

    # A cuDF merge does not guarantee the order of its output rows, while the
    # caller matches predictions to its own rows by position: restore the order.
    merged = (
        merged.sort_values(order_col)
        .drop(columns=[order_col])
        .reset_index(drop=True)
    )

    return merged.to_pandas()

# 4. Evaluate

# 5. Predict and Submit

In [18]:
# Columns listed as available at submission time
features_submission = [
    'row_id',
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'prior_group_answers_correct',
    'prior_group_responses',
]

f_sub = {*features_submission}  # Same names as a set, for set comparisons

# Types the test columns are cast to before prediction
dtypes_sub = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int8",
    task_container_id="int16",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="int8",
)

In [19]:
f_train = {*features}  # Set of the feature names the model was trained on

In [20]:
# print("Intersection :", f_sub & f_train)
# print("Difference   :", f_sub - f_train)
# print("Difference   :", f_train - f_sub)

In [21]:
# Display the final list of feature names passed to the model
features

['row_id',
 'timestamp',
 'user_id',
 'content_id',
 'content_type_id',
 'task_container_id',
 'prior_question_elapsed_time',
 'prior_question_had_explanation',
 'cvar',
 'cmean']

___

In [22]:
# Create the competition environment; it provides the test iterator used below
# and receives the predictions through env.predict
env = riiideducation.make_env()

In [23]:
# Create the iterator over the test set; each item is a
# (test_df, sample_prediction_df) pair, as unpacked in the prediction loop
iter_test = env.iter_test()

In [24]:
# Iterate over the test batches and submit a prediction for every question row
for (test_df, sample_prediction_df) in iter_test:

    # Add the per-content aggregates computed on the training data
    X = link_to_aggs(test_df, c_aggs, 'content_id')

    # Predictions are written back to test_df by position, so make sure the
    # feature rows follow the row order of test_df (a pandas left merge keeps
    # the order of the left keys; a cuDF merge does not guarantee any order).
    X = test_df[['row_id']].merge(X, on='row_id', how='left')

    # Only keep the model features; copy so the fills below write to an
    # independent frame instead of a slice
    X = X[features].copy()

    # Encode missing values exactly as during training (-1 for both columns);
    # filling the explanation flag with False would turn "missing" into
    # "no explanation", a value the model saw with a different meaning.
    X["prior_question_elapsed_time"] = X["prior_question_elapsed_time"].fillna(-1)
    X["prior_question_had_explanation"] = X["prior_question_had_explanation"].fillna(-1)

    X = X.astype(dtypes_sub)  # Cast to the expected column types

    # Hard 0/1 predictions using a 0.65 decision threshold
    predictions = predict_from(model, X, 0.65)

    test_df['answered_correctly'] = predictions  # Assign predictions

    # Submit the rows with content_type_id == 0 only
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

___