# Riiid Baseline

## Import libraries

In [None]:
import random
from collections import defaultdict
from time import time
import gc
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
# import optuna.integration.lightgbm as lgb

from tqdm.notebook import tqdm

import riiideducation

# Create the competition environment and its test-batch iterator.
# Failure is tolerated (best effort) so the rest of the notebook can still
# run; presumably this covers re-running the cell or running outside the
# competition runtime -- TODO confirm.
try:
    env = riiideducation.make_env()
    iter_test = env.iter_test()
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and report the reason instead of hiding
    # it: the inference loop later needs both `env` and `iter_test`.
    print(f'riiideducation environment not initialised: {exc!r}')

## Load data

In [None]:
# nrows = 10_000_000

In [None]:
start = time()  # wall-clock timer for the loading step
# train = pd.read_hdf('../input/riiid-train-data-multiple-formats/riiid_train.h5', stop=nrows)

# Interaction table: later cells select rows from it by index label and read
# its 'user_id', 'content_id' and 'answered_correctly' columns.
question_contents = pd.read_pickle('../input/riiid-preprocess-data/user_content.pkl')
# Question table: merged onto the interaction rows on 'content_id' in update_data.
questions = pd.read_pickle('../input/riiid-preprocess-data/questions.pkl')

# Collection of index sequences consumed by get_feature_train_val_idxs
# (presumably one sequence of row labels per user -- verify against the
# preprocessing notebook).
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
with open('../input/riiid-preprocess-data/user_id_idxs.pkl', 'rb') as f:
    user_id_idxs = pickle.load(f)

print(f'{time() - start:.2f}')  # elapsed seconds

## Split into feature-generating, training, and validation data

In [None]:
def get_feature_train_val_idxs(user_id_idxs, feature_size, train_size, val_size, new_user_frac=.2, seed=42):
    """Split groups of row indices into feature, train and validation lists.

    Groups are visited in a shuffled order. While the validation list is
    still short, a group either goes entirely to validation (with
    probability ``new_user_frac``) or is cut into three consecutive pieces:
    first half -> feature, a random-length middle piece -> train, and the
    remainder -> validation. Once validation is full, groups feed
    feature/train (half each) until train is full, and afterwards feed
    feature only.

    Args:
        user_id_idxs: Collection of index sequences, one per group
            (presumably one per user -- confirm with the preprocessing step).
        feature_size: Iteration stops once ``feature_idxs`` holds more than
            this many entries.
        train_size: Groups stop contributing to ``train_idxs`` once it has
            reached this length (checked only after validation is full).
        val_size: Groups contribute to ``val_idxs`` while it is shorter
            than this.
        new_user_frac: Probability that a group is placed entirely in
            validation.
        seed: Seed for both the group shuffle and the per-group random
            draws, making the split reproducible.

    Returns:
        Tuple ``(feature_idxs, train_idxs, val_idxs)`` of flat lists.
    """
    feature_idxs, train_idxs, val_idxs = [], [], []
    # Seeds NumPy's global generator (same side effect as before) for the
    # per-group draws below.
    np.random.seed(seed)
    # The shuffle goes through the `random` module, which np.random.seed does
    # not affect; a dedicated seeded generator makes the visiting order
    # reproducible without touching the module-level `random` state.
    shuffler = random.Random(seed)

    for indices in shuffler.sample(list(user_id_idxs), len(user_id_idxs)):
        if len(feature_idxs) > feature_size:
            break

        n_rows = len(indices)
        if n_rows == 0:
            # Nothing to distribute; also avoids np.random.randint(0, 0),
            # which raises ValueError.
            continue
        half = n_rows // 2

        if len(val_idxs) < val_size:
            if np.random.rand() < new_user_frac:
                # Whole group goes to validation: no rows of it reach the
                # feature or train lists.
                val_idxs.extend(indices)
            else:
                # Random cut point inside the second half of the group.
                offset = np.random.randint(half, n_rows)
                feature_idxs.extend(indices[:half])
                train_idxs.extend(indices[half:offset])
                val_idxs.extend(indices[offset:])
        elif len(train_idxs) < train_size:
            feature_idxs.extend(indices[:half])
            train_idxs.extend(indices[half:])
        else:
            feature_idxs.extend(indices)
    return feature_idxs, train_idxs, val_idxs

In [None]:
# Size thresholds (in rows) passed to get_feature_train_val_idxs.
feature_size = 50_000_000
train_size = 30_000_000
val_size = 2_500_000

start = time()
# new_user_frac is the probability that a sampled group is put entirely
# into the validation indices.
feature_idxs, train_idxs, val_idxs = get_feature_train_val_idxs(user_id_idxs, 
                                                                feature_size, 
                                                                train_size, 
                                                                val_size, 
                                                                new_user_frac=.2)

# Report the resulting list sizes and the elapsed seconds.
print(len(feature_idxs), len(train_idxs), len(val_idxs))
print(f'{time() - start:.2f}')

In [None]:
# Materialise the three subsets by index label from the interaction table.
feature_df = question_contents.loc[feature_idxs]
train_df = question_contents.loc[train_idxs]
val_df = question_contents.loc[val_idxs]

In [None]:
# The full table and the index lists are no longer needed; drop the
# references and trigger a garbage-collection pass.
del question_contents, feature_idxs, train_idxs, val_idxs
gc.collect()

## Feature engineering

In [None]:
# Label column, kept as a one-element list; used as `frame[target]` when
# building the LightGBM datasets.
target = ["answered_correctly"]

### Users_state

In [None]:
def get_users_state(feature_df):
    """Accumulate running per-user statistics from an interaction frame.

    Reads the 'user_id', 'content_id' and 'answered_correctly' columns of
    ``feature_df`` row by row.

    Returns:
        A defaultdict keyed by user id. Each value is a dict with
        'user_accuracy' (0.660 until the user has a recorded row),
        'correctly_answered_content_cnt', 'answered_content_cnt' and
        'user_content_attempts' (a per-content counter capped at 5).
    """
    def _fresh_state():
        # State handed out for a user that has not been seen yet.
        return {
            'user_accuracy': 0.660,
            'correctly_answered_content_cnt': 0,
            'answered_content_cnt': 0,
            'user_content_attempts': defaultdict(int),
        }

    users_state = defaultdict(_fresh_state)
    rows = feature_df[['user_id', 'content_id', 'answered_correctly']].values

    for user_id, content_id, answer in rows:
        state = users_state[user_id]

        # Attempt counter saturates at 5.
        attempts = state["user_content_attempts"]
        if attempts[content_id] < 5:
            attempts[content_id] += 1

        state["correctly_answered_content_cnt"] += answer
        state["answered_content_cnt"] += 1

        # The count was just incremented, so it is at least 1 here and the
        # ratio is always defined.
        state["user_accuracy"] = (
            state["correctly_answered_content_cnt"] / state["answered_content_cnt"]
        )

    return users_state

In [None]:
start = time()
# Build the initial per-user statistics from the feature-generating rows.
users_state = get_users_state(feature_df)
print(f'{time() - start:.2f}')  # elapsed seconds

In [None]:
# feature_df has been folded into users_state; release it.
del feature_df
gc.collect()

### Update users_state

In [None]:
def update_users_state(users_state, prev_test_df):
    """Fold newly observed answers into the per-user statistics, in place.

    Args:
        users_state: Mapping of user id -> state dict with the keys
            'user_accuracy', 'correctly_answered_content_cnt',
            'answered_content_cnt' and 'user_content_attempts'.
        prev_test_df: Frame providing 'user_id', 'content_id' and
            'answered_correctly' columns.

    Returns:
        The same ``users_state`` object, after mutation.
    """
    needed = ['user_id', 'content_id', 'answered_correctly']

    for user_id, content_id, answer in prev_test_df[needed].values:
        state = users_state[user_id]

        # Per-content attempt counter saturates at 5.
        attempts = state["user_content_attempts"]
        if attempts[content_id] < 5:
            attempts[content_id] += 1

        state["correctly_answered_content_cnt"] += answer
        state["answered_content_cnt"] += 1

        # The count was just incremented, so the ratio is always defined.
        state["user_accuracy"] = (
            state["correctly_answered_content_cnt"] / state["answered_content_cnt"]
        )

    return users_state

In [None]:
def update_data(data, users_state, questions, default_elapsed_time=23916):
    """Return a copy of ``data`` enriched with user, question and combined features.

    For every row the current per-user statistics are looked up in
    ``users_state`` and stored as columns, ``questions`` is left-merged on
    'content_id', harmonic means of the user accuracy with the content, part
    and tags accuracies are added, and missing
    'prior_question_elapsed_time' values are filled.

    Args:
        data: Frame with at least 'user_id', 'content_id' and
            'prior_question_elapsed_time' columns. It is not modified.
        users_state: Mapping of user id -> state dict as produced by
            get_users_state. With a defaultdict, looking up an unseen user
            or content id inserts the default entry.
        questions: Frame merged on 'content_id'; must provide
            'content_accuracy', 'part_accuracy' and 'tags_accuracy'.
        default_elapsed_time: Value used for missing
            'prior_question_elapsed_time' entries.

    Returns:
        The enriched frame (a new object produced by the merge).
    """
    start = time()

    user_accuracy = []
    answered_content_cnt = []
    correctly_answered_content_cnt = []
    user_content_attempts = []

    data = data.copy()

    for user_id, content_id in tqdm(data[['user_id', 'content_id']].values):
        # One lookup per row instead of four.
        state = users_state[user_id]
        user_accuracy.append(state['user_accuracy'])
        answered_content_cnt.append(state['answered_content_cnt'])
        correctly_answered_content_cnt.append(state['correctly_answered_content_cnt'])
        # "+ 1" counts the attempt this row represents; capped at 5 like the
        # stored counter.
        user_content_attempts.append(min(5, state['user_content_attempts'][content_id] + 1))

    data['user_accuracy'] = user_accuracy
    data['answered_content_cnt'] = answered_content_cnt
    data['correctly_answered_content_cnt'] = correctly_answered_content_cnt
    data['user_content_attempts'] = user_content_attempts

    data = data.merge(questions, how='left', on='content_id')

    # Harmonic mean 2ab / (a + b); yields NaN when both inputs are 0.
    data['hmean_user_content_accuracy'] = 2 * (data['user_accuracy'] * data['content_accuracy']) / (data['user_accuracy'] + data['content_accuracy'])
    data['hmean_user_part_accuracy'] = 2 * (data['user_accuracy'] * data['part_accuracy']) / (data['user_accuracy'] + data['part_accuracy'])
    data['hmean_user_tags_accuracy'] = 2 * (data['user_accuracy'] * data['tags_accuracy']) / (data['user_accuracy'] + data['tags_accuracy'])

    # Assign the filled column back: `data[col].fillna(..., inplace=True)`
    # acts on a temporary Series and leaves `data` untouched under pandas
    # Copy-on-Write (and is deprecated chained assignment in pandas 2.x).
    data['prior_question_elapsed_time'] = data['prior_question_elapsed_time'].fillna(default_elapsed_time)
#     data['prior_question_had_explanation'].fillna(False, inplace=True)

    print(f'{time() - start:.2f}')
    return data

### Train model

In [None]:
# Training features are computed from the state built on the feature rows only.
updated_train_df = update_data(train_df, users_state, questions)

In [None]:
# Fold the training rows' answers into the state before the validation
# features are computed.
users_state = update_users_state(users_state, updated_train_df)

In [None]:
# Validation features use the state that now includes the training rows.
updated_val_df = update_data(val_df, users_state, questions)

In [None]:
# The raw frames are superseded by their updated_* counterparts.
del train_df, val_df
gc.collect()

In [None]:
# Numeric model inputs. The commented-out names are columns that
# update_data still computes but that are currently not fed to the model.
features = [
    # user-based features
    "user_accuracy",
    "correctly_answered_content_cnt",
    "answered_content_cnt",
    
    # content-based features
    "content_accuracy",
#     'tags_accuracy',
#     'part_accuracy',
    
    # given features
    'prior_question_elapsed_time',
    
    # other features
    "hmean_user_content_accuracy",
#     "hmean_user_tags_accuracy",
#     "hmean_user_part_accuracy",
    'user_content_attempts'
]

# Columns declared as categorical to LightGBM; appended after `features`,
# so the model's column order is features + categorical_features.
categorical_features = [
    "part",
#     'prior_question_had_explanation',
    'tags'
]

# free_raw_data=False keeps the raw frame attached to the Dataset.
train_data = lgb.Dataset(
    data=updated_train_df[features + categorical_features],
    label=updated_train_df[target],
    categorical_feature=categorical_features,
    free_raw_data=False
)

# The validation set references train_data.
val_data = lgb.Dataset(
    data=updated_val_df[features + categorical_features],
    label=updated_val_df[target],
    categorical_feature=categorical_features,
    free_raw_data=False,
    reference=train_data
)

In [None]:
# Drop this reference to the training frame (train_data was built with
# free_raw_data=False). updated_val_df is kept: it is used again later to
# update users_state.
del updated_train_df
gc.collect()

In [None]:
# Binary classification evaluated with AUC; all other parameters are left
# at LightGBM's defaults.
lgbm_params = {
    "objective":"binary",
    "metric":"auc"
}

# Filled by lgb.train with the per-iteration metric values.
evals_result = {}

# No previous booster: init_model=None below means training starts fresh.
model = None

start = time()
# NOTE(review): verbose_eval, early_stopping_rounds and evals_result are
# keyword arguments of older LightGBM releases; newer releases expect the
# equivalent callbacks instead -- confirm against the installed version.
model = lgb.train(
    params = lgbm_params,
    train_set = train_data, 
    valid_sets = [train_data, val_data], 
    init_model = model,
    num_boost_round = 10_000,
    verbose_eval = 10,
    early_stopping_rounds = 50,
    evals_result = evals_result,
    categorical_feature = categorical_features
)

# Persist the trained booster in LightGBM's text format.
model.save_model('model.txt')

print(f'{time() - start:.2f}')  # elapsed seconds

In [None]:
# Gain-based importances, normalised so that they sum to 1.
feature_importances = model.feature_importance("gain")
feature_importances /= np.sum(feature_importances)

# The model's columns are ordered as features + categorical_features, so the
# first len(features) scores belong to the numeric features and the rest to
# the categorical ones.
n_plain = len(features)

for feat_name, score in zip(features, feature_importances[:n_plain]):
    print(f"{feat_name}: {score:.3f}")

for feat_name, score in zip(categorical_features, feature_importances[n_plain:]):
    print(f"{feat_name}: {score:.3f}")

lgb.plot_importance(model, importance_type='gain', dpi=100)
plt.show()

In [None]:
# Split-count importances (left unnormalised).
feature_importances = model.feature_importance("split")

# The model's columns are ordered as features + categorical_features.
n_plain = len(features)

for feat_name, score in zip(features, feature_importances[:n_plain]):
    print(f"{feat_name}: {score}")

# Categorical features are printed with two decimals, unlike the raw values
# printed for the numeric features above.
for feat_name, score in zip(categorical_features, feature_importances[n_plain:]):
    print(f"{feat_name}: {score:.2f}")

lgb.plot_importance(model, importance_type = 'split', dpi=100)
plt.show()

In [None]:
# Local import: only this cell needs it (safe parsing of the answers list).
import ast

# Column dtypes applied to each test batch before prediction.
test_dtype = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'prior_question_elapsed_time': 'float32',
#     'prior_question_had_explanation': 'category'
}

# Fold the validation answers into the state before serving test batches.
users_state = update_users_state(users_state, updated_val_df)

# Feature-enriched copy of the previous batch; its labels only arrive with
# the next batch.
prev_test_df = None

for test_df, _ in tqdm(iter_test):
    if prev_test_df is not None:
        # The first row of the current batch carries the previous batch's
        # answers as a list literal in text form. ast.literal_eval parses it
        # without executing arbitrary code, unlike eval() on external input.
        prev_test_df['answered_correctly'] = ast.literal_eval(
            test_df['prior_group_answers_correct'].iloc[0]
        )

        # Keep only rows with content_type_id == 0, the same rows that are
        # submitted for prediction below. Use them for BOTH the state update
        # and the incremental training set, so the other rows no longer end
        # up as training labels.
        answered_df = prev_test_df[prev_test_df['content_type_id'] == 0]
        users_state = update_users_state(users_state, answered_df)

        # Nothing to learn from a batch without answered rows; skipping also
        # avoids building an empty Dataset.
        if len(answered_df) > 0:
            # Continue boosting: the previous validation set becomes the
            # training set and the freshly labelled batch the validation set.
            train_data = val_data

            val_data = lgb.Dataset(data=answered_df[features+categorical_features],
                                   label=answered_df[target],
                                   categorical_feature=categorical_features,
                                   free_raw_data=False,
                                   reference=train_data
                                  )

            model = lgb.train(
                params = lgbm_params,
                train_set = train_data,
                valid_sets = [train_data, val_data],
                init_model = model,
                keep_training_booster=True,
                num_boost_round = 10_000,
                verbose_eval = 10,
                early_stopping_rounds = 50,
                categorical_feature = categorical_features
                )

    test_df = update_data(test_df, users_state, questions)

    test_df = test_df.astype(test_dtype)

    test_df['answered_correctly'] = model.predict(test_df[features + categorical_features],
                                                  num_iteration=model.best_iteration)

    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

    # Keep the enriched batch so it can be labelled on the next iteration.
    prev_test_df = test_df.copy()

In [None]:
# Load the submission file from the working directory for inspection
# (presumably written by the competition environment -- confirm).
submission = pd.read_csv('./submission.csv')

In [None]:
# `display` is not imported in this file; it is presumably the notebook
# (IPython) built-in and would be undefined in a plain script.
display(submission)