In [None]:
# Install datatable from a local wheel file under ../input (a file path, not a package-index name).
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
import lightgbm as lgb
import riiideducation
from sklearn.metrics import roc_auc_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is int16/int32/int64 are narrowed to the smallest of
    int8/int16/int32/int64 that contains the column's observed minimum and
    maximum. Columns whose dtype is float16/float32/float64 are narrowed to
    float16 or float32 when the observed range fits, otherwise float64.
    Columns of any other dtype (including int8) are left untouched.

    The range checks are inclusive, so a column whose extremes sit exactly on a
    dtype's limits (e.g. -128..127) still gets that dtype.

    NOTE: the float checks look only at the value range, not at precision;
    float16 carries roughly three significant decimal digits.

    Args:
        df: pandas DataFrame. Its columns are reassigned in place.
        verbose: when true, print the resulting memory footprint in Mb and the
            percentage saved.

    Returns:
        The same ``df`` object, with the downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try integer widths from narrowest to widest; the first that fits wins.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit a narrower float (or is NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Create the competition environment; its iter_test() and predict() drive the submission loop at the end.
env = riiideducation.make_env()

In [None]:
%%time
# Read the training CSV with datatable's fread, then convert the result to a pandas DataFrame.
train = dt.fread("../input/riiid-test-answer-prediction/train.csv").to_pandas()

In [None]:
# Restrict the frame to the columns used downstream, then display their dtypes.
kept_columns = [
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
train = train[kept_columns]
train.dtypes

In [None]:
# Cast each listed column to a compact dtype, one column at a time.
column_dtypes = {
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
for column, target_dtype in column_dtypes.items():
    train[column] = train[column].astype(target_dtype)

In [None]:
# Keep only rows whose content_type_id is 0 and report how many rows were removed.
origin_shape = len(train)
train = train.loc[train['content_type_id'] == 0]
print(f'remove {origin_shape-train.shape[0]} useless datas.')

In [None]:
# Mean of prior_question_elapsed_time over the remaining rows; used later as the fill value for missing entries.
elapsed_mean = train['prior_question_elapsed_time'].mean()

In [None]:
# Order rows by ascending timestamp and replace the index with a fresh 0..n-1 range.
train = train.sort_values(by='timestamp').reset_index(drop=True)

In [None]:
# Per-content and per-user aggregates of the target and of prior_question_elapsed_time.
# String aggregation names replace the NumPy callables (np.size, np.std): pandas
# resolves those callables to these same group-by methods, and passing the
# callables is deprecated in recent pandas releases.
# NOTE(review): these statistics are computed on all rows of `train`, including the
# rows later moved into `validation` and `X`, so each sample's own label feeds its
# features — confirm this is intended.
target_stats = ['mean', 'sum', 'size', 'std']

results_c = train[['content_id', 'answered_correctly']].groupby(['content_id']).agg(target_stats)
results_c.columns = ['correctly_mean_content', 'correctly_sum_content', 'correctly_count_content', 'correctly_std_content']

results_u = train[['user_id', 'answered_correctly']].groupby(['user_id']).agg(target_stats)
results_u.columns = ['correctly_mean_user', 'correctly_sum_user', 'correctly_count_user', 'correctly_std_user']

results_etu = train[['user_id', 'prior_question_elapsed_time']].groupby(['user_id']).agg(['mean'])
results_etu.columns = ['elapsed_time_user']

results_etc = train[['content_id', 'prior_question_elapsed_time']].groupby(['content_id']).agg(['mean'])
results_etc.columns = ['elapsed_time_content_id']

In [None]:
# These two columns are not needed after sorting and filtering.
train = train.drop(columns=['timestamp', 'content_type_id'])

In [None]:
# Hold out up to 4 of each user's most recent remaining rows as the validation set.
# Every pass takes each user's last remaining row (rows are in timestamp order)
# and removes it from `train`. The pieces are collected in a list and concatenated
# once: DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every call.
validation_parts = []
for i in range(4):
    last_records = train.drop_duplicates('user_id', keep='last')
    train = train[~train.index.isin(last_records.index)]
    validation_parts.append(last_records)
validation = pd.concat(validation_parts)

len(validation)

In [None]:
# Build the training sample from up to 15 of each user's most recent remaining rows.
# Every pass takes each user's last remaining row and removes it from `train`.
# The pieces are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the growing frame on every call.
X_parts = []
for i in range(15):
    last_records = train.drop_duplicates('user_id', keep='last')
    train = train[~train.index.isin(last_records.index)]
    X_parts.append(last_records)
X = pd.concat(X_parts)

len(X)

In [None]:
# Left-join each aggregate table onto X, in the same order as the feature columns are expected.
for stats_table, join_key in (
    (results_u, 'user_id'),
    (results_c, 'content_id'),
    (results_etu, 'user_id'),
    (results_etc, 'content_id'),
):
    X = X.merge(stats_table, on=[join_key], how="left")

In [None]:
# Left-join the same aggregate tables onto the validation rows, in the same order.
for stats_table, join_key in (
    (results_u, 'user_id'),
    (results_c, 'content_id'),
    (results_etu, 'user_id'),
    (results_etc, 'content_id'),
):
    validation = validation.merge(stats_table, on=[join_key], how="left")

In [None]:
# Load question metadata; usecols picks CSV columns 0, 1, 3 and 4 by position.
# bundle_id is read as int16 instead of int8.
# NOTE(review): bundle ids presumably span the same range as question_id (read as
# int16 here), which int8 cannot represent — confirm against the data.
questions_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 1, 3, 4],
    dtype={
        'question_id': 'int16',
        'bundle_id': 'int16',
        'part': 'int8',
        'tags': 'str',
    },
)

In [None]:
# Turn each space-separated tag string into a list (at most 10 splits per row)
# and preview the first rows.
tags = questions_df['tags'].str.split(' ', n=10)
tags.head()

In [None]:
# A row with no tag string comes out of str.split as a missing value, not a list.
# Give every such row an empty list so len() works on all rows. This replaces the
# hand-written `tags[10033] = []`, which patched one hard-coded row index.
tags = tags.apply(lambda value: value if isinstance(value, list) else [])
questions_df['tags'] = tags
questions_df['tags_count'] = questions_df['tags'].apply(len)
# Drop questions without any tag; copy so later column assignments act on an
# independent frame rather than on a filtered selection.
questions_df = questions_df[questions_df['tags_count'] != 0].copy()

In [None]:
from collections import Counter

# Flatten every question's tag list into one list, then count how often each tag occurs.
tags_list = [
    tag
    for question_tags in questions_df['tags'].tolist()
    for tag in question_tags
]
tags_counter = dict(Counter(tags_list))

In [None]:
def tag_appr_means(tags):
    """Return the mean occurrence count of the given tags.

    Each tag is looked up in the module-level ``tags_counter`` mapping and the
    looked-up counts are averaged with ``np.mean``.
    """
    return np.mean([tags_counter[tag] for tag in tags])


questions_df['tags_appr_mean'] = questions_df['tags'].apply(tag_appr_means)

In [None]:
# The raw tag lists are no longer needed; shift `part` down by one and preview the result.
questions_df = questions_df.drop(columns=['tags'])
questions_df['part'] -= 1
questions_df.head()

In [None]:
# Fill the missing "prior question" fields the same way in both frames.
# Results are assigned back to the column instead of calling fillna(inplace=True)
# on a column selection: that chained in-place form is deprecated and leaves the
# parent frame unchanged when pandas Copy-on-Write is enabled.
for frame in (X, validation):
    # Missing explanation flags count as False before the cast to 0/1 integers.
    frame['prior_question_had_explanation'] = (
        frame['prior_question_had_explanation'].fillna(False).astype(np.int8)
    )
    frame['prior_question_elapsed_time'] = frame['prior_question_elapsed_time'].fillna(elapsed_mean)

In [None]:
# Attach question metadata to both frames by matching content_id to question_id.
question_join = dict(left_on='content_id', right_on='question_id', how='left')
X = X.merge(questions_df, **question_join)
validation = validation.merge(questions_df, **question_join)

In [None]:
# Attach question metadata to the rows still left in `train`.
train_m = train.merge(
    questions_df,
    left_on='content_id',
    right_on='question_id',
    how='left',
)

In [None]:
# Per-part mean, sum and standard deviation of the target.
# The string 'std' replaces the np.std callable: pandas resolves that callable to
# this same group-by method, and passing the callable is deprecated in recent releases.
train_m_part = train_m[['part', 'answered_correctly']].groupby(['part']).agg(['mean', 'sum', 'std'])
train_m_part.columns = ['correctly_mean_part', 'correctly_sum_part', 'correctly_std_part']

In [None]:
# Attach the per-part statistics to both frames.
X = X.merge(train_m_part, on=['part'], how="left")
validation = validation.merge(train_m_part, on=['part'], how="left")

In [None]:
# These question columns were only needed for the join and are not used as features.
unused_question_columns = ['bundle_id', 'question_id']
X = X.drop(columns=unused_question_columns)
validation = validation.drop(columns=unused_question_columns)

In [None]:
# Discard rows that still contain missing values, then downcast the numeric columns.
X = reduce_mem_usage(X.dropna())
validation = reduce_mem_usage(validation.dropna())

In [None]:
# Separate the label column from the feature columns for both sets.
target_column = 'answered_correctly'

y = X[target_column]
X = X.drop(columns=[target_column])

y_val = validation[target_column]
X_val = validation.drop(columns=[target_column])

In [None]:
# LightGBM settings: binary objective evaluated with AUC.
params = dict(
    objective='binary',
    metric='auc',
    max_bin=700,
    learning_rate=0.05,
)

# Both datasets declare the same categorical columns; the evaluation set
# references the training set.
categorical_features = ['part', 'prior_question_had_explanation']
lgb_train = lgb.Dataset(X, label=y, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(
    X_val,
    label=y_val,
    reference=lgb_train,
    categorical_feature=categorical_features,
)

In [None]:
# Train with a very high round limit, stopping after 200 rounds without improvement
# and logging every 100 rounds.
# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of the
# LightGBM version this notebook targets; newer releases reportedly expose them as
# callbacks instead — confirm before upgrading.
model = lgb.train(
    params,
    train_set=lgb_train,
    num_boost_round=100000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=200,
    verbose_eval=100,
)

In [None]:
# Remember the training feature order so the test frames can be aligned to it.
columns_features = X.columns.tolist()

#lgb_model = lgb.Booster(model_file='../input/lgb-1019/lgb_10_19.txt')

In [None]:
%%time
# Score the held-out rows and display the validation ROC AUC.
y_true = y_val.to_numpy()
y_pred = model.predict(X_val)
roc_auc_score(y_true, y_pred)

In [None]:
#model.save_model('lgb_1020.txt')

In [None]:
# Submission loop: rebuild the training features for each test batch, predict, and
# hand the predictions for rows with content_type_id == 0 back to the environment.
for (test_df, sample_prediction_df) in env.iter_test():
    # Attach the precomputed user/content statistics, question metadata and part statistics.
    test_df = pd.merge(test_df, results_u, on=['user_id'], how="left")
    test_df = pd.merge(test_df, results_c, on=['content_id'], how="left")
    test_df = pd.merge(test_df, results_etu, on=['user_id'], how="left")
    test_df = pd.merge(test_df, results_etc, on=['content_id'], how="left")
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df = pd.merge(test_df, train_m_part, on=['part'], how="left")

    # Keys with no matching statistics get 0.5 as their mean correctness.
    # Values are assigned back rather than filled in place on a column selection,
    # which is deprecated and has no effect under pandas Copy-on-Write.
    for column in ('correctly_mean_content', 'correctly_mean_user', 'correctly_mean_part'):
        test_df[column] = test_df[column].fillna(0.5)

    # `part` comes from questions_df, where it was already shifted down by one,
    # which is the encoding the model was trained on. It must not be shifted
    # again here: the earlier extra `- 1` put every inference value one below the
    # training encoding.
    test_df['part'] = test_df['part'].fillna(4)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(elapsed_mean)
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(False).astype(np.int8)
    )

    # Any value still missing becomes 0.
    test_df = test_df.fillna(0)

    # Align to the training feature order before predicting.
    test_data = test_df[columns_features]
    test_df['answered_correctly'] = model.predict(test_data)
    env.predict(test_df.loc[test_df['content_type_id']==0, ['row_id', 'answered_correctly']])