In [None]:
# Install datatable from a wheel file stored under ../input (a local path, not PyPI).
# The wheel's file name pins version 0.11.0 built for CPython 3.7 / manylinux2010 x86_64.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import ast
import gc
import warnings
from collections import defaultdict, Counter

import datatable as dt
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Column name -> dtype for every train.csv column that is loaded; the keys also
# serve as the column selection passed to the CSV reader.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    answered_correctly='int8',
    user_answer='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)

# Name of the label column.
target = 'answered_correctly'
# Number of most recent rows kept per user (train_size); valid_size is defined
# alongside it but not referenced elsewhere in this notebook section.
train_size, valid_size = 10, 2

In [None]:
def clear_mem():
    """Clear IPython's output and input history caches, then run the garbage collector.

    The body uses IPython line magics, so this function only works when the
    cell is executed by an IPython/Jupyter kernel.
    """
    # `-f` makes %reset run without asking for confirmation.
    %reset -f out
    %reset -f in
    gc.collect()

In [None]:
%%time
# Read only the columns named in data_types_dict with datatable, then convert
# the result to a pandas DataFrame.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns=set(data_types_dict.keys()),
).to_pandas()
# Missing explanation flags must become False *before* the cast below, because
# casting NaN to bool yields True. The result is assigned back to the column
# instead of using `inplace=True` on a column selection, which is not
# guaranteed to modify the parent frame (it is a no-op under Copy-on-Write).
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False)
)
train_df = train_df.astype(data_types_dict)

In [None]:
# Per user: number of rows ('count') and sum of content_type_id ('sum').
# Computed before the filter below so that rows with target == -1 are included.
user_watch_lecture_agg = (
    train_df
    .groupby('user_id')['content_type_id']
    .agg(['count', 'sum'])
)

# Keep only rows that carry a real label (target != -1).
train_df = train_df.loc[train_df[target].ne(-1)].reset_index(drop=True)
# Mean elapsed time over the non-missing values; used later as the fill value
# for missing prior_question_elapsed_time at inference.
prior_question_elapsed_time_avg = (
    train_df['prior_question_elapsed_time'].dropna().to_numpy().mean()
)

In [None]:
# Label statistics per user and per question.
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count', 'std', 'skew'])
# Distribution of prior_question_elapsed_time per question.
time_content_agg = train_df.groupby('content_id')['prior_question_elapsed_time'].agg(['max', 'min', 'std', 'skew'])
# Latest timestamp seen for each user, indexed by user_id.
# The previous `.tail()` returned the last five rows of every user indexed by
# *row position*, so the dict built from it could not be looked up by user_id
# when computing the time lag at inference.
user_timestamp_agg = train_df.groupby('user_id')['timestamp'].max()
# Number of distinct task containers each user has touched.
task_container_agg = train_df.groupby('user_id')['task_container_id'].nunique()

In [None]:
#train_df["attempt_no"] = 1
#train_df["attempt_one_question"] = train_df[['user_id','content_id','attempt_no']].groupby(["user_id","content_id"])["attempt_no"].cumsum()
#train_df.drop("attempt_no", axis=1, inplace=True)

In [None]:
#attempt_one_dict = train_df[['user_id', 'content_id','attempt_one_question']].groupby(['user_id','content_id'])['attempt_one_question'].max().to_dict()

In [None]:
# Keep only the last `train_size` rows of every user, then report the new
# shape and release cached notebook state.
train_df = (
    train_df
    .groupby('user_id')
    .tail(train_size)
    .reset_index(drop=True)
)
print(train_df.shape)
clear_mem()

In [None]:
# Question metadata: the first five columns of questions.csv with compact dtypes.
questions = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=list(range(5)),
    dtype=dict(
        question_id='int16',
        bundle_id='int16',
        correct_answer='int8',
        part='int8',
        tags='str',
    ),
)

# Lecture metadata: the first four columns of lectures.csv.
lecture = pd.read_csv(
    '../input/riiid-test-answer-prediction/lectures.csv',
    usecols=list(range(4)),
    dtype=dict(
        lecture_id='int16',
        tag='int16',
        part='int8',
        type_of='str',
    ),
)

In [None]:
#tags_set = set(list(lecture['tag']))

In [None]:
def question_type_of(x):
    """Collect the `type_of` values of all lectures that share a tag with `x`.

    Parameters
    ----------
    x : iterable
        Tag ids of one question.

    Returns
    -------
    list
        The `type_of` entries of every row of the global `lecture` frame whose
        `tag` equals one of the given tags, in iteration order of `x`; a
        lecture type appears once per matching lecture row.
    """
    # Build the lookup set once per call instead of once per tag.
    lecture_tags = set(lecture['tag'].tolist())
    types = []
    for i in x:
        if i in lecture_tags:
            types.extend(lecture.loc[lecture['tag'] == i, 'type_of'])
    return types

In [None]:
# Split each question's space-separated tag string into a list of tag tokens
# (at most 10 splits per string).
tags = questions["tags"].str.split(" ", n=10, expand=False)
# Overwrite the entry at index label 10033 with the single tag '162'
# (presumably this question has no tags in questions.csv — TODO confirm).
tags[10033] = ['162']
questions['tags'] = tags

def str_to_int(x):
    """Return the set of integers obtained by applying `int` to every item of `x`."""
    return {int(token) for token in x}

# Replace every tag list with a set of integer tag ids.
questions['tags'] = questions['tags'].map(str_to_int)

# For every question, list the types of the lectures that share one of its tags.
questions['question_type_of'] = questions['tags'].map(question_type_of)

In [None]:
# Display the first rows of the question table to check the derived columns.
questions.head()

In [None]:
# How many lectures of each type are linked to a question through its tags.
questions['type_of_solving_question'] = questions['question_type_of'].map(
    lambda kinds: kinds.count('solving question')
)
questions['type_of_concept'] = questions['question_type_of'].map(
    lambda kinds: kinds.count('concept')
)

In [None]:
# Number of tags attached to each question.
questions['tags_count'] = questions['tags'].map(len)

# Flatten all tag sets into one list and count how many questions use each tag.
tags_list = []
for tag in questions['tags'].tolist():
    tags_list += tag
tags_counter = dict(Counter(tags_list))

# Number of questions that share each bundle_id, mapped back onto every question.
bundle_dict = questions['bundle_id'].value_counts().to_dict()
questions['bundle_size'] = questions['bundle_id'].map(bundle_dict)

def tag_appr_means(tags):
    """Return the mean of the `tags_counter` counts of the given tags."""
    return np.mean([tags_counter[tag] for tag in tags])

def tag_appr_sum(tags):
    """Return the sum of the `tags_counter` counts of the given tags."""
    return np.sum([tags_counter[tag] for tag in tags])

def tag_appr_most(tags):
    """Return the largest `tags_counter` count among the given tags, with a floor of 0."""
    # The leading 0 reproduces the starting value of a running maximum, so an
    # empty input yields 0.
    return max([0] + [tags_counter[tag] for tag in tags])

# Tag-frequency summaries per question: mean, sum and maximum of the global
# usage counts of its tags.
questions['tags_appr_mean'] = questions['tags'].map(tag_appr_means)
questions['tags_appr_sum'] = questions['tags'].map(tag_appr_sum)
questions['tags_appr_most'] = questions['tags'].map(tag_appr_most)

#questions.loc[questions['question_id']==10033, 'tags_count'] = questions['tags_count'].mode()[0]
#questions.loc[questions['question_id']==10033, 'tags_appr_mean'] = questions['tags_appr_mean'].median()
#questions.loc[questions['question_id']==10033, 'type_of_concept'] = 1
#questions.loc[questions['question_id']==10033, 'type_of_solving_question'] = 1
#questions.loc[questions['question_id']==10033, 'type_of_starter'] = 0
#questions.loc[questions['question_id']==10033, 'type_of_intention'] = 0

#questions_df.drop(['tags', 'question_type_of'], axis=1, inplace=True)

In [None]:
# Index the question table by question_id so it can be joined on content_id.
questions = questions.set_index('question_id')
train_df = train_df.join(questions, on='content_id', how='left')

In [None]:
# train_df['only_tag'].mode()
# {73}

In [None]:
#def extract_only_tag(x):
#    return tags_set.intersection(x)

#train_df['only_tag'] = train_df['tags'].apply(extract_only_tag)
#train_df['only_tag'] = train_df['only_tag'].apply(lambda x: list(x)[0] if len(x)>0 else 73)

In [None]:
#enc = TargetEncoder(cols=['only_tag'], return_df=False)
#enc.fit(train_df['only_tag'], train_df[target])
# The training frame is not used after this point; drop it and release cached
# notebook state before inference.
del train_df
clear_mem()

# The feature list that used to follow here was overwritten unconditionally by
# the `feats` definition in the next cell (and named columns that list no
# longer contains), so it has been removed; `feats` is defined once, below.

In [None]:
# Ordered list of columns passed to the model. The order is kept exactly as
# the concatenation below spells it out.
feats = (
    # raw request fields
    ['prior_question_elapsed_time', 'prior_question_had_explanation']
    # per-user history
    + ['watch_lecture_%', 'watch_lecture_sum', 'user_avg', 'lag_time',
       'task_container_id_nunique', 'attempt_one_question']
    # elapsed-time statistics of the question
    + ['elapsed_time_std', 'elapsed_time_skew', 'max-elapsed_time', 'elapsed_time-min']
    # question metadata
    + ['part', 'type_of_solving_question', 'type_of_concept',
       'tags_count', 'bundle_size', 'tags_appr_mean', 'tags_appr_sum']
    # answer statistics of the question
    + ['content_count', 'content_avg', 'content_std', 'content_skew']
    # combination of user and question averages
    + ['hmean_by_user_content']
)

In [None]:
# Columns treated as categorical.
cat_cols = [
    'prior_question_had_explanation',
    'is_162',
    'tag_alone',
]

In [None]:
# Load a previously trained LightGBM booster from a saved model file instead of
# training in this notebook (the commented line shows how such a file is written).
#model.save_model('sazuma_tail_24_6.txt')
model = lgb.Booster(model_file='../input/riiids-models-and-dicts/sazuma_tail_1927_7.txt')

In [None]:
def get_max_attempt_one(user_id, content_id):
    """Increment and return the attempt counter of a (user, question) pair.

    The counter lives in the global `attempt_one_dict`; a pair that has not
    been seen before starts at 1.
    """
    pair = (user_id, content_id)
    attempts = attempt_one_dict.get(pair, 0) + 1
    attempt_one_dict[pair] = attempts
    return attempts

def mapper_162(x):
    """Return 1 if tag 162 is among the tags in `x`, otherwise 0.

    The question tags are stored as sets of ints, so the membership test must
    use the integer 162; testing only for the string '162' never matched.
    The string form is still accepted for callers that pass unparsed tags.
    """
    if 162 in x or '162' in x:
        return 1
    return 0
    
def mapper_tag_alone(x):
    """Return 1 when `x` holds exactly one element, otherwise 0."""
    return int(len(x) == 1)

In [None]:
# Load the saved attempt counters; `.item()` unwraps the 0-d object array that
# np.load returns into the stored Python object.
# NOTE(review): allow_pickle=True unpickles the file, which can execute arbitrary
# code — only load files from a trusted source.
attempt_one_dict = np.load('../input/riiids-models-and-dicts/attempt_one_dict_1927_7.npy', allow_pickle=True).item()

In [None]:
def _as_defaultdict(series, dtype=None):
    """Return `series` as a defaultdict(int), optionally casting its values first."""
    if dtype is not None:
        series = series.astype(dtype)
    return series.to_dict(defaultdict(int))


# Running totals that are looked up and updated during inference; keys missing
# from the training data default to 0.
user_sum_dict = _as_defaultdict(user_agg['sum'], 'int16')
user_count_dict = _as_defaultdict(user_agg['count'], 'int16')
content_sum_dict = _as_defaultdict(content_agg['sum'], 'int32')
content_count_dict = _as_defaultdict(content_agg['count'], 'int32')
user_watch_lecture_sum_dict = _as_defaultdict(user_watch_lecture_agg['sum'], 'int16')
user_watch_lecture_count_dict = _as_defaultdict(user_watch_lecture_agg['count'], 'int16')
user_timestamp_dict = _as_defaultdict(user_timestamp_agg)
task_container_dict = _as_defaultdict(task_container_agg)

clear_mem()

In [None]:
# Create the competition environment and the iterator that yields the test batches.
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()
# Holds a copy of the previous batch so its labels can be attached once the
# next batch delivers them; None until the first batch has been processed.
prior_test_df = None

In [None]:
# Main inference loop: for every batch delivered by the environment, first fold
# the labels of the previous batch into the running statistics, then build the
# feature columns for the current batch, predict, and submit.
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # The first row of the current batch carries the labels of the previous
        # batch as a string-encoded list. literal_eval parses that literal
        # without executing arbitrary code (unlike eval).
        prior_test_df[target] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])

        user_ids = prior_test_df['user_id'].values
        content_type_ids = prior_test_df['content_type_id'].values

        # Lecture counters are updated from every row of the previous batch.
        for user_id, content_type_id in zip(user_ids, content_type_ids):
            user_watch_lecture_sum_dict[user_id] += content_type_id
            user_watch_lecture_count_dict[user_id] += 1

        # Rows labelled -1 do not contribute to the correctness statistics.
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)

        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values

        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    # Keep an unsorted copy: the labels delivered with the next batch are
    # assigned to this copy row by row.
    prior_test_df = test_df.copy()

    test_df = test_df.sort_values(by='timestamp')

    # 1 where a row's task container differs from the same user's previous row
    # in this batch; the first row of each user compares against NaN and is
    # therefore always 1.
    user_task_container_shift = test_df.groupby('user_id')['task_container_id'].shift()
    test_df['task_container_id_shift'] = user_task_container_shift
    test_df['task_container_id_not_equal'] = (test_df['task_container_id'] != test_df['task_container_id_shift']).astype(np.int8)

    user_watch_lecture_sum = np.zeros(len(test_df), dtype=np.int16)
    user_watch_lecture_count = np.zeros(len(test_df), dtype=np.int16)
    user_task_container_id_count = np.zeros(len(test_df), dtype=np.int32)

    for i, (user_id, content_type_id, task_container_id_not_equal) in enumerate(zip(test_df['user_id'].values, test_df['content_type_id'].values, test_df['task_container_id_not_equal'].values)):
        user_watch_lecture_sum[i] = user_watch_lecture_sum_dict[user_id]
        user_watch_lecture_count[i] = user_watch_lecture_count_dict[user_id]
        # update the numbers of task_container per user
        task_container_dict[user_id] = task_container_dict[user_id] + task_container_id_not_equal
        user_task_container_id_count[i] = task_container_dict[user_id]

    # Users without any recorded rows give 0 / 0 here, i.e. NaN.
    test_df['watch_lecture_%'] = user_watch_lecture_sum / user_watch_lecture_count
    test_df['watch_lecture_sum'] = user_watch_lecture_sum
    test_df['task_container_id_nunique'] = user_task_container_id_count

    # Only rows with content_type_id == 0 are scored.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)

    test_df = test_df.join(questions, on='content_id', how='left')

    test_df['is_162'] = test_df['tags'].apply(mapper_162)
    test_df['tag_alone'] = test_df['tags'].apply(mapper_tag_alone)

    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    user_timestamp_diffs = np.zeros(len(test_df), dtype=np.int64)

    for i, (user_id, timestamp, content_id) in enumerate(zip(test_df['user_id'].values, test_df['timestamp'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        ### every user's timestamp diff ###
        user_timestamp_diffs[i] = timestamp - user_timestamp_dict.get(user_id, 0)
        user_timestamp_dict[user_id] = timestamp

    #test_df['only_tag'] = test_df['tags'].apply(extract_only_tag)
    #test_df['only_tag'] = test_df['only_tag'].apply(lambda x: list(x)[0] if len(x)>0 else 73)
    #test_df['only_tag'] = enc.transform(test_df['only_tag'])

    # A lag of 0 is replaced by the previous row's lag.
    # NOTE(review): the forward fill is not grouped by user, so the value may
    # come from a different user's row — confirm this matches training.
    test_df['lag'] = user_timestamp_diffs
    test_df['lag'] = test_df['lag'].replace(0, np.nan)
    test_df['lag'] = test_df['lag'].ffill()
    test_df['lag_time'] = test_df['lag'] - test_df['prior_question_elapsed_time']
    test_df['user_avg'] = user_sum / user_count
    #test_df['user_count'] = user_count
    #test_df['user_sum'] = user_sum
    test_df['content_count'] = content_count
    test_df['content_avg'] = content_sum / content_count
    test_df['content_std'] = test_df['content_id'].map(content_agg['std'])
    test_df['content_skew'] = test_df['content_id'].map(content_agg['skew'])
    # Harmonic mean of the user's and the question's average correctness.
    test_df['hmean_by_user_content'] = 2*test_df['user_avg']*test_df['content_avg'] / (test_df['user_avg']+test_df['content_avg'])

    test_df['elapsed_time_std'] = test_df['content_id'].map(time_content_agg['std'])
    test_df['elapsed_time_skew'] = test_df['content_id'].map(time_content_agg['skew'])
    test_df['max-elapsed_time'] = test_df['content_id'].map(time_content_agg['max']) - test_df['prior_question_elapsed_time']
    test_df['elapsed_time-min'] = test_df['prior_question_elapsed_time'] - test_df['content_id'].map(time_content_agg['min'])

    # Attempt counter per (user, question), updated in row order and capped at 3.
    # A plain comprehension replaces the row-wise DataFrame.apply: it visits the
    # rows in the same order but also yields a valid (empty) column when the
    # batch contains no question rows.
    test_df["attempt_one_question"] = [
        get_max_attempt_one(user_id, content_id)
        for user_id, content_id in zip(test_df['user_id'].values, test_df['content_id'].values)
    ]
    test_df["attempt_one_question"] = test_df["attempt_one_question"].clip(upper=3)

    # NOTE(review): these fills run after lag_time, max-elapsed_time and
    # elapsed_time-min were derived, so those columns keep NaN wherever
    # prior_question_elapsed_time was missing — confirm this matches training.
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_avg)

    test_df[target] = model.predict(test_df[feats])
    env.predict(test_df[['row_id', target]])