In [1]:
import gc
import pickle
from collections import defaultdict

import lightgbm as lgb
import numpy as np
import pandas as pd
from datatable import dt, fread
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

from utils.feature_eng import *

In [2]:
# Cross-validation split: training and validation CSVs.
train_path, valid_path = (
    '/home/carlchao/Riiid-/CV/small_csv_file/cv1_train.csv',
    '/home/carlchao/Riiid-/CV/small_csv_file/cv1_valid.csv',
)
# Example test file and the question metadata table.
test_path, question_file = (
    '/home/carlchao/Riiid_data/data/example_test.csv',
    '/home/carlchao/Riiid_data/data/questions.csv',
)

In [3]:
%%time
# Columns kept from the raw CSVs; task_container_id and user_answer are left out.
feld_needed = [
    'row_id',
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
# Read each split with fread, convert to pandas, then keep only the listed columns.
train = fread(train_path).to_pandas().loc[:, feld_needed]
valid = fread(valid_path).to_pandas().loc[:, feld_needed]


CPU times: user 3.87 s, sys: 302 ms, total: 4.17 s
Wall time: 576 ms


In [4]:
# Example test file, converted to pandas with no column selection applied.
test = (
    fread(test_path)
    .to_pandas()
)

In [5]:

def _attach_by_content_id(frame, lookup):
    """Return ``frame`` with the ``lookup`` row of each ``content_id`` appended column-wise.

    ``lookup`` must be indexed (uniquely) by the id it describes. Ids that are
    absent from ``lookup`` produce a row of NaN instead of silently picking up
    another id's values. The result always has a fresh 0..n-1 index.
    """
    matched = lookup.reindex(frame['content_id'].values).reset_index(drop=True)
    return pd.concat([frame.reset_index(drop=True), matched], axis=1)


# part
# Question metadata plus the columns produced by the project's Tags_encoder
# (presumably the PCA_* columns selected below — confirm in utils.feature_eng).
questions_df = pd.read_csv(question_file)
en = Tags_encoder(5)
questions_df = pd.concat(
    [questions_df, en.fit_transform(questions_df['tags'].fillna(' '))],
    axis=1,  # must be a keyword: pandas >= 2.0 rejects a positional axis
)

# Look questions up by their id rather than by row position, so the join does not
# depend on questions.csv being sorted with question_id == row number.
# drop=False keeps the question_id column, which a later cell drops explicitly.
question_cols = ['question_id', 'part', 'PCA_0', 'PCA_3', 'PCA_4']
question_lookup = questions_df.set_index('question_id', drop=False)[question_cols]
train = _attach_by_content_id(train, question_lookup)
valid = _attach_by_content_id(valid, question_lookup)

# Keep only rows whose content_type_id is False/0.
train = train.loc[train.content_type_id == False].reset_index(drop=True)
valid = valid.loc[valid.content_type_id == False].reset_index(drop=True)

# answered correctly average for each content
# Statistics come from the training split only and are then joined onto both splits.
content_df = train[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean','sum','count','std']).reset_index()
content_df.columns = ['content_id', 'answered_correctly_avg_c','answered_correctly_sum_c', 'answered_correctly_count_c', 'answered_correctly_std_c']

# content_df has a positional 0..k-1 index after reset_index(); re-indexing it directly by
# content_id is only correct when every id 0..k-1 occurs in train. Index by content_id so a
# content missing from train yields NaN instead of shifting every later id onto the wrong row.
content_lookup = content_df.set_index('content_id')
train = _attach_by_content_id(train, content_lookup)
valid = _attach_by_content_id(valid, content_lookup)

In [6]:


def add_user_feats(df, answered_correctly_sum_u_dict, content_dict, part_dict_sum):
    """Append running per-user features to ``df`` and fold every row into the state.

    Rows are processed in order. For each row the features are computed from the
    state as it stands *before* that row; only afterwards is the row's answer
    appended to the state, so a row never sees its own target.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``user_id``, ``answered_correctly``, ``content_id``
        and ``part``. ``int(part) - 1`` is used to index ``part_dict_sum``.
    answered_correctly_sum_u_dict : mapping of user_id -> list
        Answers recorded so far per user. Must create an empty list for unseen
        keys (e.g. ``defaultdict(list)``). Mutated in place.
    content_dict : mapping of user_id -> list
        Content ids recorded so far per user. Mutated in place.
    part_dict_sum : sequence of mappings of user_id -> list
        One mapping per part holding that user's answers. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra columns ``answered_correctly_sum_u``, ``count_u``,
        ``first_time``, ``part_sum_u``, ``part_count_u``,
        ``answered_correctly_avg_u`` and ``part_avg_u``. The two averages are
        NaN where the matching count is zero. The index of ``df`` is preserved.
    """
    n_rows = len(df)
    acsu = np.zeros(n_rows, dtype=np.int32)
    cu = np.zeros(n_rows, dtype=np.int32)
    # Named first_time rather than `bool` so the builtin is not shadowed.
    first_time = np.zeros(n_rows, dtype=np.int8)
    part_sum = np.zeros(n_rows, dtype=np.int32)
    part_cnt = np.zeros(n_rows, dtype=np.int32)

    rows = df[['user_id', 'answered_correctly', 'content_id', 'part']].values
    for cnt, (user, answer, content, part) in enumerate(tqdm(rows)):
        # Look each history up once; the same list objects are read, then extended.
        user_hist = answered_correctly_sum_u_dict[user]
        part_hist = part_dict_sum[int(part) - 1][user]
        seen_contents = content_dict[user]

        # Features from the history *before* this row.
        acsu[cnt] = np.sum(user_hist)
        cu[cnt] = len(user_hist)
        part_sum[cnt] = np.sum(part_hist)
        part_cnt[cnt] = len(part_hist)

        # Now record this row's answer.
        user_hist.append(answer)
        part_hist.append(answer)

        # first_time stays 0 (the array's initial value) for repeated content.
        if content not in seen_contents:
            seen_contents.append(content)
            first_time[cnt] = 1

    user_feats_df = pd.DataFrame(
        {
            'answered_correctly_sum_u': acsu,
            'count_u': cu,
            'first_time': first_time,
            'part_sum_u': part_sum,
            'part_count_u': part_cnt,
        },
        # Reuse df's index: with a fresh RangeIndex, concat(axis=1) would align on
        # labels and produce extra NaN rows whenever df is not indexed 0..n-1.
        index=df.index,
    )
    user_feats_df['answered_correctly_avg_u'] = user_feats_df['answered_correctly_sum_u'] / user_feats_df['count_u']
    user_feats_df['part_avg_u'] = user_feats_df['part_sum_u'] / user_feats_df['part_count_u']
    df = pd.concat([df, user_feats_df], axis=1)

    return df

def add_user_feats_without_update(df, answered_correctly_sum_u_dict, content_dict, part_dict_sum):
    """Append per-user features to ``df`` from the existing state, leaving the state untouched.

    Every row is featurised against the same, unchanged state: nothing from
    ``df`` is recorded, and no entries are created for users the state has not
    seen (lookups go through ``.get`` so a ``defaultdict`` is not populated).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``user_id``, ``content_id`` and ``part``.
        ``int(part) - 1`` is used to index ``part_dict_sum``.
    answered_correctly_sum_u_dict : dict-like of user_id -> list
        Answers recorded per user. Read only.
    content_dict : dict-like of user_id -> list
        Content ids recorded per user. Read only.
    part_dict_sum : sequence of dict-likes of user_id -> list
        One mapping per part holding that user's answers. Read only.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra columns ``answered_correctly_sum_u``, ``count_u``,
        ``first_time``, ``part_sum_u``, ``part_count_u``,
        ``answered_correctly_avg_u`` and ``part_avg_u``. The two averages are
        NaN where the matching count is zero. The index of ``df`` is preserved.
    """
    n_rows = len(df)
    acsu = np.zeros(n_rows, dtype=np.int32)
    cu = np.zeros(n_rows, dtype=np.int32)
    # Named first_time rather than `bool` so the builtin is not shadowed.
    first_time = np.zeros(n_rows, dtype=np.int8)
    part_sum = np.zeros(n_rows, dtype=np.int32)
    part_cnt = np.zeros(n_rows, dtype=np.int32)

    no_history = ()  # shared empty stand-in for users missing from the state

    rows = df[['user_id', 'content_id', 'part']].values
    for cnt, (user, content, part) in enumerate(rows):
        user_hist = answered_correctly_sum_u_dict.get(user, no_history)
        part_hist = part_dict_sum[int(part) - 1].get(user, no_history)

        acsu[cnt] = np.sum(user_hist)
        cu[cnt] = len(user_hist)
        part_sum[cnt] = np.sum(part_hist)
        part_cnt[cnt] = len(part_hist)

        # first_time stays 0 (the array's initial value) for content already recorded.
        if content not in content_dict.get(user, no_history):
            first_time[cnt] = 1

    user_feats_df = pd.DataFrame(
        {
            'answered_correctly_sum_u': acsu,
            'count_u': cu,
            'first_time': first_time,
            'part_sum_u': part_sum,
            'part_count_u': part_cnt,
        },
        # Reuse df's index: with a fresh RangeIndex, concat(axis=1) would align on
        # labels and produce extra NaN rows whenever df is not indexed 0..n-1.
        index=df.index,
    )
    user_feats_df['answered_correctly_avg_u'] = user_feats_df['answered_correctly_sum_u'] / user_feats_df['count_u']
    user_feats_df['part_avg_u'] = user_feats_df['part_sum_u'] / user_feats_df['part_count_u']
    df = pd.concat([df, user_feats_df], axis=1)

    return df

def update_user_feats(df, answered_correctly_sum_u_dict, count_u_dict, content_dict, part_dict_sum=None):
    """Record the answers in ``df`` into the per-user state; returns ``None``.

    Only rows with ``content_type_id == 0`` are recorded. For each such row the
    answer is appended to the user's overall history and to the user's history
    for the row's part (``int(part) - 1`` indexes ``part_dict_sum``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``user_id``, ``answered_correctly``,
        ``content_type_id`` and ``part``.
    answered_correctly_sum_u_dict : mapping of user_id -> list
        Must create an empty list for unseen keys. Mutated in place.
    count_u_dict, content_dict
        Accepted for signature compatibility; this function does not use them.
    part_dict_sum : sequence of mappings of user_id -> list, optional
        Per-part histories, mutated in place. When omitted, the module-level
        ``part_dict_sum`` is used, which is what this function always did
        before the parameter existed.

    Raises
    ------
    NameError
        If ``part_dict_sum`` is omitted and no module-level ``part_dict_sum`` exists.
    """
    if part_dict_sum is None:
        # Backward-compatible fallback to the notebook-level global of the same name.
        try:
            part_dict_sum = globals()['part_dict_sum']
        except KeyError:
            raise NameError("name 'part_dict_sum' is not defined") from None

    rows = df[['user_id', 'answered_correctly', 'content_type_id', 'part']].values
    for user, answer, content_type, part in rows:
        if content_type == 0:
            answered_correctly_sum_u_dict[user].append(answer)
            part_dict_sum[int(part) - 1][user].append(answer)

In [7]:
# Mutable per-user state consumed by the feature functions; every missing key
# starts out as an empty list.
answered_correctly_sum_u_dict = defaultdict(list)
content_dict = defaultdict(list)
# Seven independent mappings; the feature functions pick one with int(part) - 1.
part_dict_sum = list(defaultdict(list) for _ in range(7))

In [None]:
# user stats features with loops
# Order matters: train is processed first, so the three state objects already hold
# the train history when the valid rows are featurised against them.
# add_user_feats appends to the state in place, so re-running this cell without
# re-creating the state objects first would record the train rows a second time.
# NOTE(review): to_pickle writes into 'preprocess_fea_v3/' — presumably the directory
# must exist beforehand; confirm before starting the long loop.
train = add_user_feats(train, answered_correctly_sum_u_dict, content_dict, part_dict_sum)
train.to_pickle('preprocess_fea_v3/small_train.pickle')
# The valid rows' answers are not appended to the state.
valid = add_user_feats_without_update(valid, answered_correctly_sum_u_dict, content_dict, part_dict_sum)
valid.to_pickle('preprocess_fea_v3/small_valid.pickle')

HBox(children=(FloatProgress(value=0.0, max=9569193.0), HTML(value='')))

In [None]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``.

    The ``with`` block closes the file on exit (also when ``pickle.dump`` raises),
    so no explicit ``close()`` is needed.
    """
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)


# Persist the three per-user state objects, one pickle file each.
# open() does not create directories, so 'preprocess_fea_v3/' has to exist already.
_dump_pickle(answered_correctly_sum_u_dict, "preprocess_fea_v3/answered_correctly_sum_u_dict.pickle")
_dump_pickle(content_dict, "preprocess_fea_v3/content_dict.pickle")
_dump_pickle(part_dict_sum, "preprocess_fea_v3/part_dict_sum.pickle")

In [None]:

# Drop the identifier / join-key columns in place on both splits.
train.drop(columns=['user_id', 'content_id', 'content_type_id', 'question_id'], inplace=True)
valid.drop(columns=['user_id', 'content_id', 'content_type_id', 'question_id'], inplace=True)

# Cast to int8 to avoid a LightGBM dtype error: missing values become False first.
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(False).astype('int8')
valid['prior_question_had_explanation'] = valid['prior_question_had_explanation'].fillna(False).astype('int8')

In [None]:
TARGET = 'answered_correctly'
# Model inputs, in the column order later used for train[FEATS] / valid[FEATS].
FEATS = [
    'answered_correctly_avg_u',
    'answered_correctly_sum_u',
    'count_u',
    'answered_correctly_avg_c',
    'part',
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
    'first_time',
    'PCA_0',
    'PCA_3',
    'PCA_4',
    'part_avg_u',
    'part_sum_u',
    'part_count_u',
    'answered_correctly_sum_c',
    'answered_correctly_count_c',
    'answered_correctly_std_c',
]

# Take the labels out first: the target column is among the columns dropped below.
y_tr = train[TARGET]
y_va = valid[TARGET]

# Everything in train that is not a feature is removed from both splits, in place.
dro_cols = list(set(train.columns).difference(FEATS))
train.drop(columns=dro_cols, inplace=True)
valid.drop(columns=dro_cols, inplace=True)
_ = gc.collect()

In [None]:
# Display the 'mean' row of describe() for the training frame.
# NOTE(review): this cell was labelled "fillna", but nothing is filled here — presumably
# the means were being inspected as candidate fill values; confirm the intent.
train.describe().loc['mean']

In [None]:
# Wrap both splits as LightGBM datasets, declaring the same two categorical columns.
lgb_train = lgb.Dataset(
    data=train[FEATS],
    label=y_tr,
    categorical_feature=['part', 'prior_question_had_explanation'],
)
lgb_valid = lgb.Dataset(
    data=valid[FEATS],
    label=y_va,
    categorical_feature=['part', 'prior_question_had_explanation'],
)
# Remove the notebook-level names for the training frame and labels, then collect.
del train, y_tr
_ = gc.collect()

In [None]:
# Binary-objective booster: up to 10000 rounds, early stopping patience of 300,
# evaluation logged every 100 rounds on both the training and validation sets.
model = lgb.train(
    params={'objective': 'binary'},
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    categorical_feature=['part', 'prior_question_had_explanation'],
    early_stopping_rounds=300,
    verbose_eval=100,
)

# Score the validation frame with ROC AUC, then plot feature importance.
valid_auc = roc_auc_score(y_va, model.predict(valid[FEATS]))
print('auc:', valid_auc)
_ = lgb.plot_importance(model)

In [None]:

# ans = test_df['prior_group_answers_correct'][0]
# test_df = test_df.sort_values(['user_id','timestamp'], ascending=False)
# test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
# test_df = pd.concat([test_df.reset_index(drop=True), questions_df[['question_id', 'part','PCA_0','PCA_3','PCA_4']].reindex(test_df['content_id'].values).reset_index(drop=True)], axis=1)
# test_df = pd.concat([test_df.reset_index(drop=True), content_df.reindex(test_df['content_id'].values).reset_index(drop=True).drop(columns=['content_id'])], axis=1)
# test_df_save = test_df.copy()
# if ans !='[]':
#     test_df_save['answered_correctly'] = ans
#     test_df = update_user_feats(test_df, answered_correctly_sum_u_dict, content_dict, part_dict_sum)
    
# test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')