In [1]:
import os
import gc
import time
import gzip
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import datatable as dt

In [2]:
import random
import warnings

# Notebook-wide display settings: show up to 100 rows / 100 columns of a frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Fixed seed for the stdlib RNG so repeated runs draw the same random numbers.
random.seed(1)

In [19]:
# DATA_PATH = '../input/riiid-test-answer-prediction/'
MY_DATA_PATH = './my_data/'
CACHE_PATH = './lgb1215weights/'

# Directory holding the raw CSV files. The default is the original author's
# local path; set the RIIID_DATA_PATH environment variable to point somewhere
# else without editing the notebook. os.path.join(..., '') guarantees a trailing
# separator, which matters because later code builds paths both as
# `DATA_PATH + file_train` and as f'{DATA_PATH}/questions.csv'.
DATA_PATH = os.path.join(
    os.environ.get('RIIID_DATA_PATH', '/home/zuoyuhui/datasets/riid准确回答/'),
    '',
)
file_train = 'train.csv'
file_questions = 'questions.csv'
file_lectures = 'lectures.csv'

In [20]:
# Create the cache directory if needed. makedirs(exist_ok=True) also creates
# missing parent directories and does not fail when the directory already
# exists, unlike the check-then-mkdir pattern.
os.makedirs(CACHE_PATH, exist_ok=True)


DEBUG = True
OFFLINE = True
CV5 = False  # when True, the train-loading cell rebuilds the five time-ordered CV splits
CV = False  # "plain cut" per the original note; not referenced in the visible cells

# if OFFLINE:
#     nrows = 1250000
# else:
#     nrows = None

if DEBUG:
    # NOTE: both paths are rebuilt from their current values, so re-running
    # this cell appends another '/debug/' level each time.
    MY_DATA_PATH = f'{MY_DATA_PATH}/debug/'
    CACHE_PATH = f'{CACHE_PATH}/debug/'
    os.makedirs(CACHE_PATH, exist_ok=True)

config_file = f'{CACHE_PATH}/config.pkl'

In [45]:
# Load the "CA" sheet of the workbook, skipping its first two rows, and preview it.
data = pd.read_excel('1.xlsx', skiprows=2, sheet_name="CA")
data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,-,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Crank Angle,Cylinder Volume,Crank Angle.1,Cylinder Volume.1
0,-360.0,-0.004102,-180.0,1.38103,-360.0,1.49343,-360.0,1.49343,-360.0,1.49343,-360.0,1.49343,-360.0,0.00612,-360.0,-12.669,-120.0,1.274,-30.0,663.524,-30.0,-2.26684,-30.0,0.0,-360.0,-0.001712,-360.0,-0.006762,-360.0,0.061728,-360.0,42.7702
1,-359.0,-0.004108,-179.0,1.38635,-359.5,1.50003,-359.0,1.50567,-359.0,1.50567,-359.0,1.50567,-359.0,0.00284,-359.0,-12.6691,-118.0,1.334,-29.0,670.418,-29.0,-2.7611,-29.0,-2.51397,-359.0,-0.0017,-359.0,-0.006766,-359.95,0.061729,-359.95,42.7703
2,-358.0,-0.004098,-178.0,1.41263,-359.0,1.50567,-358.0,1.49911,-358.0,1.49911,-358.0,1.49911,-358.0,-0.0171,-358.0,-12.669,-116.0,1.328,-28.0,677.426,-28.0,-2.43798,-28.0,-5.11351,-358.0,-0.001696,-358.0,-0.00676,-359.9,0.061729,-359.9,42.7709
3,-357.0,-0.0041,-177.0,1.42679,-358.5,1.44443,-357.0,1.47147,-357.0,1.47147,-357.0,1.47147,-357.0,-0.02568,-357.0,-12.6695,-114.0,1.861,-27.0,685.179,-27.0,-2.66017,-27.0,-7.66259,-357.0,-0.001694,-357.0,-0.00678,-359.85,0.061731,-359.85,42.7717
4,-356.0,-0.00412,-176.0,1.41187,-358.0,1.49911,-356.0,1.44775,-356.0,1.44775,-356.0,1.44775,-356.0,-0.02714,-356.0,-12.6697,-112.0,1.401,-26.0,692.056,-26.0,-2.87398,-26.0,-10.4297,-356.0,-0.001712,-356.0,-0.006788,-359.8,0.061732,-359.8,42.7729


In [62]:
def is_number(s):
    """Return True when ``s`` is a finite whole number (e.g. ``-360.0``).

    Despite its name, this helper does not test "is numeric"; it is used to
    keep only the rows whose value has no fractional part.

    The value is converted with ``float`` instead of inspecting its string
    form, so integer inputs (``5``) and floats printed in exponent notation
    (``1e+16``) are recognised, while non-numeric text such as ``'abc.0'`` is
    rejected.

    Args:
        s: Any value; typically a float taken from a DataFrame column.

    Returns:
        bool: True for finite values with a zero fractional part, False for
        everything else (fractions, NaN, infinities, None, non-numeric text).
    """
    try:
        value = float(s)
    except (TypeError, ValueError, OverflowError):
        return False
    # float.is_integer() is already False for NaN and +/-inf.
    return value.is_integer()

# Keep the rows whose crank angle lies in [-360, 539] (both ends inclusive),
# restricted to the angle/volume pair, then keep only the angles accepted by is_number.
T = data.loc[data['Crank Angle.1'].between(-360, 539), ['Crank Angle.1', 'Cylinder Volume.1']]
T = T[T['Crank Angle.1'].apply(is_number)]

Unnamed: 0,Crank Angle.1,Cylinder Volume.1
0,-360.0,42.7702
20,-359.0,42.8389
40,-358.0,43.0453
60,-357.0,43.3890
80,-356.0,43.8700
...,...,...
17900,535.0,734.7280
17920,536.0,735.0590
17940,537.0,735.3160
17960,538.0,735.5000


In [21]:
def save_pickle(dic, save_path):
    """Pickle ``dic`` into a gzip-compressed file at ``save_path``.

    Args:
        dic: Any picklable object (not necessarily a dict).
        save_path: Destination file path; an existing file is overwritten.
    """
    with gzip.open(save_path, mode='wb') as sink:
        pickle.dump(dic, sink)
        
def load_pickle(load_path):
    """Read back an object stored as a gzip-compressed pickle.

    NOTE: unpickling can execute arbitrary code; only use this on files
    produced by this project.

    Args:
        load_path: Path of the gzip-compressed pickle file.

    Returns:
        The unpickled object.
    """
    with gzip.open(load_path, mode='rb') as source:
        return pickle.load(source)

# Memory-compression helper
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is cast to the first dtype (int8 -> int16 -> int32 -> int64, or
    float16 -> float32 -> float64) whose range strictly contains the column's
    min and max. Other columns are left untouched.

    NOTE: float columns may be narrowed to float16, which keeps only about
    three significant decimal digits; values are rounded accordingly.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True (default), print the memory usage after the
            conversion and the relative saving.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # np.iinfo / np.finfo expose the representable range of each candidate dtype.
        if str(col_type)[:3] == 'int':
            candidates = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32, np.int64)]
        else:
            candidates = [(t, np.finfo(t)) for t in (np.float16, np.float32, np.float64)]

        # Take the narrowest dtype that strictly contains [c_min, c_max]; if none
        # matches (e.g. an all-NaN column) the column keeps its dtype.
        for target, limits in candidates:
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(target)
                break

    # Measured once, after the loop, so it is defined even when no column was numeric.
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is :{:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

=============================================================================================================================

# load data

## question_data

In [22]:
# question_data
questions_df = pd.read_csv(f'{DATA_PATH}/questions.csv')

# 1 when the question's id equals its bundle id, else 0 (the original note counts 9765 such rows).
questions_df['content_bundle_same'] = questions_df['question_id'].eq(questions_df['bundle_id']).astype(int)

# Number of space-separated tags; a missing tags value counts as 0.
questions_df['tags_len'] = questions_df['tags'].apply(
    lambda tags: len(str(tags).split(' ')) if str(tags) != 'nan' else 0
)

# Number of questions that share the question's part.
questions_df['part_content_num'] = questions_df.groupby('part')['question_id'].transform('count')

# Split the tags string into a list of tag strings; a missing value becomes an empty list.
questions_df['tags'] = questions_df['tags'].apply(
    lambda tags: str(tags).split(' ') if str(tags) != 'nan' else []
)

In [23]:
# Lookup tables keyed by question_id: bundle id, part, and the tag list built in the previous cell.
question_bundle_dict = dict(zip(questions_df['question_id'].values, questions_df['bundle_id'].values))
question_part_dict = dict(zip(questions_df['question_id'].values, questions_df['part'].values))
question_tags_dict = dict(zip(questions_df['question_id'].values, questions_df['tags'].values))

In [24]:
# For every bundle_id, collect the distinct question ids that belong to it ...
bundle_df = questions_df.groupby('bundle_id')['question_id'].unique()
# ... and keep only the bundles that contain more than one question.
bundle_df = bundle_df[bundle_df.map(len) > 1]
bundle_df

bundle_id
1400        [1400, 1401, 1402]
1403        [1403, 1404, 1405]
1406        [1406, 1407, 1408]
1409        [1409, 1410, 1411]
1412        [1412, 1413, 1414]
                 ...          
13238    [13238, 13239, 13240]
13241    [13241, 13242, 13243]
13244    [13244, 13245, 13246]
13247    [13247, 13248, 13249]
13250    [13250, 13251, 13252]
Name: question_id, Length: 1614, dtype: object

In [25]:
# Map each question of a multi-question bundle to (first question id of the bundle, bundle size),
# e.g. 1400 -> (1400, 3).
bundle_mapping = {
    qid: (id_list[0], len(id_list))
    for id_list in bundle_df.values
    for qid in id_list
}

--------------------------------------------------------------------------------------------------------------------
## lecture_data

In [26]:
lectures_df = pd.read_csv(f'{DATA_PATH}/lectures.csv')

# Lookup tables keyed by lecture_id.
lecture_tag_dict = dict(zip(lectures_df['lecture_id'].values, lectures_df['tag'].values))  # lecture -> tag code
lecture_part_dict = dict(zip(lectures_df['lecture_id'].values, lectures_df['part'].values))  # lecture -> part
lecture_type_dict = dict(zip(lectures_df['lecture_id'].values, lectures_df['type_of'].values))  # lecture -> type_of label

In [27]:
# Display up to the last five rows of every lecture_id group.
lectures_df.groupby('lecture_id').tail(5)


Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question
...,...,...,...,...
413,32535,8,5,solving question
414,32570,113,3,solving question
415,32604,24,6,concept
416,32625,142,2,concept


--------------------------------------------------------------------------------------------------------------------
## train

In [28]:
%time
if OFFLINE:
    feld_needed = ['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id', 'task_container_id',
                   'user_answer', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
    
    if CV5:
#             val_size = 250000
        val_size = 2500000
        train = dt.fread(DATA_PATH+file_train, max_nrows=None,columns=feld_needed).to_pandas()
        valid_split1 = train.groupby('user_id').tail(5)  
        train_split1 = train[~train.row_id.isin(valid_split1.row_id)]
        valid_split1 = valid_split1[valid_split1.content_type_id == 0]
        train_split1 = train_split1[train_split1.content_type_id == 0]
        print(f'{train_split1.answered_correctly.mean():.3f} {valid_split1.answered_correctly.mean():.3f}')
        
        max_timestamp_u = train[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()  # 得到每个用户最大的时间戳
        max_timestamp_u.columns = ['user_id', 'max_time_stamp'] 
        MAX_TIME_STAMP = max_timestamp_u.max_time_stamp.max()  # 所有用户最大时间  因为是时间戳
        # (MAX_TIME_STAMP for all users) - (max_time_stamp for each user)  所有用户的最大时间减去一个用户的最大时间就是这个用户开始的时间戳
        def rand_time(max_time_stamp):
            interval = MAX_TIME_STAMP - max_time_stamp
            rand_time_stamp = random.randint(0,interval)
            return rand_time_stamp
        # 由于训练数据和测试数据是按时间拆分的，因此验证数据也应该按时间拆分。但是，给定的时间戳是自用户的第一个事件以来经过的时间，而不是实际时间。因此，我在一定间隔内为每个用户设置了随机的首次访问时间。
        max_timestamp_u['rand_time_stamp'] = max_timestamp_u.max_time_stamp.apply(rand_time)
        train = train.merge(max_timestamp_u, on='user_id', how='left')
        train['viretual_time_stamp'] = train.timestamp + train['rand_time_stamp']
        
        train = train.sort_values(['viretual_time_stamp', 'row_id']).reset_index(drop=True)
        #现在我们已经按viretual_time_amp对数据帧进行了排序，我们可以轻松地按时间拆分数据帧。
        
        for cv in range(5):
            valid = train[-val_size:]
            train = train[:-val_size]
            # check new users and new contents
            new_users = len(valid[~valid.user_id.isin(train.user_id)].user_id.unique())
            valid_question = valid[valid.content_type_id == 0]
            train_question = train[train.content_type_id == 0]
            new_contents = len(valid_question[~valid_question.content_id.isin(train_question.content_id)].content_id.unique())    
            print(f'cv{cv} {train_question.answered_correctly.mean():.3f} {valid_question.answered_correctly.mean():.3f} {new_users} {new_contents}')
            valid.to_pickle(f'cv{cv+1}_valid.pickle')
            train.to_pickle(f'cv{cv+1}_train.pickle')
            
    train = pd.read_pickle(f'./cv1_train.pickle')[feld_needed]
    valid = pd.read_pickle(f'./cv1_valid.pickle')[feld_needed]
    print(f'Train|Valid: {len(train)}|{len(valid)}')
    
    '''
    Make feat for valid:
    * ques: quest_train
    * lect: lect_train

    Make feat for test:
    * ques: quest_train/quest_valid
    * lect: lect_train/lect_valid
    ''' 
    
    # 训练集验证集都取出 问题的数据的这三列
    ques_train = train.loc[train.content_type_id == False, ['row_id','content_id','answered_correctly']].reset_index(drop=True)
    ques_valid = valid.loc[valid.content_type_id == False, ['row_id','content_id','answered_correctly']].reset_index(drop=True) 
    # 对他们进行量化
    ques_train = reduce_mem_usage(ques_train, verbose=True)
    ques_valid = reduce_mem_usage(ques_valid, verbose=True)
    # 对验证集和训练集 的 用户在回答上一个问题包(忽略其间的任何讲座)后是否看到了解释和正确的回答 做填充
    train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(False).astype('int8') 
    valid['prior_question_had_explanation'] = valid['prior_question_had_explanation'].fillna(False).astype('int8')
    
    # 对用户对问题包答题的nan进行均值填充
    prior_question_elapsed_time_mean = train.loc[train.content_type_id == False].prior_question_elapsed_time.dropna().values.mean()
    train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean).astype(np.int32)
    valid['prior_question_elapsed_time'] = valid['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean).astype(np.int32)
    train = reduce_mem_usage(train, verbose=True)
    valid = reduce_mem_usage(valid, verbose=True)
    
    save_pickle(prior_question_elapsed_time_mean, f'./prior_question_elapsed_time_mean.pkl')
    config_dict = {}
else:
    config_dict = load_pickle(config_file)
    prior_question_elapsed_time_mean = load_pickle(f'{CACHE_PATH}/prior_question_elapsed_time_mean.pkl')
    

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 12.6 µs
Train|Valid: 98730332|2500000
Memory usage after optimization is :646.33 MB
Decreased by 41.666662%
Memory usage after optimization is :16.38 MB
Decreased by 41.666486%
Memory usage after optimization is :2636.38 MB
Decreased by 26.315789%
Memory usage after optimization is :66.76 MB
Decreased by 26.315753%


In [27]:
# NOTE(review): `content_feat` is first assigned further down in this file (OFFLINE branch of the
# content-feature cell), so this cell fails on a fresh kernel unless that cell has run first.
len(content_feat)

13523

------------------------------------------------------------------------------------------------------------------------
## Content static Feat

In [29]:
def make_content_feat(df, type):
    """Build per-question aggregate features and cache them on disk.

    Only rows with ``content_type_id == False`` are used. For every
    ``content_id`` the result holds:

    * ``content_target_mean`` - mean of ``answered_correctly``
    * ``content_cnt`` - number of rows (count of ``user_id``)

    The frame is pickled to ``./content_feat_<type>.pkl`` before its dtypes are
    downcast, so the file keeps the un-reduced dtypes.

    Args:
        df: Interaction frame with ``content_type_id``, ``content_id``,
            ``answered_correctly`` and ``user_id`` columns.
        type: Suffix used in the cache file name (e.g. 'train' or 'test').

    Returns:
        The feature frame after ``reduce_mem_usage``.
    """
    question_rows = df.loc[df.content_type_id == False].reset_index(drop=True)

    feat_df = question_rows.groupby('content_id', as_index=False).agg(
        content_target_mean=('answered_correctly', 'mean'),
        content_cnt=('user_id', 'count'),
    )

    save_pickle(feat_df, save_path=f'./content_feat_{type}.pkl')
    return reduce_mem_usage(feat_df, verbose=True)

In [30]:
if OFFLINE:
    # make_content_feat filters its argument into a new frame and never writes to
    # it, so `train` is passed directly; a deep copy of the full frame would only
    # cost memory.
    content_feat = make_content_feat(df=train, type='train')
    content_feat_test = make_content_feat(df=pd.concat([train, valid]), type='test')
    print('content_feat:\n',content_feat.head())
else:
    content_feat_test = load_pickle(f'./content_feat_test.pkl')
    content_feat_test = reduce_mem_usage(content_feat_test, verbose=True)

# content_id -> mean accuracy, taken from the train+valid ("test") feature frame.
content_target_mean_dict = dict(zip(content_feat_test.content_id.values,
                                    content_feat_test.content_target_mean.values))
# Every feature column except the key.
content_feat_cols = [col for col in content_feat_test if col != 'content_id']

Memory usage after optimization is :0.21 MB
Decreased by 50.000000%
Memory usage after optimization is :0.21 MB
Decreased by 50.000000%
content_feat:
    content_id  content_target_mean  content_cnt
0           0             0.907227         6777
1           1             0.890625         7265
2           2             0.554199        43781
3           3             0.779297        22451
4           4             0.613770        30881


------------------------------------------------------------------------------------------------------------------------
## Part Feat

In [31]:
def make_part_mean_dict(df):
    """Return a dict mapping each question part to its mean ``answered_correctly``.

    Only rows with ``content_type_id == False`` are used; the part of each row
    is looked up by left-joining ``content_id`` against
    ``questions_df.question_id``.

    Args:
        df: Frame with ``content_id``, ``content_type_id`` and
            ``answered_correctly`` columns.

    Returns:
        dict: ``{part: mean answered_correctly}``.
    """
    question_rows = df.loc[df.content_type_id == False].reset_index(drop=True)
    with_part = question_rows.merge(
        questions_df[['question_id', 'part']],
        how='left',
        left_on='content_id',
        right_on='question_id',
    )

    part_means = with_part.groupby('part', as_index=False)['answered_correctly'].mean()
    part_means = part_means.rename(columns={'answered_correctly': 'part_target_mean'})
    return dict(zip(part_means.part.values, part_means.part_target_mean.values))

if OFFLINE:
    part_target_mean_dict = make_part_mean_dict(df=pd.concat([train[['content_id', 'content_type_id', 'answered_correctly']],
                                                              valid[['content_id', 'content_type_id', 'answered_correctly']]]))
    save_pickle(part_target_mean_dict, f'{CACHE_PATH}/part_target_mean_dict.pkl')
else:
    # The OFFLINE branch writes this pickle under CACHE_PATH, whereas this branch
    # used to read it only from './'. Keep './' as the first choice and fall back
    # to CACHE_PATH so the file produced by the OFFLINE run is found as well.
    part_dict_path = './part_target_mean_dict.pkl'
    if not os.path.exists(part_dict_path):
        part_dict_path = f'{CACHE_PATH}/part_target_mean_dict.pkl'
    part_target_mean_dict = load_pickle(part_dict_path)

# Attach the mean accuracy of each question's part (raises KeyError for a part missing from the dict).
questions_df['part_target_mean'] = questions_df['part'].apply(lambda x: part_target_mean_dict[x])

------------------------------------------------------------------------------------------------------------------------
## Id static feat

In [32]:
# Columns taken as-is from the question rows (after joining the part).
static_feat_cols = ['part','prior_question_elapsed_time']

def get_stat_feat(df,feat_cols):
    """Return ``feat_cols`` for the question rows of ``df``.

    Rows with ``content_type_id == False`` are kept and left-joined with
    ``questions_df`` on ``content_id == question_id`` so that ``part`` is
    available.

    Args:
        df: Interaction frame with ``content_type_id`` and ``content_id`` columns.
        feat_cols: Names of the columns to return.

    Returns:
        A new frame holding only ``feat_cols``.
    """
    question_rows = df.loc[df.content_type_id == False].reset_index(drop=True)
    with_part = question_rows.merge(
        questions_df[['question_id', 'part']],
        how='left',
        left_on='content_id',
        right_on='question_id',
    )
    return with_part[feat_cols]

if OFFLINE:
    # get_stat_feat builds and returns a new frame without writing to its input,
    # so train/valid are passed directly instead of deep-copying them first.
    state_feat_train = get_stat_feat(df=train, feat_cols=static_feat_cols)
    state_feat_valid = get_stat_feat(df=valid, feat_cols=static_feat_cols)
    state_feat_train = reduce_mem_usage(state_feat_train, verbose=True)
    state_feat_valid = reduce_mem_usage(state_feat_valid, verbose=True)

Memory usage after optimization is :1200.32 MB
Decreased by 35.000000%
Memory usage after optimization is :30.42 MB
Decreased by 35.000000%


In [33]:
# Row counts of the question-only feature frames, then the size of the full train frame.
print(f'{len(state_feat_train)} {len(state_feat_valid)}')
len(train)

96817414 2453886


98730332

------------------------------------------------------------------------------------------------------------------------
## User Loop Feat

In [34]:
window_size = 25
if not OFFLINE:
    # Restore the per-user state cached under CACHE_PATH.
    user_content_feat_df = reduce_mem_usage(pd.read_pickle(f'{CACHE_PATH}/user_content_feat.pkl'))
    # These two start empty here; user_content_feat_df is loaded instead.
    user_content_cnt_dict = defaultdict(int)
    user_content_pos_cnt_dict = defaultdict(int)

    # counters and running sums
    user_cnt_dict = load_pickle(f'{CACHE_PATH}/user_cnt_dict.pkl')
    user_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_pos_cnt_dict.pkl')
    user_part_cnt_dict = load_pickle(f'{CACHE_PATH}/user_part_cnt_dict.pkl')
    user_part_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_part_pos_cnt_dict.pkl')
    # user_content_cnt_dict = load_pickle(f'{CACHE_PATH}/user_content_cnt_dict.pkl')
    user_content_redo_cnt_dict = load_pickle(f'{CACHE_PATH}/user_content_redo_cnt_dict.pkl')
    user_content_mean_sum_dict = load_pickle(f'{CACHE_PATH}/user_content_mean_sum_dict.pkl')
    user_consecutive_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_consecutive_pos_cnt_dict.pkl')
    user_target_win25_dict = load_pickle(f'{CACHE_PATH}/user_target_win25_dict.pkl')
    user_content_mean_win10_dict = load_pickle(f'{CACHE_PATH}/user_content_mean_win10_dict.pkl')

    # explanation / timing state
    user_explanation_cnt_dict = load_pickle(f'{CACHE_PATH}/user_explanation_cnt_dict.pkl')
    user_explanation_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_explanation_pos_cnt_dict.pkl')
    user_elapse_time_sum_dict = load_pickle(f'{CACHE_PATH}/user_elapse_time_sum_dict.pkl')
    user_elapse_time_win10_dict = load_pickle(f'{CACHE_PATH}/user_elapse_time_win10_dict.pkl')
    user_last_timestamp_dict = load_pickle(f'{CACHE_PATH}/user_last_timestamp_dict.pkl')
    user_last_task_dict = load_pickle(f'{CACHE_PATH}/user_last_task_dict.pkl')
    user_content_win5_dict = load_pickle(f'{CACHE_PATH}/user_content_win5_dict.pkl')
    user_part_win10_dict = load_pickle(f'{CACHE_PATH}/user_part_win10_dict.pkl')

    # bundle / session state
    bundle_state_dict = load_pickle(f'{CACHE_PATH}/bundle_state_dict.pkl')
    # user_order_in_session_dict = load_pickle(f'{CACHE_PATH}/user_order_in_session_dict.pkl')
    user_cum_time_dict = load_pickle(f'{CACHE_PATH}/user_cum_time_dict.pkl')
    user_timespan_win10_dict = load_pickle(f'{CACHE_PATH}/user_timespan_win10_dict.pkl')

    # tag state
    user_tags_cnt_dict = load_pickle(f'{CACHE_PATH}/user_tags_cnt_dict.pkl')
    user_tags_pos_cnt_dict = load_pickle(f'{CACHE_PATH}/user_tags_pos_cnt_dict.pkl')

    user_continue_quest_cnt_dict = load_pickle(f'{CACHE_PATH}/user_continue_quest_cnt_dict.pkl')
else:
    # Start from empty state. A defaultdict returns a default value instead of
    # raising KeyError for a missing key: 0 for int, [] for list.
    user_cnt_dict = defaultdict(int)
    user_pos_cnt_dict = defaultdict(int)
    user_part_cnt_dict = defaultdict(int)
    user_part_pos_cnt_dict = defaultdict(int)
    user_content_cnt_dict = defaultdict(int)
    user_content_pos_cnt_dict = defaultdict(int)
    user_content_redo_cnt_dict = defaultdict(int)
    user_content_mean_sum_dict = defaultdict(int)
    user_consecutive_pos_cnt_dict = defaultdict(int)
    user_target_win25_dict = defaultdict(list)
    user_content_mean_win10_dict = defaultdict(list)

    user_explanation_cnt_dict = defaultdict(int)
    user_explanation_pos_cnt_dict = defaultdict(int)
    user_elapse_time_sum_dict = defaultdict(int)
    user_elapse_time_win10_dict = defaultdict(list)
    user_last_timestamp_dict = defaultdict(int)
    user_last_task_dict = defaultdict(int)
    user_content_win5_dict = defaultdict(list)
    user_part_win10_dict = defaultdict(list)

    bundle_state_dict = defaultdict(list)  # bundle_id, time_diff
    # user_order_in_session_dict = defaultdict(int)
    user_cum_time_dict = defaultdict(int)
    user_timespan_win10_dict = defaultdict(list)

    user_tags_cnt_dict = defaultdict(int)
    user_tags_pos_cnt_dict = defaultdict(int)

    user_continue_quest_cnt_dict = defaultdict(int)

In [35]:
# Columns read from each row by make_user_loop_feature; the order here must match the order in
# which its loop unpacks the values of df[used_cols].
used_cols = ['user_id', 'content_id', 'task_container_id', 'answered_correctly', 'prior_question_elapsed_time',
             'prior_question_had_explanation', 'content_type_id', 'timestamp']

def make_user_loop_feature(df, content_target_mean_dict,
                          user_cnt_dict, 
                          user_pos_cnt_dict,
                          user_part_cnt_dict,
                          user_part_pos_cnt_dict,
                          user_content_cnt_dict,
                          user_content_pos_cnt_dict,
                          user_content_redo_cnt_dict,
                          user_content_mean_sum_dict,
                          user_consecutive_pos_cnt_dict, 
                          user_target_win25_dict, 
                          user_content_mean_win10_dict,
                          user_explanation_cnt_dict,
                          user_explanation_pos_cnt_dict, 
                          user_elapse_time_sum_dict,
                          user_elapse_time_win10_dict,
                          user_last_timestamp_dict,
                          user_last_task_dict,
                          user_content_win5_dict,
                          user_part_win10_dict,
                          bundle_state_dict,
                          user_cum_time_dict, 
                          user_timespan_win10_dict,# user_order_in_session_dict,
                          user_tags_cnt_dict,
                          user_tags_pos_cnt_dict,
                          user_continue_quest_cnt_dict,
                          update=True, isTrain=True):
    sample_num = len(df.loc[df.content_type_id == False])

    user_cnt_npy = np.zeros(sample_num)
    user_pos_cnt_npy = np.zeros(sample_num)
    user_part_cnt_npy = np.zeros(sample_num)
    user_part_pos_cnt_npy = np.zeros(sample_num)
    user_content_cnt_npy = np.zeros(sample_num)
    user_content_pos_cnt_npy = np.zeros(sample_num)
    user_content_redo_cnt_npy = np.zeros(sample_num)
    user_content_mean_mean_npy = np.zeros(sample_num)
    user_consecutive_pos_cnt_npy = np.zeros(sample_num)
    user_pos_cnt_win25_npy = np.zeros(sample_num)
    user_content_mean_win10_npy = np.zeros(sample_num)

    user_explanation_cnt_npy = np.zeros(sample_num)
    user_explanation_pos_cnt_npy = np.zeros(sample_num)
    user_elapse_time_mean_npy = np.zeros(sample_num)
    user_elapse_time_mean_win10_npy = np.zeros(sample_num)
    user_last_timespan_npy = np.zeros(sample_num)
    user_last_task_diff_npy = np.zeros(sample_num)
    user_content_appear_in_win5_npy = np.zeros(sample_num)
    user_part_cnt_in_win10_npy = np.zeros(sample_num)

    # user_order_in_session_npy = np.zeros(sample_num)
    user_cum_time_npy = np.zeros(sample_num)
    user_timespan_win10_mean_npy = np.zeros(sample_num)

    user_tags_cnt_mean_npy = np.zeros(sample_num)
    user_tags_pos_rate_npy = np.zeros(sample_num)

    user_continue_quest_cnt_npy = np.zeros(sample_num)

    if update:
        tk0 = tqdm(df[used_cols].values)
    else:
        tk0 = df[used_cols].values
    idx = 0
    for(_user_id,_content_id,_task_container_id, _answered_correctly, _prior_question_elapsed_time,
           _prior_question_had_explanation, _content_type_id, _timestamp) in tk0:
        
        if _content_id in content_target_mean_dict:
            _content_target_mean = content_target_mean_dict[_content_id]
        else:
            _content_target_mean = 0
            
        if _content_type_id == False:
            _bundle_id = question_bundle_dict[_content_id]
        else:
            _content_target_mean=0
        