In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/lgbmodelno-part/model.txt


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import gc
from collections import Counter 
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
dir_path = '/kaggle/input/riiid-test-answer-prediction/'
file_train = 'train.csv'
file_questions = 'questions.csv'
file_lectures = 'lectures.csv'

In [4]:
nrows = 200 * 10000
# nrows = None  #选所有数据

In [5]:
train = pd.read_csv(
                    dir_path + file_train, 
                    nrows=nrows, 
                    usecols=['row_id', 'timestamp', 'user_id', 'content_id', 
                             'content_type_id', 'task_container_id', 'answered_correctly',
                            'prior_question_elapsed_time','prior_question_had_explanation'],
                    dtype={
                            'row_id': 'int64',
                            'timestamp': 'int64',
                            'user_id': 'int32',
                            'content_id': 'int16',
                            'content_type_id': 'int8',
                            'task_container_id': 'int8',
                            'answered_correctly': 'int8',
                            'prior_question_elapsed_time': 'float32',
                            'prior_question_had_explanation': 'str'
                        }
                   )

In [6]:
lectures = pd.read_csv(
                       dir_path + file_lectures, 
                       usecols=['lecture_id','tag','part','type_of'], 
                       nrows=nrows,
                       dtype={
                           'lecture_id': 'int16',
                           'tag': 'int16',
                           'part': 'int8',
                           'type_of': 'str'
                       }
                    )

In [7]:
questions = pd.read_csv(
                        dir_path + file_questions, 
                        nrows=nrows,
                        usecols=['question_id','bundle_id','part','tags'], 
                        dtype={
                           'question_id': 'int16',
                           'bundle_id': 'int16',
                           'part': 'int8',
                           'tags': 'str'
                       }
                    )

In [8]:
# 数据处理
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].map({'True':1,'False':0}).fillna(0).astype(np.int8)
prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().values.mean()
train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)

lectures['type_of'] = lectures['type_of'].map({'concept':0, 'intention':1, 'solving question':2, 'starter':3})

questions['tags'] = questions['tags'].fillna('1')

In [9]:
# # 压缩内存, 每个分组取最后1000条数据
# max_num = 1000
# train = train.groupby(['user_id']).tail(max_num)

In [10]:
# 切分数据
train_lectures = train[train['content_type_id']==1] 
train_questions = train[train['content_type_id']==0]
del train
gc.collect()

0

In [11]:
# 关联数据
train_lectures_info = pd.merge(
        left=train_lectures,
        right=lectures,
        how='left',
        left_on='content_id',
        right_on='lecture_id'
        )

train_questions_info = pd.merge(
        left=train_questions,
        right=questions,
        how='left',
        left_on='content_id',
        right_on='question_id'
        )

In [12]:
del train_lectures
del train_questions
gc.collect()

40

In [13]:
# 提取特征函数
# 文献课程类函数
def get_lecture_basic_features__user_by_part_typeof(train_lectures_info):
    gb_columns = ['user_id']
    
    train_lectures_info = pd.get_dummies(train_lectures_info, columns=['part', 'type_of'])
    train_lectures_info.columns = train_lectures_info.columns.map(lambda column: column.replace('part', 'lectures_part'))
    train_lectures_info.columns = train_lectures_info.columns.map(lambda column: column.replace('type', 'lectures_type'))
    
    part_lectures_columns = [column for column in train_lectures_info.columns if column.startswith('lectures_part')]
    types_of_lectures_columns = [column for column in train_lectures_info.columns if column.startswith('lectures_type')]
    
    agg_func =  ['sum']
    columns = part_lectures_columns + types_of_lectures_columns
    train_lectures_info__user_by_part_typeof_f = train_lectures_info.\
                                groupby(gb_columns)[part_lectures_columns + types_of_lectures_columns].\
                                agg(agg_func).\
                                reset_index()
    
    train_lectures_info__user_by_part_typeof_f.columns = gb_columns + columns
    dtypes = {}
    for col in columns:
        dtypes[col] = 'int8'
        
    train_lectures_info__user_by_part_typeof_f = train_lectures_info__user_by_part_typeof_f.astype(dtypes)
    
    return train_lectures_info__user_by_part_typeof_f



# 问答类函数
def get_questions_basic_features__user(train_questions_info):
    gb_columns = ['user_id']
    gb_suffixes = 'question_'+'_'.join(gb_columns)
    train_questions_info['lag'] = train_questions_info.groupby('user_id')['answered_correctly'].shift()
    agg_func = {
        'answered_correctly': ['mean','count','std','sum'], #每个用户的平均精度mean, 回答了多少问题 

        #'task_container_id': [lambda x: len(set(x))],

        'prior_question_elapsed_time': ['mean'], 

        'prior_question_had_explanation': ['mean'],#每个用户都有解释的先前问题的比例
        
        'lag': ['cumsum', 'cumcount']

        # part 展开
        #'part': [lambda x: len(set(x))],
        #'tags': [lambda x: len(set(x))],
    }
    columns = [
               gb_suffixes+'_answered_correctly_mean',
               gb_suffixes+'_answered_correctly_count',
               gb_suffixes+'_answered_correctly_std',
               gb_suffixes+'_answered_correctly_sum',

               #gb_suffixes+'_unique_task_container_id',
        
               gb_suffixes+'_prior_question_elapsed_time_mean',

               gb_suffixes+'_prior_question_had_explanation_mean',
        
               gb_suffixes+'_lag_cumsum',
               gb_suffixes+'_lag_cumcount',

               #gb_suffixes+'_unique_part',
               #gb_suffixes+'_unique_tags',
              ]

    train_questions_info__user_f = train_questions_info.\
                                    groupby(gb_columns).\
                                    agg(agg_func).\
                                    reset_index()
    train_questions_info__user_f.columns = gb_columns + columns  
    train_questions_info__user_f[gb_suffixes+'_user_correctness'] = train_questions_info__user_f[gb_suffixes+'_lag_cumsum'] / train_questions_info__user_f[gb_suffixes+'_lag_cumcount']
    train_questions_info.drop(['lag'], axis=1, inplace=True)
    
    dtypes = {
        gb_suffixes+'_answered_correctly_mean': 'float32',
        gb_suffixes+'_answered_correctly_count': 'int16',
        gb_suffixes+'_answered_correctly_std': 'float32',
        gb_suffixes+'_answered_correctly_sum': 'int16',
        gb_suffixes+'_prior_question_elapsed_time_mean': 'float32',
        gb_suffixes+'_prior_question_had_explanation_mean': 'float32',
        gb_suffixes+'_lag_cumsum': 'int16',
        gb_suffixes+'_lag_cumcount':  'int16',
        gb_suffixes+'_user_correctness': 'float32'
        
    }
    train_questions_info__user_f.replace(np.nan, 0, inplace=True) 
    train_questions_info__user_f.replace(np.inf, 0, inplace=True)
    train_questions_info__user_f = train_questions_info__user_f.astype(dtypes)
    
    return train_questions_info__user_f


def get_questions_basic_features__content(train_questions_info):
    gb_columns = ['content_id']
    gb_suffixes = 'question_'+'_'.join(gb_columns)
    agg_func = {
        'answered_correctly': ['mean','count','std', 'sum'], # 每个问题正确回答的比例mean,回答总次数count,标准差std,回答正确的个数sum

        'user_id': ['count','nunique'], #每个问题有多少个用户回答 size计数时包含NaN值

        'prior_question_had_explanation': ['mean'] # 每个问题都有解释的先验问题的比例
    }
    columns = [
               gb_suffixes+'_answered_correctly_mean',
               gb_suffixes+'_answered_correctly_count',
               gb_suffixes+'_answered_correctly_std',
               gb_suffixes+'_answered_correctly_sum',

               gb_suffixes+'_user_id_count', 
               gb_suffixes+'_user_id_nunique', 

               gb_suffixes+'_prior_question_had_explanation_mean'
              ]    
    
    train_questions_info__user_content_f = train_questions_info.\
                                    groupby(gb_columns).\
                                    agg(agg_func).\
                                    reset_index()
    train_questions_info__user_content_f.columns = gb_columns + columns
    
    train_questions_info__user_content_f[gb_suffixes+'_avg_questions'] = train_questions_info__user_content_f[gb_suffixes+'_user_id_count']/train_questions_info__user_content_f[gb_suffixes+'_user_id_nunique']
    dtypes = {
        gb_suffixes+'_answered_correctly_mean': 'float32',
        gb_suffixes+'_answered_correctly_count': 'int16',
        gb_suffixes+'_answered_correctly_std': 'float32',
        gb_suffixes+'_answered_correctly_sum': 'int16',
        gb_suffixes+'_user_id_count': 'int16',
        gb_suffixes+'_user_id_nunique': 'int16',
        gb_suffixes+'_prior_question_had_explanation_mean': 'float32',
        gb_suffixes+'_avg_questions': 'float32'
        
    }
    train_questions_info__user_content_f = train_questions_info__user_content_f.astype(dtypes)
        
    return train_questions_info__user_content_f

In [14]:
train_lectures_info__user__by_part_typeof_f = get_lecture_basic_features__user_by_part_typeof(train_lectures_info)

In [15]:
train_questions_info__user_f = get_questions_basic_features__user(train_questions_info)

In [16]:
train_questions_info__user_content_f = get_questions_basic_features__content(train_questions_info)

In [17]:
train_questions_info = train_questions_info.merge(train_lectures_info__user__by_part_typeof_f,on=['user_id'],how='left')
train_questions_info = train_questions_info.merge(train_questions_info__user_f,on=['user_id'],how='left')
train_questions_info = train_questions_info.merge(train_questions_info__user_content_f,on=['content_id'],how='left')

In [18]:
# del train_lectures_info__user__by_part_typeof_f
# del train_questions_info__user_f
# del train_questions_info__user_content_f
# gc.collect()

In [19]:
train_questions_info.columns

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'question_id', 'bundle_id', 'part', 'tags', 'lectures_part_1',
       'lectures_part_2', 'lectures_part_3', 'lectures_part_4',
       'lectures_part_5', 'lectures_part_6', 'lectures_part_7',
       'lectures_type_of_0', 'lectures_type_of_1', 'lectures_type_of_2',
       'question_user_id_answered_correctly_mean',
       'question_user_id_answered_correctly_count',
       'question_user_id_answered_correctly_std',
       'question_user_id_answered_correctly_sum',
       'question_user_id_prior_question_elapsed_time_mean',
       'question_user_id_prior_question_had_explanation_mean',
       'question_user_id_lag_cumsum', 'question_user_id_lag_cumcount',
       'question_user_id_user_correctness',
       'question_content_id_answered_correctly_mean',
       'question_content_id_answered_c

In [20]:
# 修改
remove_columns = ['row_id','timestamp','user_id','content_type_id','question_content_id_user_id_count',
                  'task_container_id','answered_correctly','question_id', 'bundle_id','tags'] #'part','tags'
features_columns = [c for c in train_questions_info.columns if c not in remove_columns]


In [21]:
valid_data = train_questions_info.groupby('user_id').tail(6)
train_data = train_questions_info[~train_questions_info.index.isin(valid_data.index)]
# train_questions_info = train_questions_info[~train_questions_info.index.isin(valid_data.index)]

In [22]:
# train_data = train_questions_info.groupby('user_id').tail(30)

In [23]:
len(train_data),len(valid_data)

(1914779, 46238)

In [24]:
del train_questions_info
gc.collect()

198

In [25]:
X_test, y_test = valid_data[features_columns], valid_data['answered_correctly']

X_train, y_train = train_data[features_columns], train_data['answered_correctly']

In [26]:
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_bin': 800,
    'num_leaves': 80
}

lgb_train = lgb.Dataset(X_train, y_train, categorical_feature = ['part','prior_question_had_explanation'])
lgb_eval = lgb.Dataset(X_test, y_test, categorical_feature = ['part','prior_question_had_explanation'], reference=lgb_train)

gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=10000,
            valid_sets=lgb_eval,
            early_stopping_rounds=50,
            verbose_eval=50
            )



Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.761274
[100]	valid_0's auc: 0.764414
[150]	valid_0's auc: 0.766214
[200]	valid_0's auc: 0.766969
[250]	valid_0's auc: 0.7671
[300]	valid_0's auc: 0.767245
[350]	valid_0's auc: 0.767288
[400]	valid_0's auc: 0.767352
[450]	valid_0's auc: 0.767429
[500]	valid_0's auc: 0.76741
Early stopping, best iteration is:
[473]	valid_0's auc: 0.767487


In [27]:
gbm_features = pd.DataFrame({'feature_importance':gbm.feature_importance(),'features_columns':features_columns}).sort_values(['feature_importance'],ascending=0)
gbm_features

Unnamed: 0,feature_importance,features_columns
1,3000,prior_question_elapsed_time
14,2805,question_user_id_answered_correctly_mean
23,2603,question_content_id_answered_correctly_mean
19,2286,question_user_id_prior_question_had_explanatio...
29,2169,question_content_id_avg_questions
18,2160,question_user_id_prior_question_elapsed_time_mean
0,2104,content_id
28,1800,question_content_id_prior_question_had_explana...
15,1702,question_user_id_answered_correctly_count
25,1561,question_content_id_answered_correctly_std


In [28]:
# model = lgb.Booster(model_file='/kaggle/input/lgbmodelno-part/model.txt')

In [29]:
import riiideducation
env = riiideducation.make_env()

In [30]:
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    
    test_questions = test_df[test_df['content_type_id']==0]
    test_questions_info = pd.merge(
            left=test_questions,
            right=questions,
            how='left',
            left_on='content_id',
            right_on='question_id'
            )
    
    # 数据处理
    test_questions_info['prior_question_had_explanation'] = test_questions_info['prior_question_had_explanation'].map({'True':1,'False':0}).fillna(0).astype(np.int8)
    prior_question_elapsed_time_mean = test_questions_info['prior_question_elapsed_time'].dropna().values.mean()
    test_questions_info['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)

    test_questions_info = test_questions_info.merge(train_lectures_info__user__by_part_typeof_f,on=['user_id'],how='left')
    test_questions_info = test_questions_info.merge(train_questions_info__user_f,on=['user_id'],how='left')
    test_questions_info = test_questions_info.merge(train_questions_info__user_content_f,on=['content_id'],how='left')


    X_test = test_questions_info[features_columns]
    
    test_questions_info['answered_correctly'] =  gbm.predict(X_test)
    
    env.predict(test_questions_info.loc[test_questions_info['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [31]:
# prior_test_df = None
# for (test_df, sample_prediction_df) in iter_test:
#     if prior_test_df is not None:
#         prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
#         prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
#         user_ids = prior_test_df['user_id'].values
#         content_ids = prior_test_df['content_id'].values
#         targets = prior_test_df[target].values
        
#         for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
#             if user_id in user_agg.index:
#                 user_agg.loc[user_id, 'sum'] += answered_correctly
#                 user_agg.loc[user_id, 'count'] += 1
#             else:
#                 user_agg.loc[user_id] = [answered_correctly, 1]
            
#             if content_id in content_agg.index:
#                 content_agg.loc[content_id, 'sum'] += answered_correctly
#                 content_agg.loc[content_id, 'count'] += 1
#             else:
#                 content_agg.loc[content_id] = [answered_correctly, 1]
                
#     prior_test_df = test_df.copy()
#     test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    
#     test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
#     test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].map({'True':1,'False':0}).fillna(0).astype(np.int8)
#     prior_question_elapsed_time_mean = test_df.prior_question_elapsed_time.dropna().values.mean()
#     test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean, inplace=True)
    
   
    
#     test_df['user_correctness'] = test_df['user_id'].map(user_agg['sum'] / user_agg['count'])
#     test_df['content_count'] = test_df['content_id'].map(content_agg['count']).fillna(1)
#     test_df['content_difficulty'] = test_df['content_id'].map(content_agg['sum'] / content_agg['count']).fillna(0.7)
#     test_df['user_answered_count'] = test_df['user_id'].map(user_agg['count']).fillna(1)
#     test_df['user_answered_correctly_count'] = test_df['user_id'].map(user_agg['sum']).fillna(1)
       
# #     test_df[target] = model.predict(test_df[features])
# #     env.predict(test_df[['row_id', target]])
    
#     test_df['answered_correctly'] = model.predict(test_df[features_columns])
#     env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [32]:
ls

__notebook__.ipynb  submission.csv


In [33]:
test_df[features_columns]

KeyError: "['question_content_id_prior_question_had_explanation_mean', 'lectures_part_6', 'question_user_id_prior_question_elapsed_time_mean', 'lectures_part_7', 'question_content_id_user_id_nunique', 'lectures_part_5', 'lectures_type_of_2', 'part', 'question_content_id_answered_correctly_sum', 'question_user_id_lag_cumcount', 'question_user_id_lag_cumsum', 'question_content_id_avg_questions', 'question_content_id_answered_correctly_count', 'lectures_type_of_0', 'question_content_id_answered_correctly_mean', 'question_user_id_user_correctness', 'lectures_type_of_1', 'lectures_part_1', 'question_user_id_answered_correctly_sum', 'question_user_id_answered_correctly_count', 'lectures_part_2', 'question_content_id_answered_correctly_std', 'question_user_id_prior_question_had_explanation_mean', 'lectures_part_3', 'question_user_id_answered_correctly_std', 'question_user_id_answered_correctly_mean', 'lectures_part_4'] not in index"