In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

import lightgbm as lgb

import riiideducation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/riiid-train-data-multiple-formats/riiid_train.parquet
/kaggle/input/riiid-train-data-multiple-formats/riiid_train.feather
/kaggle/input/riiid-train-data-multiple-formats/riiid_train.jay
/kaggle/input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip
/kaggle/input/riiid-train-data-multiple-formats/riiid_train.h5
/kaggle/input/riiid-test-answer-prediction/lectures.csv
/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv
/kaggle/input/riiid-test-answer-prediction/example_test.csv
/kaggle/input/riiid-test-answer-prediction/questions.csv
/kaggle/input/riiid-test-answer-prediction/train.csv
/kaggle/input/riiid-test-answer-prediction/riiideducation/__init__.py
/kaggle/input/riiid-test-answer-prediction/riiideducation/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl


# Notebook by Braysen Goodwin

## Heavily based off of: https://www.kaggle.com/erikbruin/riiid-comprehensive-eda-baseline by Erik Bruin

# Read In Data

In [2]:
%%time

train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")

print("Train size:", train.shape)

Train size: (101230332, 10)
CPU times: user 4.45 s, sys: 6.39 s, total: 10.8 s
Wall time: 11.5 s


In [3]:
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False
3,3,131167,115,7860,False,3,0,1,19000.0,False
4,4,137965,115,7922,False,4,1,1,11000.0,False


# Preprocess Data

In [4]:
features = ['content_id', 'timestamp', 'content_type_id', 'task_container_id', 'prior_question_elapsed_time', 'prior_question_had_explanation']

def setupDataframes(dataframe, garbageCollect=False, includeLabels = False):
    """
    normalizes the dataframe by filling the missing values with special values, 
      conveting to the correct type, and only keeping the necessary data.
    
    params:
      dataframe - the pandas dataframe to normalize
      garbageCollect = False - whether to garbage collect after memory intensive operations
      includeLabels - should include also return the labels
    
    returns:
          data
            data - a pandas dataframe with the correct data to run the model on
        or 
          data, labels
            data - a pandas dataframe with the correct data to run the model on
            lables - the target labels for each row
    """
    data = dataframe[features]
    
    data['content_type_id'] = data['content_type_id'].replace(np.nan, -1)
    data['prior_question_elapsed_time'] = data['prior_question_elapsed_time'].replace(np.nan, -1)
    data['prior_question_had_explanation'] = data['prior_question_had_explanation'].replace(np.nan, -1)
    data['prior_question_had_explanation'] = data['prior_question_had_explanation'].apply(lambda x: -1 if x is None else int(x))
    
    if garbageCollect:
        gc.collect()

    data['content_type_id'] = data['content_type_id'].astype('int32')
    data['prior_question_had_explanation'] = data['prior_question_had_explanation'].astype('int32')

    if garbageCollect:
        gc.collect()
    
    if includeLabels:
        return data, dataframe['answered_correctly']
    
    return data
    
    
    
    
    
    

In [5]:
train, labels = setupDataframes(train, garbageCollect=True, includeLabels=True)

gc.collect()

train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,content_id,timestamp,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation
0,5692,0,0,1,-1.0,-1
1,5716,56943,0,2,37000.0,0
2,128,118363,0,0,55000.0,0
3,7860,131167,0,3,19000.0,0
4,7922,137965,0,4,11000.0,0


In [6]:
labels.head()

0    1
1    1
2    1
3    1
4    1
Name: answered_correctly, dtype: int8

In [7]:
trainingCount = 90000000

# convert the dataset into an object the model can understand
train_dataset = lgb.Dataset(train[:trainingCount], labels[:trainingCount])
valid_dataset = lgb.Dataset(train[trainingCount:], labels[trainingCount:])

# Make and train the model

In [8]:
%%time
model = lgb.train(
    {'objective': 'binary', 'metric': 'auc'}, 
    train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=8
)

Training until validation scores don't improve for 8 rounds
[50]	training's auc: 0.631767	valid_1's auc: 0.632393
[100]	training's auc: 0.636182	valid_1's auc: 0.636819
[150]	training's auc: 0.638052	valid_1's auc: 0.638594
[200]	training's auc: 0.639169	valid_1's auc: 0.639655
[250]	training's auc: 0.639833	valid_1's auc: 0.640241
[300]	training's auc: 0.640275	valid_1's auc: 0.640573
[350]	training's auc: 0.64064	valid_1's auc: 0.640872
[400]	training's auc: 0.640948	valid_1's auc: 0.641122
[450]	training's auc: 0.641151	valid_1's auc: 0.641289
[500]	training's auc: 0.641393	valid_1's auc: 0.641433
[550]	training's auc: 0.641564	valid_1's auc: 0.641521
[600]	training's auc: 0.641719	valid_1's auc: 0.641609
Early stopping, best iteration is:
[622]	training's auc: 0.641779	valid_1's auc: 0.641624
CPU times: user 10h 51min 41s, sys: 7min 16s, total: 10h 58min 57s
Wall time: 3h 50min 54s


In [9]:
gc.collect()

562

# Create Submission

In [10]:
env = riiideducation.make_env()

In [11]:
for (test_df, sample_prediction_df) in env.iter_test():
    tesdata = setupDataframes(test_df)
    test_df['answered_correctly'] =  model.predict(tesdata[features])
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user