In [1]:
#############################################################################
# 0. Libraries

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import pandas_profiling as pp

from tqdm import tqdm
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score 

# import tensorflow as tf
# tf.keras.backend.clear_session()

# physical_devices = tf.config.list_physical_devices('GPU')

# try:
#     tf.config.experimental.set_memory_growth(physical_devices[0], True)
# except:
#     print('Invalid device or cannot modify virtual devices once initialized.')
    
# from tensorflow.keras.utils import Sequence
# from tensorflow.keras import models, layers, regularizers, metrics, losses, optimizers

#############################################################################

In [2]:
#############################################################################
# 1. Load Data

path = '../01_Data/01_Raw/'
path_output = '../01_Data/02_GeneratedData/'


df_train = pd.read_csv(path + 'train.csv', 
                       low_memory=False, 
                       nrows=2*(10**6), 
                       dtype={'row_id': 'int32', 'timestamp': 'int64', 
                               'user_id': 'int32', 'content_id': 'int16', 
                              'content_type_id': 'int8',
                              'task_container_id': 'int16', 'user_answer': 'int8', 
                              'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32', 
                             'prior_question_had_explanation': 'boolean'
                      }
)

df_lectures = pd.read_csv(path + 'lectures.csv')
df_questions = pd.read_csv(path + 'questions.csv')

#############################################################################

In [3]:
#############################################################################
# 2. Global Variables

SEQ_LENGTH = 100

#############################################################################

In [4]:
#############################################################################
# 3. Global Functions

def reduceMemUsage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def scale(x, mean_, std_):
    return (x - mean_) / std_


def unscale(x, mean_, std_):
    return (x * std_) + mean_


def getUserFeatures(d, a, b ,column):
    tmp_ = d[column][d[a] < d[b]]
    mean_, count_, std_ = tmp_.mean(), tmp_.count(), tmp_.std()
    return mean_, count_, std_


#############################################################################

In [5]:
#############################################################################
# 5. Preprocess Tables

# 5.1 Questions

df_questions['tags'][df_questions['tags'].isna()] = '-1'
df_questions['tags'] = df_questions.tags.apply(lambda x: np.array(str(x).split(' '), dtype=np.int16))
df_questions.columns = ['question_' + x if x != 'question_id' else x for x in list(df_questions.columns)]

# 5.2 Train

features_train = ['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id', 'task_container_id', 
                  'answered_correctly', 'user_answer', 'prior_question_elapsed_time', 'prior_question_had_explanation']

df_train = df_train[features_train]
df_train['prior_question_had_explanation'] = df_train['prior_question_had_explanation'].\
                                                        apply(lambda x: 1 if x is True else 0)

df_train['prior_question_elapsed_time'] = df_train['prior_question_elapsed_time'].fillna(0)
df_train = reduceMemUsage(df_train)

df_train['task_container_id'] = (
    df_train
    .groupby('user_id')['task_container_id']
    .transform(lambda x: pd.factorize(x)[0])
    .astype('int16')
)

df_train = df_train[df_train['content_type_id']==0]

df_train['user_row'] = df_train.sort_values(['user_id', 'timestamp'], ascending=[True, True]) \
             .groupby(['user_id']) \
             .cumcount() + 1

df_tmp = pd.DataFrame(df_train.groupby(['user_id']).size())
df_tmp['user_id'] = df_tmp.index
df_tmp = df_tmp.reset_index(drop=True)
df_tmp.columns = ['user_size', 'user_id']

df_train = df_train.merge(df_tmp, how='inner',  on='user_id')

df_train['timestamp'] = (df_train['timestamp']/1_000).astype(int)
df_train['prior_question_elapsed_time'] = (df_train['prior_question_elapsed_time']/1_000).astype(int)
df_train = reduceMemUsage(df_train)

#############################################################################

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Memory usage after optimization is: 53.41 MB
Decreased by 20.0%
Memory usage after optimization is: 63.59 MB
Decreased by 29.2%


In [6]:
#############################################################################
# 6. Feature engineering

# Questions

df_features_content = df_train.groupby(['content_id']).agg({
   'answered_correctly': ['mean', 'count', 'std'] 
})

df_features_content.columns = ['mean_accuracy', 'question_asked', 'std_accuracy']
df_features_content['question_asked'] = np.log1p(df_features_content['question_asked'])
df_features_content['content_id'] = df_features_content.index

# Users 

df_user_content = df_train.groupby(['user_id']).apply(getUserFeatures, a='user_row', b='user_size', 
                                                      column='answered_correctly').reset_index()

df_user_content['mean_accuracy'] = df_user_content[0].apply(lambda x: x[0])
df_user_content['question_asked'] = df_user_content[0].apply(lambda x: np.log1p(x[1]))
df_user_content['std_accuracy'] = df_user_content[0].apply(lambda x: x[2])
df_user_content = df_user_content.drop([0], axis=1)

# df_user_content = df_train.groupby(['user_id']).agg({
#    'answered_correctly': ['mean', 'count', 'std'] 
# })

# df_user_content.columns = ['mean_accuracy', 'question_asked', 'std_accuracy']
# df_user_content[df_user_content.index.isin([115, 124])]
# df_user_content.iloc[[115, 224], :]

# Merge all

df_train = df_train.merge(df_questions, 
                          left_on=['content_id'],
                          right_on=['question_id'],
                          how='inner')

df_train = df_train.drop(['content_id', 'row_id', 'question_bundle_id'], axis=1)
df_train = df_train.sort_values(['user_id', 'timestamp']).reset_index(drop=True)

df_train = reduceMemUsage(df_train)

# Dicts 

unique_users = set(df_train['user_id'].unique())
dict_users = {v: k for k, v in enumerate(unique_users)}
dict_users_inv ={k: v for k, v in enumerate(unique_users)}

# Filter

df_train['keep'] = df_train.apply(lambda x: 0 if x['user_row']<(x['user_size']-(SEQ_LENGTH+1)) else 1, axis=1)
print(f'Before filtering: {df_train.shape[0]} rows')
df_train = df_train[df_train['keep']==1]
print(f'After filtering: {df_train.shape[0]} rows')

#############################################################################

Memory usage after optimization is: 59.85 MB
Decreased by 38.5%
Before filtering: 1961017 rows
After filtering: 436408 rows


In [7]:
#############################################################################
# 7. Build Dataset

# user_window_past_questions - Past ids answered
# user_window_past_answered_correct - Past target
# user_window_elapsed_time - Elapsed time solving last question (0-300 secs)
# user_window_lag_time - Lag time between questions (0-1440 mins)

X_past_questions = np.full((len(unique_users), SEQ_LENGTH, 3+7), -1)
X_past_responses = np.full((len(unique_users), SEQ_LENGTH, 2), -1)
X_past_times = np.full((len(unique_users), SEQ_LENGTH, 2), -1)

X_agg_content_features = np.full((len(unique_users), 3), -1)
X_agg_user_features = np.full((len(unique_users), 3), -1)

X_curr_question_ids = np.full((len(unique_users), 2), -1)
X_curr_question_tags = np.full((len(unique_users), 7), -1)
X_target = np.full((len(unique_users), 1) ,-1)

for i, user in tqdm(enumerate(unique_users), total=len(unique_users), position=0):
    user_window = df_train[df_train['user_id']==user]
    pos = dict_users[user]
    break
    ## 7.1 Past Features
    
    user_window_past_questions_ids = user_window['question_id'].values[(-SEQ_LENGTH-1):-1].astype(np.int32)
    user_window_past_questions_parts = user_window['question_part'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_past_questions_correct_answer = user_window['question_correct_answer'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_past_questions_tags = user_window['question_tags'].values[(-SEQ_LENGTH-1):-1]
    
    user_window_elapsed_time = user_window['prior_question_elapsed_time'].shift(-1).values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_elapsed_time = np.where(user_window_elapsed_time > 300, 300, user_window_elapsed_time)
    user_window_lag_time = ((user_window['timestamp'] - user_window['timestamp'].shift(1).fillna(0))/60).values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_lag_time = np.where(user_window_lag_time > 1440, 1440, user_window_lag_time)
    
    user_window_past_answered_correct = user_window['answered_correctly'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_past_answered_parts = user_window['user_answer'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    
    X_past_questions[pos, :user_window_past_questions_ids.shape[0], 0] = user_window_past_questions_ids
    X_past_questions[pos, :user_window_past_questions_parts.shape[0], 1] = user_window_past_questions_parts
    X_past_questions[pos, :user_window_past_questions_correct_answer.shape[0], 2] = user_window_past_questions_correct_answer
    
    for i in range(user_window_past_questions_tags.shape[0]):
        values = user_window_past_questions_tags
        X_past_questions[pos, i, 3:(3+user_window_past_questions_tags[i].shape[0])] = user_window_past_questions_tags[i]
    
    X_past_responses[pos, :user_window_past_answered_correct.shape[0], 0] = user_window_past_answered_correct
    X_past_responses[pos, :user_window_past_answered_parts.shape[0], 1] = user_window_past_answered_parts
    
    X_past_times[pos, :user_window_elapsed_time.shape[0], 0] = user_window_elapsed_time
    X_past_times[pos, :user_window_lag_time.shape[0], 1] = user_window_lag_time
    
    ## 7.2 Current questions ids & tags
    
    q_id = user_window['question_id'].values[0]
    X_curr_question_ids[pos, 0] = q_id.astype(np.int32)
    X_curr_question_ids[pos, 1] = user_window['question_part'].values[0].astype(np.int16)
    
    tags_ = user_window['question_tags'].values[0]
    X_curr_question_tags[pos, :tags_.shape[0]] = tags_
    
    ## 7.3 Current agg features
    
    X_agg_user_features[pos, :] = df_user_content[df_user_content['user_id']==user].values[0, 1:]
    X_agg_content_features[pos, :] = df_features_content[df_features_content['content_id']==q_id].values[0, :3]
    
    ## 7.4 Current target
    
    X_target[pos] = user_window['answered_correctly'].values[0].astype(np.int16)
    
    
# np.save(path_output + 'train/' + 'X_past_questions.npy', X_past_questions)
# np.save(path_output + 'train/' + 'X_past_responses.npy', X_past_responses)
# np.save(path_output + 'train/' + 'X_past_times.npy', X_past_times)
# np.save(path_output + 'train/' + 'X_agg_content_features.npy', X_agg_content_features)
# np.save(path_output + 'train/' + 'X_agg_user_features.npy', X_agg_user_features)
# np.save(path_output + 'train/' + 'X_curr_question_ids.npy', X_curr_question_ids)
# np.save(path_output + 'train/' + 'X_curr_question_tags.npy', X_curr_question_tags)
# np.save(path_output + 'train/' + 'X_target.npy', X_target)

# np.save(path_output + 'train/' + 'unique_users.npy', unique_users)
# np.save(path_output + 'train/' + 'dict_users.npy', dict_users)
# np.save(path_output + 'train/' + 'dict_users_inv.npy', dict_users_inv)


#############################################################################

  0%|                                                                                         | 0/7712 [00:00<?, ?it/s]


In [11]:
user_window['prior_question_elapsed_time'].shift(-1).values[(-SEQ_LENGTH-1):-1].astype(np.int16)

array([21, 21, 25, 15, 15, 15, 18, 18, 18, 18, 18, 18, 14, 14, 14, 16, 31,
       13, 11, 11, 18, 36, 36, 36, 36, 20, 20, 20, 20], dtype=int16)

In [8]:
user_window

Unnamed: 0,timestamp,user_id,content_type_id,task_container_id,answered_correctly,user_answer,prior_question_elapsed_time,prior_question_had_explanation,user_row,user_size,question_id,question_correct_answer,question_part,question_tags,keep
1552550,0,33718282,0,0,1,0,0,0,1,30,7900,0,1,"[131, 93, 81]",1
1552551,23,33718282,0,1,0,2,21,0,2,30,7876,3,1,"[10, 94, 92]",1
1552552,51,33718282,0,2,0,1,21,0,3,30,175,2,1,"[9, 10, 92]",1
1552553,69,33718282,0,3,1,3,25,0,4,30,1278,3,2,"[143, 140, 81, 29]",1
1552554,125,33718282,0,4,1,2,15,0,6,30,2065,2,3,"[136, 162, 92, 29]",1
1552555,125,33718282,0,4,0,2,15,0,5,30,2064,1,3,"[157, 92, 29]",1
1552556,125,33718282,0,4,0,1,15,0,7,30,2063,0,3,"[136, 92, 29]",1
1552557,183,33718282,0,5,0,1,18,0,9,30,3363,3,4,"[74, 103, 29]",1
1552558,183,33718282,0,5,1,2,18,0,10,30,3365,2,4,"[136, 103, 29]",1
1552559,183,33718282,0,5,1,1,18,0,8,30,3364,1,4,"[136, 103, 29]",1


In [25]:
#############################################################################
# 7. Build Dataset

# user_window_past_questions - Past ids answered
# user_window_past_answered_correct - Past target
# user_window_elapsed_time - Elapsed time solving last question (0-300 secs)
# user_window_lag_time - Lag time between questions (0-1440 mins)

X_past_questions = np.full((len(unique_users), SEQ_LENGTH, 3+7), -1)
X_past_responses = np.full((len(unique_users), SEQ_LENGTH, 2), -1)
X_past_times = np.full((len(unique_users), SEQ_LENGTH, 2), -1)

X_agg_content_features = np.full((len(unique_users), 3), -1)
X_agg_user_features = np.full((len(unique_users), 3), -1)

X_curr_question_ids = np.full((len(unique_users), 2), -1)
X_curr_question_tags = np.full((len(unique_users), 7), -1)


for i, user in tqdm(enumerate(unique_users), total=len(unique_users), position=0):
    user_window = df_train[df_train['user_id']==user]
    pos = dict_users[user]
    
    ## 7.1 Past Features
    
    user_window_past_questions_ids = user_window['question_id'].values[(-SEQ_LENGTH-1):-1].astype(np.int32)
    user_window_past_questions_parts = user_window['question_part'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_past_questions_correct_answer = user_window['question_correct_answer'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_past_questions_tags = user_window['question_tags'].values[(-SEQ_LENGTH-1):-1]
    
    user_window_elapsed_time = user_window['prior_question_elapsed_time'].shift(-1).values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_elapsed_time = np.where(user_window_elapsed_time > 300, 300, user_window_elapsed_time)
    user_window_lag_time = ((user_window['timestamp'] - user_window['timestamp'].shift(1).fillna(0))/60).values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_lag_time = np.where(user_window_lag_time > 1440, 1440, user_window_lag_time)
    
    user_window_past_answered_correct = user_window['answered_correctly'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    user_window_past_answered_parts = user_window['user_answer'].values[(-SEQ_LENGTH-1):-1].astype(np.int16)
    
    X_past_questions[pos, :user_window_past_questions_ids.shape[0], 0] = user_window_past_questions_ids
    X_past_questions[pos, :user_window_past_questions_parts.shape[0], 1] = user_window_past_questions_parts
    X_past_questions[pos, :user_window_past_questions_correct_answer.shape[0], 2] = user_window_past_questions_correct_answer
    
    for i in range(user_window_past_questions_tags.shape[0]):
        values = user_window_past_questions_tags
        X_past_questions[pos, i, 3:(3+user_window_past_questions_tags[i].shape[0])] = user_window_past_questions_tags[i]
    
    X_past_responses[pos, :user_window_past_answered_correct.shape[0], 0] = user_window_past_answered_correct
    X_past_responses[pos, :user_window_past_answered_parts.shape[0], 1] = user_window_past_answered_parts
    
    X_past_times[pos, :user_window_elapsed_time.shape[0], 0] = user_window_elapsed_time
    X_past_times[pos, :user_window_lag_time.shape[0], 1] = user_window_lag_time
    
    ## 7.2 Current questions ids & tags
    
    q_id = user_window['question_id'].values[0]
    X_curr_question_ids[pos, 0] = q_id.astype(np.int32)
    X_curr_question_ids[pos, 1] = user_window['question_part'].values[0].astype(np.int16)
    
    tags_ = user_window['question_tags'].values[0]
    X_curr_question_tags[pos, :tags_.shape[0]] = tags_
    
    ## 7.3 Current agg features
    
    X_agg_user_features[pos, :] = df_user_content[df_user_content['user_id']==user].values[0, 1:]
    X_agg_content_features[pos, :] = df_features_content[df_features_content['content_id']==q_id].values[0, :3]
    
    
np.save(path_output + 'test/' + 'X_past_questions.npy', X_past_questions)
np.save(path_output + 'test/' + 'X_past_responses.npy', X_past_responses)
np.save(path_output + 'test/' + 'X_past_times.npy', X_past_times)
np.save(path_output + 'test/' + 'X_agg_content_features.npy', X_agg_content_features)
np.save(path_output + 'test/' + 'X_agg_user_features.npy', X_agg_user_features)
np.save(path_output + 'test/' + 'X_curr_question_ids.npy', X_curr_question_ids)
np.save(path_output + 'test/' + 'X_curr_question_tags.npy', X_curr_question_tags)
# np.save(path_output + 'test/' + 'X_target.npy', X_target)

# np.save(path_output + 'test/' + 'unique_users.npy', unique_users)
# np.save(path_output + 'test/' + 'dict_users.npy', dict_users)
# np.save(path_output + 'test/' + 'dict_users_inv.npy', dict_users_inv)

df_train.to_csv(path_output + 'test/' + 'df_test.csv', index=False)

#############################################################################

100%|█████████████████████████████████████████████████████████████████████████████| 7712/7712 [00:17<00:00, 449.56it/s]


In [19]:
X_past_questions[np.where(X_past_questions==-1)].shape

(5076884,)

In [20]:
set_unique_tags = set()
values = df_questions['question_tags'].values
for i, list_tags in enumerate(values):
    for q in list_tags:
        if q not in set_unique_tags:
            set_unique_tags.add(q)

input_dim_q_tags = len(set_unique_tags)
print(input_dim_q_tags)

189


In [21]:
df_questions.question_part.unique().shape

(7,)

In [26]:
df_train['question_correct_answer'].unique()

array([3, 2, 0, 1], dtype=int8)