In [30]:
import os
import sys
import gzip
import time
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

## ASSISTments 12-13

Download the original data file [here](https://sites.google.com/site/assistmentsdata/home/2012-13-school-data-with-affect).

In [31]:
prefix = '/work/cywang/workspace/Data/ASSISTments/'  # path to the original data

# Read only the first line of the raw CSV; it holds the quoted column names.
# The context manager closes the handle even if reading fails.
with open(prefix + '2012-2013-data-with-predictions-4-final.csv') as f:
    header_line = f.readline().strip()

# SECURITY NOTE: eval() executes whatever text the first line contains.
# Only run this on a data file obtained from a trusted source.
header = eval(header_line)
header

('problem_log_id',
 'skill',
 'problem_id',
 'user_id',
 'assignment_id',
 'assistment_id',
 'start_time',
 'end_time',
 'problem_type',
 'original',
 'correct',
 'bottom_hint',
 'hint_count',
 'actions',
 'attempt_count',
 'ms_first_response',
 'tutor_mode',
 'sequence_id',
 'student_class_id',
 'position',
 'type',
 'base_sequence_id',
 'skill_id',
 'teacher_id',
 'school_id',
 'overlap_time',
 'template_id',
 'answer_id',
 'answer_text',
 'first_action',
 'problemlogid',
 'Average_confidence(FRUSTRATED)',
 'Average_confidence(CONFUSED)',
 'Average_confidence(CONCENTRATING)',
 'Average_confidence(BORED)')

In [32]:
# Positions, inside one parsed record, of the columns kept for every interaction.
user_id = 3
problem_id = 2
correct = 10
start_time = 6
end_time = 7
skill_name = 1
skill_id = 22

# Additional side features, listed as (output column name, position in the record).
_feature_columns = [
    ('first_action', 29),
    ('problem_type', 8),
    ('original', 9),
    ('tutor_mode', 16),
    ('teacher_id', 23),
    ('school_id', 24),
]
features = [position for _, position in _feature_columns]
feature_names = [name for name, _ in _feature_columns]

In [33]:
def get_row(f):
    """Read one logical record from the raw ASSISTments 12-13 file.

    The file is badly formatted: a single record can be split over several
    physical lines. The record is reassembled from
      * a first line that must split into exactly 14 pieces on '","', and
      * a continuation, found by skipping lines that split into 10 or fewer
        pieces on ',' and then appending lines until the continuation holds
        an odd number of double quotes.
    All backslashes are removed from the reassembled text before it is parsed.

    Args:
        f: an open text file positioned at the start of a record.

    Returns:
        The value produced by evaluating the reassembled record text,
        or ``False`` when the file is already at EOF.

    Raises:
        AssertionError: the first line does not have the expected 14 pieces.
        EOFError: the file ends in the middle of a record (the loops below
            would otherwise spin forever, since readline() keeps returning
            an empty string at EOF).
        Exception: whatever ``eval`` raises on a malformed record.
    """
    first_part = f.readline()
    if len(first_part) == 0:   # EOF
        return False

    first_part = first_part.strip()
    assert len(first_part.split('","')) == 14

    def read_continuation():
        # A blank physical line still contains its newline, so a zero-length
        # read can only mean EOF.
        raw = f.readline()
        if len(raw) == 0:
            raise EOFError('file ended in the middle of a record')
        return raw.strip()

    # Skip the short fragments; they are discarded, not kept.
    line = read_continuation()
    while len(line.split(',')) <= 10:
        line = read_continuation()

    # The first part ends inside an open quoted field, so a complete
    # continuation carries an odd number of double quotes.
    second_part = line
    while second_part.count('"') % 2 == 0:
        second_part += read_continuation()

    info_str = (first_part + second_part).replace('\\', '')
    # SECURITY NOTE: eval() executes the record text; only use on trusted data.
    return eval(info_str)

In [34]:
# Parse the raw 12-13 file record by record and keep only the rows that carry
# a skill id. Counters: total_cnt = records parsed, valid_cnt = records kept,
# bad_cnt = records that get_row() could not parse.
time_format = "%Y-%m-%d %H:%M:%S"  # hoisted: identical for every row

data_dict = {}
valid_cnt, bad_cnt, total_cnt = 0, 0, 0
# The context manager closes the file even if a row fails further down.
with open(prefix + '2012-2013-data-with-predictions-4-final.csv') as f:
    f.readline()  # skip the header line

    while True:
        # Every read is guarded, including the first one and reads that follow
        # a bad record, so consecutive malformed records are all just counted.
        try:
            info = get_row(f)
        except OSError:
            raise  # a failing disk/file is not a "bad record"
        except Exception:
            bad_cnt += 1
            continue
        if not info:  # get_row() returns False at EOF
            break

        total_cnt += 1
        # progress: one dot per 10,000 records, a new line per 1,000,000
        if total_cnt % 10000 == 0:
            print('.', end='')
            sys.stdout.flush()
        if total_cnt % 1000000 == 0:
            print()

        if info[skill_id] != '':  # over half of the records have blank skill_id
            # Fractional seconds are cut off before parsing.
            # NOTE: time.mktime() interprets the time in the local timezone of
            # the machine running the notebook.
            start = time.mktime(datetime.strptime(info[start_time].split('.')[0], time_format).timetuple())
            end = time.mktime(datetime.strptime(info[end_time].split('.')[0], time_format).timetuple())
            dwell = int(end) - int(start)

            record = {
                'user_id': info[user_id],
                'problem_id': info[problem_id],
                'correct': round(float(info[correct])),
                'skill_id': info[skill_id],
                'timestamp': (int(start) + int(end)) // 2,  # midpoint of the attempt
                'skill_name': info[skill_name],
                'dwell_time': max(dwell, 0),  # negative durations are clamped to 0
            }
            for f_name, f_pos in zip(feature_names, features):
                record[f_name] = info[f_pos]
            data_dict[valid_cnt] = record
            valid_cnt += 1

print()
print(total_cnt, valid_cnt, bad_cnt)
raw_data_df = pd.DataFrame.from_dict(data_dict, orient='index')
raw_data_df.head()

....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
............
6123015 2711813 255


Unnamed: 0,teacher_id,first_action,problem_type,timestamp,problem_id,dwell_time,tutor_mode,original,user_id,skill_name,school_id,skill_id,correct
0,47424,0,algebra,1349751722,365981,21,tutor,1,61394,Rounding,5048,54,1
1,47424,0,algebra,1362624804,426415,8,tutor,1,61394,Multiplication and Division Integers,5048,279,0
2,47424,0,algebra,1376999708,86686,25,tutor,1,61394,Proportion,5048,79,1
3,152676,0,algebra,1355317261,401234,12,tutor,1,76592,Exponents,7561,86,1
4,49343,0,algebra,1350354684,87699,61,tutor,1,78401,Equation Solving Two or Fewer Steps,5056,311,1


In [35]:
# Clean data

# Step 1: discard interactions whose skill_name is an empty string.
print('raw data:', len(raw_data_df))
has_skill_name = raw_data_df['skill_name'] != ''
filter_df = raw_data_df[has_skill_name]
print('drop blank:', len(filter_df))

# Step 2: keep only users with at least 5 interactions.
# NOTE: despite its name, `illegal_users` ends up holding the users that are KEPT.
illegal_users = []
user_wise_lst = []
for user, user_df in filter_df.groupby('user_id'):
    if len(user_df) < 5:
        continue
    illegal_users.append(user)
    # order each user's interactions by time
    user_wise_lst.append(user_df.sort_values(by=['timestamp']))

# Step 3: shuffle the users (fixed seed), then stack the per-user frames.
np.random.seed(0)
np.random.shuffle(user_wise_lst)
user_wise_df = pd.concat(user_wise_lst)
user_wise_df = user_wise_df.reset_index(drop=True)
print('drop <5:', len(user_wise_df))
user_wise_df.head()

raw data: 2711813
drop blank: 2630080
drop <5: 2621308


Unnamed: 0,teacher_id,first_action,problem_type,timestamp,problem_id,dwell_time,tutor_mode,original,user_id,skill_name,school_id,skill_id,correct
0,63593,0,choose_1,1371613820,988,76,tutor,1,221693,Graph Shape,5545,106,1
1,63593,0,algebra,1371614147,314635,24,tutor,1,221693,Equation Solving Two or Fewer Steps,5545,311,1
2,63593,0,algebra,1371614175,321174,27,tutor,1,221693,Write Linear Equation from Slope and y-intercept,5545,326,1
3,63593,0,choose_1,1377683264,374226,42,tutor,1,221693,Solving Inequalities,5545,375,1
4,63593,0,algebra,1377683301,43993,29,tutor,1,221693,Exponents,5545,86,1


In [36]:
# Re-index: replace raw ids / categories by consecutive integers, numbered in
# order of first appearance in user_wise_df.


def _first_seen_index(values):
    """Map every distinct value to the 0-based rank of its first appearance."""
    index = dict()
    for value in values:
        index.setdefault(value, len(index))
    return index


# user and problem
new_user_id = _first_seen_index(user_wise_df['user_id'].values)
new_problem_id = _first_seen_index(user_wise_df['problem_id'].values)
user_cnt, problem_cnt = len(new_user_id), len(new_problem_id)

# skill: also remember the name seen with the first occurrence of each skill id
new_skill_id, skill2name = dict(), dict()
for s_id, s_name in zip(user_wise_df['skill_id'].values, user_wise_df['skill_name'].values):
    if s_id in new_skill_id:
        continue
    skill2name[len(new_skill_id)] = s_name
    new_skill_id[s_id] = len(new_skill_id)
skill_cnt = len(new_skill_id)
# skill_name is dropped in place, so this cell cannot be re-run on its own
user_wise_df.drop(columns='skill_name', inplace=True)

# features: one mapping (and one counter) per side feature
new_feature_id = [
    _first_seen_index(user_wise_df[feature_names[i]].values)
    for i in range(len(features))
]
feature_cnt = [len(mapping) for mapping in new_feature_id]

# apply the mappings
user_wise_df['user_id'] = user_wise_df['user_id'].map(new_user_id)
user_wise_df['problem_id'] = user_wise_df['problem_id'].map(new_problem_id)
user_wise_df['skill_id'] = user_wise_df['skill_id'].map(new_skill_id)
for f_name, mapping in zip(feature_names, new_feature_id):
    user_wise_df[f_name] = user_wise_df[f_name].map(mapping)

print(user_cnt, problem_cnt, skill_cnt)
print(feature_cnt)
user_wise_df.head()

25266 50918 245
[3, 6, 2, 2, 664, 335]


Unnamed: 0,teacher_id,first_action,problem_type,timestamp,problem_id,dwell_time,tutor_mode,original,user_id,school_id,skill_id,correct
0,0,0,0,1371613820,0,76,0,0,0,0,0,1
1,0,0,1,1371614147,1,24,0,0,0,0,1,1
2,0,0,1,1371614175,2,27,0,0,0,0,2,1
3,0,0,0,1377683264,3,42,0,0,0,0,3,1
4,0,0,1,1377683301,4,29,0,0,0,0,4,1


In [37]:
# Adjust dtypes: the label and the two time columns are cast to float64.

float_columns = ['correct', 'dwell_time', 'timestamp']
user_wise_df = user_wise_df.astype(dict.fromkeys(float_columns, np.float64))
user_wise_df.dtypes

teacher_id        int64
first_action      int64
problem_type      int64
timestamp       float64
problem_id        int64
dwell_time      float64
tutor_mode        int64
original          int64
user_id           int64
school_id         int64
skill_id          int64
correct         float64
dtype: object

In [38]:
# Save: tab-separated interactions plus the {new skill id: skill name} lookup.

target_prefix = '/work/cywang/workspace/Project/HawkesKT/data/'
dataset_name = 'ASSISTments_12-13/'
output_dir = os.path.join(target_prefix, dataset_name)
# Create the dataset folder if needed; otherwise both writes below fail on a
# fresh checkout where the folder does not exist yet.
os.makedirs(output_dir, exist_ok=True)
user_wise_df.to_csv(os.path.join(output_dir, 'interactions.csv'), sep='\t', index=False)
with open(os.path.join(output_dir, 'skill_name.json'), 'w') as fp:
    json.dump(skill2name, fp)

---------------------------------------------------------------------------------------------------------------------

## ASSISTments 09-10

Download the original data file [here](https://sites.google.com/site/assistmentsdata/home/assistment-2009-2010-data/skill-builder-data-2009-2010).

In [10]:
# The original file is encoded as ISO-8859-1; it needs to be converted to
# UTF-8 in advance, because it is read as UTF-8 here.

prefix = '/work/cywang/workspace/Data/ASSISTments/'
csv_path_09 = os.path.join(prefix, 'skill_builder_data_corrected.csv')
data_df_09 = pd.read_csv(csv_path_09, encoding='utf-8')
data_df_09.head()

Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,tutor_mode,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,33022537,277618,64525,33139,51424,1,1,1,32454,tutor,...,0,3,32454,30799,,26,0,,1,1.0
1,33022709,277618,64525,33150,51435,1,1,1,4922,tutor,...,0,3,4922,30799,,55,0,,2,2.0
2,35450204,220674,70363,33159,51444,1,0,2,25390,tutor,...,0,3,42000,30799,,88,0,,1,1.0
3,35450295,220674,70363,33110,51395,1,1,1,4859,tutor,...,0,3,4859,30059,,41,0,,2,2.0
4,35450311,220674,70363,33196,51481,1,0,14,19813,tutor,...,3,4,124564,30060,,65,0,0.0,3,3.0


In [11]:
# Restrict the frame to the columns listed below, in this order.
kept_columns_09 = [
    'user_id', 'problem_id', 'correct', 'skill_id', 'skill_name', 'ms_first_response',
    'first_action', 'answer_type', 'original', 'tutor_mode', 'teacher_id', 'school_id',
]
data_df_09 = data_df_09[kept_columns_09]
data_df_09.head()

Unnamed: 0,user_id,problem_id,correct,skill_id,skill_name,ms_first_response,first_action,answer_type,original,tutor_mode,teacher_id,school_id
0,64525,51424,1,1.0,Box and Whisker,32454,0,algebra,1,tutor,22763,73
1,64525,51435,1,1.0,Box and Whisker,4922,0,algebra,1,tutor,22763,73
2,70363,51444,0,1.0,Box and Whisker,25390,0,algebra,1,tutor,22763,73
3,70363,51395,1,1.0,Box and Whisker,4859,0,algebra,1,tutor,22763,73
4,70363,51481,0,1.0,Box and Whisker,19813,0,algebra,1,tutor,22763,73


In [12]:
# Clean data

# Step 1: drop every row that has a NaN in any of the selected columns.
print('raw_data:', len(data_df_09))
filter_df_09 = data_df_09.dropna()
print('drop nan:', len(filter_df_09))

# Step 2: keep only users with at least 5 interactions.
# Step 3: build a synthetic timestamp per user (slow): every interaction starts
#         1 second after the previous one's dwell time has elapsed.
filter_df_09 = filter_df_09.assign(dwell_time=filter_df_09['ms_first_response'] / 1000.0)  # ms -> s
user_wise_lst = []
for user, user_df in filter_df_09.groupby('user_id'):
    if len(user_df) < 5:
        continue
    timestamps = np.zeros(len(user_df))
    previous_dwells = user_df['dwell_time'].values[:-1]
    for pos, dwell in enumerate(previous_dwells, start=1):
        timestamps[pos] = timestamps[pos - 1] + dwell + 1.
    user_wise_lst.append(user_df.assign(timestamp=timestamps))

# Step 4: shuffle the users (fixed seed), stack them, drop the raw ms column.
np.random.seed(0)
np.random.shuffle(user_wise_lst)
user_wise_df_09 = pd.concat(user_wise_lst).reset_index(drop=True)
user_wise_df_09 = user_wise_df_09.drop(columns=['ms_first_response'])
print('drop <5:', len(user_wise_df_09))
user_wise_df_09.head()

raw_data: 401756
drop nan: 325637
drop <5: 324527


Unnamed: 0,user_id,problem_id,correct,skill_id,skill_name,first_action,answer_type,original,tutor_mode,teacher_id,school_id,dwell_time,timestamp
0,78570,93156,1,11.0,Venn Diagram,0,algebra,1,tutor,46174,5056,79.421,0.0
1,78570,93194,1,11.0,Venn Diagram,0,algebra,1,tutor,46174,5056,48.275,80.421
2,78570,93175,1,11.0,Venn Diagram,0,algebra,1,tutor,46174,5056,0.001,129.696
3,78570,86118,1,70.0,Percent Of,0,algebra,1,tutor,46174,5056,97.509,130.697
4,78570,86121,1,70.0,Percent Of,0,algebra,1,tutor,46174,5056,80.465,229.206


In [13]:
# Re-index: replace raw ids / categories by consecutive integers, numbered in
# order of first appearance in user_wise_df_09.

skill2name_09 = dict()
new_user_id, new_problem_id, new_skill_id = dict(), dict(), dict()
user_cnt, problem_cnt, skill_cnt = 0, 0, 0

# user and problem
for u_id, p_id in zip(user_wise_df_09['user_id'].values, user_wise_df_09['problem_id'].values):
    if u_id not in new_user_id:
        new_user_id[u_id] = user_cnt
        user_cnt += 1
    if p_id not in new_problem_id:
        new_problem_id[p_id] = problem_cnt
        problem_cnt += 1

# skill: also remember the name seen with the first occurrence of each skill id
for s_id, s_name in zip(user_wise_df_09['skill_id'].values, user_wise_df_09['skill_name'].values):
    if s_id not in new_skill_id:
        new_skill_id[s_id] = skill_cnt
        skill2name_09[skill_cnt] = s_name
        skill_cnt += 1
user_wise_df_09 = user_wise_df_09.drop(columns='skill_name')

# features: this dataset calls the problem type 'answer_type'; rename it so the
# column names match the ones listed in feature_names
user_wise_df_09 = user_wise_df_09.rename(columns={'answer_type': 'problem_type'})
feature_names = ['first_action', 'problem_type', 'original', 'tutor_mode', 'teacher_id', 'school_id']
new_feature_id, feature_cnt = list(), list()
# Iterate over this cell's own feature_names. The previous version looped over
# range(len(features)), i.e. over the column-index list of the 12-13 section,
# which made this cell depend on an unrelated variable.
for f_name in feature_names:
    mapping = dict()
    for f_val in user_wise_df_09[f_name].values:
        if f_val not in mapping:
            mapping[f_val] = len(mapping)
    new_feature_id.append(mapping)
    feature_cnt.append(len(mapping))

# apply the mappings
user_wise_df_09['user_id'] = user_wise_df_09['user_id'].apply(lambda x: new_user_id[x])
user_wise_df_09['problem_id'] = user_wise_df_09['problem_id'].apply(lambda x: new_problem_id[x])
user_wise_df_09['skill_id'] = user_wise_df_09['skill_id'].apply(lambda x: new_skill_id[x])
for j, f_name in enumerate(feature_names):
    user_wise_df_09[f_name] = user_wise_df_09[f_name].apply(lambda x: new_feature_id[j][x])

print(user_cnt, problem_cnt, skill_cnt)
print(feature_cnt)
user_wise_df_09.head()

3678 16868 111
[3, 5, 2, 2, 129, 60]


Unnamed: 0,user_id,problem_id,correct,skill_id,first_action,problem_type,original,tutor_mode,teacher_id,school_id,dwell_time,timestamp
0,0,0,1,0,0,0,0,0,0,0,79.421,0.0
1,0,1,1,0,0,0,0,0,0,0,48.275,80.421
2,0,2,1,0,0,0,0,0,0,0,0.001,129.696
3,0,3,1,1,0,0,0,0,0,0,97.509,130.697
4,0,4,1,1,0,0,0,0,0,0,80.465,229.206


In [14]:
# Adjust dtypes: the label and the two time columns are cast to float64.

float_dtypes_09 = {column: np.float64 for column in ('correct', 'dwell_time', 'timestamp')}
user_wise_df_09 = user_wise_df_09.astype(float_dtypes_09)
user_wise_df_09.dtypes

user_id           int64
problem_id        int64
correct         float64
skill_id          int64
first_action      int64
problem_type      int64
original          int64
tutor_mode        int64
teacher_id        int64
school_id         int64
dwell_time      float64
timestamp       float64
dtype: object

In [15]:
# Save: tab-separated interactions plus the {new skill id: skill name} lookup.

target_prefix = '/work/cywang/workspace/Project/HawkesKT/data/'
dataset_name = 'ASSISTments_09-10'
output_dir = os.path.join(target_prefix, dataset_name)
# Create the dataset folder if needed; otherwise both writes below fail on a
# fresh checkout where the folder does not exist yet.
os.makedirs(output_dir, exist_ok=True)
user_wise_df_09.to_csv(os.path.join(output_dir, 'interactions.csv'), sep='\t', index=False)
with open(os.path.join(output_dir, 'skill_name.json'), 'w') as fp:
    json.dump(skill2name_09, fp)

---------------------------------------------------------------------------------------------------------------------

## slepemapy.cz

Download the original data file [here](https://www.fi.muni.cz/adaptivelearning/?a=data).

In [16]:
# Load the raw answers; the file uses ';' as the field separator.
prefix = '/work/cywang/workspace/Data/slepemapy/'
answers_path_cz = os.path.join(prefix, 'answer.csv')
data_df_cz = pd.read_csv(answers_path_cz, sep=';')
data_df_cz.head()

Unnamed: 0,id,user,place_asked,place_answered,type,inserted,response_time,place_map,language,options,ip_country,ip_id
0,2407,75,131,,2,2013-09-26 14:45:51,8250,,0,"[57, 82, 131, 77]",,
1,2408,75,130,69.0,1,2013-09-26 14:46:03,8526,,0,"[58, 219, 69, 130]",,
2,2409,75,136,128.0,1,2013-09-26 14:46:10,4060,,0,"[128, 136, 216, 54]",,
3,2410,75,126,126.0,1,2013-09-26 14:46:20,1842,,0,"[153, 58, 126, 214]",,
4,2411,75,101,101.0,2,2013-09-26 14:46:33,10258,,0,"[153, 163, 101, 195]",,


In [17]:
# Clean data

# 1. place_answered is NaN
print('raw data:', len(data_df_cz))
filter_df_cz = data_df_cz[~data_df_cz['place_answered'].isna()]
print('drop nan:', len(filter_df_cz))

# 2. define skill, problem, label
# rename() returns a new frame, so the column assignments below no longer write
# into a boolean-indexed slice of data_df_cz (that raised a
# SettingWithCopyWarning, hidden by the global warnings filter).
filter_df_cz = filter_df_cz.rename(columns={'user': 'user_id'})
# An answer is correct when the answered place equals the asked place. This is
# computed on the filtered frame itself instead of relying on index alignment
# with the unfiltered data_df_cz.
filter_df_cz['correct'] = (
    filter_df_cz['place_asked'].astype(float) == filter_df_cz['place_answered'].astype(float)
)
filter_df_cz['dwell_time'] = filter_df_cz['response_time'] / 1000.0  # ms -> s
# NOTE: time.mktime() interprets the time in the local timezone of the machine
# running the notebook.
filter_df_cz['timestamp'] = filter_df_cz['inserted'].apply(
    lambda t: time.mktime(time.strptime(t, '%Y-%m-%d %H:%M:%S')))
# The asked place is the skill; each (place, question type) pair is a problem.
filter_df_cz['skill_id'] = filter_df_cz['place_asked'] - 1
filter_df_cz['problem_id'] = filter_df_cz['skill_id'] * 2 + filter_df_cz['type'] - 1

# 3. keep only users with at least 5 interactions
user_wise_lst = list()
for user, user_df in filter_df_cz.groupby('user_id'):
    if len(user_df) >= 5:
        df = user_df.sort_values(by=['timestamp'])  # assure the sequence order
        user_wise_lst.append(df)

# 4. shuffle the users (fixed seed), stack them, keep the output columns
np.random.seed(0)
np.random.shuffle(user_wise_lst)
user_wise_df_cz = pd.concat(user_wise_lst).reset_index(drop=True)
user_wise_df_cz = user_wise_df_cz[['user_id', 'skill_id', 'problem_id', 'dwell_time', 'timestamp', 'correct']]
print('drop <5:', len(user_wise_df_cz))
user_wise_df_cz.head()

raw data: 10087305
drop nan: 9805599
drop <5: 9786488


Unnamed: 0,user_id,skill_id,problem_id,dwell_time,timestamp,correct
0,50783,78,156,8.987,1416971000.0,False
1,50783,114,229,4.859,1416971000.0,False
2,50783,181,363,4.796,1416971000.0,True
3,50783,189,378,12.721,1416971000.0,True
4,50783,193,386,7.739,1416971000.0,True


In [18]:
# Re-index: renumber users 0..N-1 following the order returned by unique().

user_ids = list(user_wise_df_cz['user_id'].unique())
user_dict = {raw_id: new_id for new_id, raw_id in enumerate(user_ids)}
user_wise_df_cz['user_id'] = user_wise_df_cz['user_id'].map(user_dict)
user_wise_df_cz.head()

Unnamed: 0,user_id,skill_id,problem_id,dwell_time,timestamp,correct
0,0,78,156,8.987,1416971000.0,False
1,0,114,229,4.859,1416971000.0,False
2,0,181,363,4.796,1416971000.0,True
3,0,189,378,12.721,1416971000.0,True
4,0,193,386,7.739,1416971000.0,True


In [19]:
# Adjust dtypes: the label and the two time columns are cast to float64.

float_dtypes_cz = dict(correct=np.float64, dwell_time=np.float64, timestamp=np.float64)
user_wise_df_cz = user_wise_df_cz.astype(float_dtypes_cz)
user_wise_df_cz.dtypes

user_id         int64
skill_id        int64
problem_id      int64
dwell_time    float64
timestamp     float64
correct       float64
dtype: object

In [20]:
# Save: tab-separated interactions.

target_prefix = '/work/cywang/workspace/Project/HawkesKT/data/'
dataset_name = 'slepemapy_cz'
output_dir = os.path.join(target_prefix, dataset_name)
# Create the dataset folder if needed; otherwise the write below fails on a
# fresh checkout where the folder does not exist yet.
os.makedirs(output_dir, exist_ok=True)
user_wise_df_cz.to_csv(os.path.join(output_dir, 'interactions.csv'), sep='\t', index=False)