This version is specifically designed for a short dataset following pengchen's work and only looking at 4 AES

In [39]:
import pandas as pd

# read log data
with open('../raw data/training-log-corpus.log', 'r') as f:
    logs = f.readlines()

last_time = 0.0
last_student = ''
action_count = 0
l = []
for i, line in enumerate(logs):
    tokens = line.split('\n')[0]
    tokens = tokens.split('|')

    student_id = tokens[0]
    if len(student_id) != 8:
        print(
            f"ERROR: in line {i}, student id length should be 8, found {len(student_id)}!"
        )
        exit(0)

    action = tokens[1]

    # starting entries do not have time
    try:
        float(tokens[2])
    except:
        tokens.insert(2, 0.0)
    time = float(tokens[2])

    if last_student != student_id:
        last_time = 0.0
        print(f"{action_count} actions found for student {last_student}")
        action_count = 0

    # check is time is almost monotonic
    if last_time - time >= 1:
        print(
            f"ERROR: in line {i}, time is not monotonic! current action {action}. total action found {action_count}. student {student_id}"
        )

    last_time = time
    last_student = student_id
    action_count += 1

    try:
        detail = tokens[3]
    except:
        detail = ""

    others = ("|").join(tokens[4:])

    l.append({
        'student_id': student_id,
        'action': action,
        'time': time,
        'detail': detail,
        'others': others
    })

df = pd.DataFrame(l,
                  columns=['student_id', 'action', 'time', 'detail', 'others'])


# for students with "RESET"
# we are keeping if student reseted after planing signifcant part of the game (>70%)
# or resetted very early (<30%)
print("-- removing reset issues --")
dlt_idx = []
student_with_reset = df.loc[df['action'] == 'RESET']['student_id'].unique()
for student in student_with_reset:
    df_s = df.loc[df['student_id'] == student]
    total_steps = len(df_s)
    i = df.loc[(df['student_id'] == student)
               & (df['action'] == 'RESET')].index[0]
    before_reset_steps = len(df_s.loc[:i])
    percent_before_reset = round(before_reset_steps / total_steps * 100, 2)
    drop_curr = []
    if percent_before_reset < 30.0:
        drop_curr = list(df_s.loc[df_s.index[0]:i + 1].index)
    elif percent_before_reset > 70.0:
        drop_curr = list(df_s.loc[i:].index)
    else:
        drop_curr = list(df_s.index)

    print(
        "student: {0} | total_steps: {1} | reset_at: {2}({3}%) | dropping: {4}"
        .format(student, total_steps, before_reset_steps, percent_before_reset,
                len(drop_curr)))
    dlt_idx += drop_curr
    

df = df.drop(index=dlt_idx).reset_index(drop=True)

0 actions found for student 
908 actions found for student 100-0001
737 actions found for student 100-0003
605 actions found for student 100-0004
851 actions found for student 100-0005
705 actions found for student 100-0006
708 actions found for student 100-0007
816 actions found for student 100-0008
624 actions found for student 100-0009
1183 actions found for student 100-0010
1132 actions found for student 100-0011
799 actions found for student 100-0012
854 actions found for student 100-0013
836 actions found for student 100-0014
631 actions found for student 100-0016
1291 actions found for student 100-0017
585 actions found for student 100-0018
873 actions found for student 100-0019
897 actions found for student 100-0020
683 actions found for student 100-0021
790 actions found for student 100-0022
596 actions found for student 100-0024
575 actions found for student 100-0025
883 actions found for student 100-0026
955 actions found for student 100-0027
883 actions found for student 10

513 actions found for student 100-0336
997 actions found for student 100-0337
981 actions found for student 100-0338
887 actions found for student 100-0339
722 actions found for student 100-0340
472 actions found for student 100-0343
739 actions found for student 100-0345
785 actions found for student 100-0346
654 actions found for student 100-0347
679 actions found for student 100-0348
760 actions found for student 100-0349
850 actions found for student 100-0350
917 actions found for student 100-0351
1172 actions found for student 100-0352
572 actions found for student 100-0354
780 actions found for student 100-0355
769 actions found for student 100-0356
802 actions found for student 100-0357
735 actions found for student 100-0358
848 actions found for student 100-0359
631 actions found for student 100-0361
861 actions found for student 100-0363
618 actions found for student 100-0364
772 actions found for student 100-0365
758 actions found for student 100-0366
774 actions found for st

In [2]:
from collections import defaultdict

# creating action to feature maps
action_map = {
    'DIALOG': 's_dialog_turn',
    'OPEN': 's_open_door',
    'CLOSE' : 's_close_door',
    'LOOKSTART': 's_view_poster',
    'PDAUSE': 's_take_quiz',
    'PICKUP': 's_pickup_obj',
    'DROP': 's_drop_obj',
    'TESTOBJECT': 's_test_obj',
    'WORKSHEET': 's_diagnosis_worksheet',
    'PDAOPEN': 's_use_pda',
    'LABELING': 's_label_slide',
    'NOTES': 's_take_note',
    'BOOKREAD': 's_read_book',
    'BRYCECOMPUTER': 's_bryce_computer',
    'GAMEOVER': 's_end_game'
}

# TALK action needs to be segmented further for narrative planner
action_talk_map = {
    'cur-action-talk-bryce': 's_talk_bryce',
    'cur-action-talk-teresa': 's_talk_teresa', 
    'cur-action-talk-ford': 's_talk_ford', 
    'cur-action-talk-quentin': 's_talk_quentin', 
    'cur-action-talk-robert': 's_talk_robert', 
    
    'cur-action-talk-extraa': 's_talk_others',
    'cur-action-talk-extrab': 's_talk_others',
    'cur-action-talk-elise': 's_talk_others',
    'cur-action-talk-kim': 's_talk_others',
    
}

# all types of adaptions. Not using mystry quiz (as this has 8 choise) and off-task behavior (not enough sample)
adaption_map = {
    'select-bryce-symptoms-level': 's_aes_bryce_symptoms',
    'select-teresa-symptoms-level': 's_aes_teresa_symptoms',
    'select-present-quiz': 's_aes_knowledge_quiz',
    'select-worksheet-level': 's_aes_diagnosis_feedback',
}

states = ['s_time_passed'] + list(set(action_talk_map.values())) + list(action_map.values()) + list(adaption_map.values())

init_row = defaultdict()
for state in states:
    init_row[state] = 0 


- do we need SOLUTION for s_aes_mystry_solution?
- kim revel adaption points player towards quentin's revel
- removing students who did restart. maybe worthwhile to include them later 
- 21 students didn't do pretest. thus nlg none. removing them. maybe worthwhile to include them later 

In [3]:
from copy import deepcopy

skipping_action = set()
skipping_adaption = set()
all_student_inserts = []
all_narrative_inserts = []
dlt_students = []
for student_id, rows in df.groupby(['student_id']):
    row_insert = deepcopy(init_row)
    row_insert['student_id'] = student_id
    row_insert['step'] = 0
    row_insert['done'] = False
    narrative_step = 0

    for i, row in rows.iterrows():
        if row['action'] == 'ADAPTATION':
            if row['detail'] not in adaption_map.keys():
                skipping_adaption.add(row['detail'])
                continue

            for adapt_feature in adaption_map.values():
                row_insert[adapt_feature] = 0

            row_insert[adaption_map[row['detail']]] = int(row['others'][-1:])

            narrative_row_insert = deepcopy(row_insert)
            narrative_row_insert['s_time_passed'] = row['time']
            narrative_row_insert['step'] = narrative_step
            narrative_row_insert['action'] = int(row['others'][-1:])

            all_narrative_inserts.append(deepcopy(narrative_row_insert))
            narrative_step += 1
            continue

        if row['action'] == 'TALK':
            char = row['others'].split('|')[0]
            if char not in action_talk_map.keys():
                print(
                    "-- ERROR TALK UNKNOWN | action: {0} | detail: {1} | others: {2} --"
                    .format(row['action'], row['detail'], row['others']))
                break
            row_insert[action_talk_map[char]] += 1
            action = action_talk_map[char]

        elif row['action'] in action_map.keys():
            row_insert[action_map[row['action']]] += 1
            action = action_map[row['action']]
        else:
            skipping_action.add(row['action'])
            if row['action'] == 'RESET':
                print(
                    "-- RESET happened for student {0} --".format(student_id))
                dlt_students.append(student_id)
#             print("-- skipping action | action: {0} | detail: {1} | others: {2} --".format(row['action'], row['detail'], row['others']))
            continue

        row_insert['s_time_passed'] = row['time']
        row_insert['action'] = action
        all_student_inserts.append(deepcopy(row_insert))
        row_insert['step'] += 1

    all_student_inserts[-1]['done'] = True
    all_narrative_inserts[-1]['done'] = True

    print('-- finished student {0} --'.format(student_id))

df_student_data = pd.DataFrame(all_student_inserts,
                               columns=['student_id', 'step'] + states +
                               ['action', 'done'])
df_narrative_data = pd.DataFrame(all_narrative_inserts,
                                 columns=['student_id', 'step'] + states +
                                 ['action', 'done'])

df_student_data = df_student_data.loc[~df_student_data['student_id'].
                                      isin(dlt_students)]
df_student_data = df_student_data.reset_index(drop=True)

df_narrative_data = df_narrative_data.loc[~df_narrative_data['student_id'].
                                          isin(dlt_students)]
df_narrative_data = df_narrative_data.reset_index(drop=True)

print("Actions skipped", skipping_action)
print("Adaptions skipped", skipping_adaption)
print("Removing {0} students for Restarting".format(len(dlt_students)))

-- finished student 100-0001 --
-- finished student 100-0003 --
-- finished student 100-0004 --
-- finished student 100-0005 --
-- finished student 100-0006 --
-- finished student 100-0007 --
-- finished student 100-0008 --
-- finished student 100-0009 --
-- finished student 100-0010 --
-- finished student 100-0011 --
-- finished student 100-0012 --
-- finished student 100-0013 --
-- finished student 100-0014 --
-- finished student 100-0016 --
-- finished student 100-0017 --
-- finished student 100-0018 --
-- finished student 100-0019 --
-- finished student 100-0020 --
-- finished student 100-0021 --
-- finished student 100-0022 --
-- finished student 100-0024 --
-- finished student 100-0025 --
-- finished student 100-0026 --
-- finished student 100-0027 --
-- finished student 100-0028 --
-- finished student 100-0029 --
-- finished student 100-0030 --
-- finished student 100-0031 --
-- finished student 100-0032 --
-- finished student 100-0033 --
-- finished student 100-0034 --
-- finis

-- finished student 100-0287 --
-- finished student 100-0289 --
-- finished student 100-0290 --
-- finished student 100-0291 --
-- finished student 100-0292 --
-- finished student 100-0293 --
-- finished student 100-0294 --
-- finished student 100-0295 --
-- finished student 100-0296 --
-- finished student 100-0297 --
-- finished student 100-0298 --
-- finished student 100-0299 --
-- finished student 100-0300 --
-- finished student 100-0302 --
-- finished student 100-0303 --
-- finished student 100-0304 --
-- finished student 100-0305 --
-- RESET happened for student 100-0307 --
-- finished student 100-0307 --
-- finished student 100-0308 --
-- finished student 100-0309 --
-- finished student 100-0310 --
-- finished student 100-0311 --
-- finished student 100-0312 --
-- finished student 100-0314 --
-- finished student 100-0315 --
-- finished student 100-0316 --
-- finished student 100-0317 --
-- finished student 100-0319 --
-- finished student 100-0320 --
-- finished student 100-0321 -

In [4]:
# merging scores in the feature list

df_score = pd.read_csv('../raw data/training-survey-corpus.csv')
df_score = df_score[[
    'Student ID', 'Gender', 'Game-Playing Frequency', 'Content Pre Total',
    'Normalized Learning Gain'
]]
df_score = df_score.rename(
    columns={
        'Student ID': 'student_id',
        'Gender': 's_static_gender',
        'Game-Playing Frequency': 's_static_game_freq',
        'Content Pre Total': 's_static_pretest',
        'Normalized Learning Gain': 'nlg'
    })

# few nlg are NONE!
df_score = df_score.loc[df_score['nlg']!='None']
df_score['nlg'] = pd.to_numeric(df_score['nlg'])


# making sure we have nlgs for all students
df_student_data = df_student_data.loc[df_student_data['student_id'].isin(df_score['student_id'].unique())]
df_student_data = df_student_data.reset_index(drop=True)

df_narrative_data = df_narrative_data.loc[df_narrative_data['student_id'].isin(df_score['student_id'].unique())]
df_narrative_data = df_narrative_data.reset_index(drop=True)

df_score = df_score.loc[df_score['student_id'].isin(df_student_data['student_id'].unique())]


# splitting nlg based on median
mid_nlg = df_score['nlg'].describe()['50%']
df_score.loc[df_score['nlg']<mid_nlg, 'nlg'] = 0
df_score.loc[df_score['nlg']>=mid_nlg, 'nlg'] = 1


# rearrange the columns
cols = [
    'student_id', 'step', 's_static_gender', 's_static_game_freq',
    's_static_pretest', 's_time_passed', 's_talk_kim', 's_talk_quentin',
    's_talk_robert', 's_talk_ford', 's_talk_others', 's_talk_bryce',
    's_talk_teresa', 's_dialog_turn', 's_open_door', 's_close_door',
    's_view_poster', 's_take_quiz', 's_pickup_obj', 's_drop_obj', 's_test_obj',
    's_diagnosis_worksheet', 's_use_pda', 's_label_slide', 's_take_note',
    's_read_book', 's_bryce_computer', 's_end_game', 's_aes_bryce_revel',
    's_aes_bryce_symptoms', 's_aes_next_goal_prompt', 's_aes_increase_urgency',
    's_aes_kim_revel', 's_aes_knowledge_quiz', 's_aes_quentin_revel',
    's_aes_record_reminder', 's_aes_reflection_prompt',
    's_aes_teresa_symptoms', 's_aes_test_count', 's_aes_diagnosis_feedback',
    'action', 'done', 'nlg'
]

df_student_data = df_student_data.merge(df_score,
                                        on=['student_id'],
                                        how='left')

df_narrative_data = df_narrative_data.merge(df_score,
                                        on=['student_id'],
                                        how='left')

df_narrative_data = df_narrative_data[cols]
df_narrative_data

Unnamed: 0,student_id,step,s_static_gender,s_static_game_freq,s_static_pretest,s_time_passed,s_talk_kim,s_talk_quentin,s_talk_robert,s_talk_ford,...,s_aes_knowledge_quiz,s_aes_quentin_revel,s_aes_record_reminder,s_aes_reflection_prompt,s_aes_teresa_symptoms,s_aes_test_count,s_aes_diagnosis_feedback,action,done,nlg
0,100-0001,0,2,1,3,1.170,0,0,0,0,...,0,0,0,0,0,1,0,1,False,0.0
1,100-0001,1,2,1,3,408.255,2,0,0,0,...,0,0,0,0,0,0,0,1,False,0.0
2,100-0001,2,2,1,3,604.005,2,0,0,0,...,0,0,0,0,0,0,0,2,False,0.0
3,100-0001,3,2,1,3,615.285,2,1,0,0,...,2,0,0,0,0,0,0,2,False,0.0
4,100-0001,4,2,1,3,738.465,3,1,0,0,...,0,0,0,0,0,0,2,2,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9425,100-0828,28,1,4,3,3007.365,5,4,1,1,...,0,0,2,0,0,0,0,2,False,1.0
9426,100-0828,29,1,4,3,3048.375,5,4,1,1,...,0,0,2,0,0,0,0,2,False,1.0
9427,100-0828,30,1,4,3,3177.330,6,4,1,1,...,0,0,0,0,0,0,2,2,False,1.0
9428,100-0828,31,1,4,3,3326.325,7,4,1,1,...,0,0,0,0,0,0,1,1,False,1.0


In [6]:
df_student_data.to_pickle('../processed_data/student_trajectories.pkl')
df_narrative_data.to_pickle('../processed_data/narrative_trajectories.pkl')
df_score.to_pickle('../processed_data/scores.pkl')

In [23]:
x = df_score.loc[df_score['nlg']==0]['student_id'].unique()
y = df_score.loc[df_score['nlg']==1]['student_id'].unique()

l = df.loc[(df['student_id'].isin(x)) & (df['action']=='FINALSCORE')].apply(lambda z: int(z['detail'].split('-')[-1]), axis=1)
h = df.loc[(df['student_id'].isin(y)) & (df['action']=='FINALSCORE')].apply(lambda z: int(z['detail'].split('-')[-1]), axis=1)

In [22]:
np.mean(l), np.median(l), np.std(l), np.max(l), np.min(l)

(319.6453488372093, 220.5, 292.91572887530765, 1212, 0)

In [24]:
np.mean(h), np.median(h), np.std(h), np.max(h), np.min(h)

(431.1020408163265, 267.5, 372.5293849468741, 1366, 1)