This version is specifically designed for a short dataset, following Pengchen's work, and only looks at 4 AES.

In [1]:
import pandas as pd

# Read the raw game log and turn it into one record per line.
# Each line is '|'-separated: student_id | action | [time] | [detail] | [others ...]
with open('../raw data/training-log-corpus.log', 'r') as f:
    logs = f.readlines()

last_time = 0.0
last_student = ''
action_count = 0
log_records = []
for line_no, line in enumerate(logs):
    tokens = line.split('\n')[0].split('|')

    student_id = tokens[0]
    if len(student_id) != 8:
        # Raise instead of calling exit(0): exit() reports success and, inside
        # a notebook, can take the kernel down instead of surfacing the error.
        raise ValueError(
            f"ERROR: in line {line_no}, student id length should be 8, found {len(student_id)}!"
        )

    action = tokens[1]

    # starting entries do not have time: insert a 0.0 placeholder so that
    # `detail` and `others` stay at the same positions for every line
    try:
        time = float(tokens[2])
    except (IndexError, ValueError):
        tokens.insert(2, 0.0)
        time = 0.0

    if last_student != student_id:
        # report the student that just ended (nothing to report before the first one)
        if last_student:
            print(f"{action_count} actions found for student {last_student}")
        last_time = 0.0
        action_count = 0

    # check if time is almost monotonic (going back by less than 1 is tolerated)
    if last_time - time >= 1:
        print(
            f"ERROR: in line {line_no}, time is not monotonic! current action {action}. total action found {action_count}. student {student_id}"
        )

    last_time = time
    last_student = student_id
    action_count += 1

    detail = tokens[3] if len(tokens) > 3 else ""
    others = "|".join(tokens[4:])

    log_records.append({
        'student_id': student_id,
        'action': action,
        'time': time,
        'detail': detail,
        'others': others
    })

# the loop only reports a student when the next one starts, so report the last one here
if last_student:
    print(f"{action_count} actions found for student {last_student}")

df = pd.DataFrame(log_records,
                  columns=['student_id', 'action', 'time', 'detail', 'others'])


# for students with "RESET" (only the first RESET of each student is considered):
#   - reset very early (<30% of the rows): keep the student, drop the rows up to the reset
#   - reset after playing a significant part of the game (>70%): keep the student,
#     drop the rows from the reset onwards
#   - anything in between: drop the student entirely
print("-- removing reset issues --")
dlt_idx = []
student_with_reset = df.loc[df['action'] == 'RESET', 'student_id'].unique()
for student in student_with_reset:
    df_s = df.loc[df['student_id'] == student]
    total_steps = len(df_s)
    reset_idx = df_s.index[df_s['action'] == 'RESET'][0]
    # .loc slices include both end points, so this count includes the RESET row
    before_reset_steps = len(df_s.loc[:reset_idx])
    percent_before_reset = round(before_reset_steps / total_steps * 100, 2)
    if percent_before_reset < 30.0:
        # .loc[:reset_idx] already contains the RESET row; slicing to
        # reset_idx + 1 would also drop the first row after the reset
        drop_curr = list(df_s.loc[:reset_idx].index)
    elif percent_before_reset > 70.0:
        drop_curr = list(df_s.loc[reset_idx:].index)
    else:
        drop_curr = list(df_s.index)

    print(
        "student: {0} | total_steps: {1} | reset_at: {2}({3}%) | dropping: {4}"
        .format(student, total_steps, before_reset_steps, percent_before_reset,
                len(drop_curr)))
    dlt_idx += drop_curr


df = df.drop(index=dlt_idx).reset_index(drop=True)
df

0 actions found for student 
908 actions found for student 100-0001
737 actions found for student 100-0003
605 actions found for student 100-0004
851 actions found for student 100-0005
705 actions found for student 100-0006
708 actions found for student 100-0007
816 actions found for student 100-0008
624 actions found for student 100-0009
1183 actions found for student 100-0010
1132 actions found for student 100-0011
799 actions found for student 100-0012
854 actions found for student 100-0013
836 actions found for student 100-0014
631 actions found for student 100-0016
1291 actions found for student 100-0017
585 actions found for student 100-0018
873 actions found for student 100-0019
897 actions found for student 100-0020
683 actions found for student 100-0021
790 actions found for student 100-0022
596 actions found for student 100-0024
575 actions found for student 100-0025
883 actions found for student 100-0026
955 actions found for student 100-0027
883 actions found for student 10

666 actions found for student 100-0319
665 actions found for student 100-0320
1085 actions found for student 100-0321
ERROR: in line 223610, time is not monotonic! current action RESET. total action found 1021. student 100-0322
1125 actions found for student 100-0322
429 actions found for student 100-0323
339 actions found for student 100-0324
617 actions found for student 100-0335
513 actions found for student 100-0336
997 actions found for student 100-0337
981 actions found for student 100-0338
887 actions found for student 100-0339
722 actions found for student 100-0340
472 actions found for student 100-0343
739 actions found for student 100-0345
785 actions found for student 100-0346
654 actions found for student 100-0347
679 actions found for student 100-0348
760 actions found for student 100-0349
850 actions found for student 100-0350
917 actions found for student 100-0351
1172 actions found for student 100-0352
572 actions found for student 100-0354
780 actions found for student

Unnamed: 0,student_id,action,time,detail,others
0,100-0001,STUDY,0.000,AnonymousAMiddleSchool-December2010t500,SelfReport-true
1,100-0001,SOLUTION,0.000,disease-Influenza,object-Egg
2,100-0001,ADAPTATION,1.170,select-test-count,random|selected-1
3,100-0001,AUTHENTICATE,42.915,press-authenticate,100-0001
4,100-0001,PDAOPEN,116.775,mainscreen,
...,...,...,...,...,...
329916,100-0828,DIALOG,3334.410,NPC-Utterance,kim|Comebackwhenyouhaveadiagnosis!
329917,100-0828,DIALOG,3338.070,menu-choice,3337.186523|3338.079346|0.892822|choice-6|Bye.
329918,100-0828,GAMEOVER,3338.610,,
329919,100-0828,ADAPTATION,3338.610,select-reflection-prompt,random|selected-1


In [2]:
from collections import defaultdict

# creating action to feature maps: log action name -> cumulative counter feature
action_map = {
    'DIALOG': 's_dialog_turn',
    'OPEN': 's_open_door',
    'LOOKSTART': 's_view_poster',
    'PICKUP': 's_pickup_obj',
    'DROP': 's_drop_obj',
    'TESTOBJECT': 's_test_obj',
    'WORKSHEET': 's_worksheet',
    'QUIZ': 's_quiz',
    'PDAOPEN': 's_use_pda',
    'LABELING': 's_label_slide',
    'NOTES': 's_take_note',
    'BOOKREAD': 's_read_book',
    'BRYCECOMPUTER': 's_bryce_computer',
    'GAMEOVER': 's_end_game'
}

# TALK action needs to be segmented further for narrative planner
# (several characters share one feature)
action_talk_map = {
    'cur-action-talk-bryce': 's_talk_bryce',
    'cur-action-talk-teresa': 's_talk_teresa',
    'cur-action-talk-ford': 's_talk_ford_quen_rob',
    'cur-action-talk-quentin': 's_talk_ford_quen_rob',
    'cur-action-talk-robert': 's_talk_ford_quen_rob',

    'cur-action-talk-extraa': 's_talk_others',
    'cur-action-talk-extrab': 's_talk_others',
    'cur-action-talk-elise': 's_talk_others',
    'cur-action-talk-kim': 's_talk_others',

}

# all types of adaptations. Not using the mystery quiz (as this has 8 choices)
# and off-task behavior (not enough samples)
adaption_map = {
    'select-bryce-symptoms-level': 's_aes_bryce_symptoms',
    'select-teresa-symptoms-level': 's_aes_teresa_symptoms',
    'select-present-quiz': 's_aes_knowledge_quiz',
    'select-worksheet-level': 's_aes_diagnosis_feedback',
}

# start index of action triggers
adaption_trigger_map = {
    's_aes_trigger_bryce_symptoms': 0, # two actions [1, 2]
    's_aes_trigger_teresa_symptoms': 2, # three actions [3, 4, 5]
    's_aes_trigger_knowledge_quiz': 5, # two actions [6, 7]
    's_aes_trigger_diagnosis_feedback': 7 # three actions [8, 9, 10],
}

# De-duplicate the TALK features while keeping their first-seen order.
# list(set(...)) would order them by string hash, which changes between kernel
# runs and therefore changes the column order of the generated frames.
talk_states = list(dict.fromkeys(action_talk_map.values()))

# feature columns of the student trajectories, each starting at 0
states_student = (talk_states + ['s_worksheet_submitted'] +
                  list(action_map.values()) + list(adaption_map.values()))
init_row_student = defaultdict()
for state in states_student:
    init_row_student[state] = 0

# the narrative trajectories carry the same features plus the trigger flags
states_narrative = (talk_states + ['s_worksheet_submitted'] +
                    list(action_map.values()) + list(adaption_map.values()) +
                    list(adaption_trigger_map.keys()))
init_row_narrative = defaultdict()
for state in states_narrative:
    init_row_narrative[state] = 0


- do we need SOLUTION for s_aes_mystry_solution?
- Kim's reveal adaptation points the player towards Quentin's reveal
- removing students who restarted the game. Maybe worthwhile to include them later
- 21 students didn't take the pretest, so their NLG is None. Removing them. Maybe worthwhile to include them later

In [3]:
from copy import deepcopy
import pdb
pd.set_option("display.max_columns", None)

# Build two trajectory tables from the cleaned log:
#   - student rows: one snapshot of the cumulative feature counters per
#     recognised student action
#   - narrative rows: one row per selected adaptation, copied from the latest
#     student snapshot and labelled with the trigger flag and a uniform action id
skipping_action = set()
skipping_adaption = set()

all_inserts_student = []
all_inserts_narrative = []
# Group on the column name rather than a one-element list: with a list key,
# pandas >= 2 yields 1-tuples as group keys, which would be stored in the
# 'student_id' column instead of the plain id.
for student_id, rows in df.groupby('student_id'):
    row_insert_student = deepcopy(init_row_student)
    row_insert_student['student_id'] = student_id
    row_insert_student['step'] = 0
    row_insert_student['done'] = False

    # Where this student's rows start in the shared output lists, so that the
    # end-of-trajectory bookkeeping below never touches another student's rows.
    student_start = len(all_inserts_student)
    narrative_start = len(all_inserts_narrative)

    # this part is for narrative df only
    latest_adaptive_trigger = ''
    step_narrative = 0

    for _, row in rows.iterrows():
        if row['action'] == 'ADAPTATION':
            if row['detail'] not in adaption_map:
                skipping_adaption.add(row['detail'])
                continue

            # A trigger is only recorded after a student row has been stored,
            # so a missing trigger also means there is no state of this
            # student to copy. Fail with a clear message instead of the
            # KeyError the empty trigger name would cause further down.
            if not latest_adaptive_trigger:
                raise ValueError(
                    "student {0}: adaptation '{1}' occurred before any triggering action"
                    .format(student_id, row['detail']))

            # the last character of `others` is read as the selected option
            selected = int(row['others'][-1:])

            # only the most recent adaptation stays set in the student state
            for adapt_feature in adaption_map.values():
                row_insert_student[adapt_feature] = 0
            row_insert_student[adaption_map[row['detail']]] = selected

            # this part is for narrative df only

            # the narrative state is the student's latest stored snapshot
            row_insert_narrative = deepcopy(all_inserts_student[-1])
            for adapt_trigger_feature in adaption_trigger_map:
                row_insert_narrative[adapt_trigger_feature] = 0
            row_insert_narrative[latest_adaptive_trigger] = 1
            row_insert_narrative['step'] = step_narrative

            # converting different AES actions into uniform actions
            row_insert_narrative['action'] = adaption_trigger_map[latest_adaptive_trigger] + selected

            step_narrative += 1
            all_inserts_narrative.append(row_insert_narrative)
            continue

        if row['action'] == 'TALK':
            char = row['others'].split('|')[0]
            if char not in action_talk_map:
                print(
                    "-- ERROR TALK UNKNOWN | action: {0} | detail: {1} | others: {2} --"
                    .format(row['action'], row['detail'], row['others']))
                break
            row_insert_student[action_talk_map[char]] += 1
            action = action_talk_map[char]

        elif row['action'] == 'DIALOG':
            # only menu choices count as dialog turns; other DIALOG rows are ignored
            if row['detail'] != 'menu-choice':
                continue
            if row['others'].split('|')[-1] == 'IthinkIhaveadiagnosis':
                row_insert_student['s_worksheet_submitted'] += 1
                action = 's_worksheet_submitted'
            else:
                row_insert_student[action_map[row['action']]] += 1
                action = action_map[row['action']]

        elif row['action'] in action_map:
            row_insert_student[action_map[row['action']]] += 1
            action = action_map[row['action']]
        else:
            skipping_action.add(row['action'])
            if row['action'] == 'RESET':
                print(
                    "-- ERROR RESET happened for student {0} --".format(student_id))
            continue

        row_insert_student['action'] = action
        all_inserts_student.append(deepcopy(row_insert_student))
        row_insert_student['step'] += 1

        if action == 's_end_game':
            break

        # this part is for narrative df only
        # TODO: currently assuming the adaptation happens on the step that
        # follows its triggering action
        if action == 's_talk_bryce':
            latest_adaptive_trigger = 's_aes_trigger_bryce_symptoms'
        elif action == 's_talk_teresa':
            latest_adaptive_trigger = 's_aes_trigger_teresa_symptoms'
        elif action == 's_talk_ford_quen_rob':
            latest_adaptive_trigger = 's_aes_trigger_knowledge_quiz'
        elif action == 's_worksheet_submitted':
            latest_adaptive_trigger = 's_aes_trigger_diagnosis_feedback'

    if len(all_inserts_student) > student_start:
        # trajectories that never reached GAMEOVER get a synthetic terminal row
        if all_inserts_student[-1]['action'] != 's_end_game':
            row_insert_student['s_end_game'] = 1
            # 'step' already points at the next unused step (it is advanced
            # after every stored row), so it must not be incremented again
            row_insert_student['action'] = 's_end_game'
            all_inserts_student.append(deepcopy(row_insert_student))

        all_inserts_student[-1]['done'] = True

    if len(all_inserts_narrative) > narrative_start:
        all_inserts_narrative[-1]['done'] = True

    print('-- finished student {0} --'.format(student_id))

df_student_data = pd.DataFrame(all_inserts_student,
                               columns=['student_id', 'step'] + states_student +
                               ['action', 'done'])

df_narrative_data = pd.DataFrame(all_inserts_narrative,
                               columns=['student_id', 'step'] + states_narrative +
                               ['action', 'done'])


print("Actions skipped", skipping_action)
print("Adaptions skipped", skipping_adaption)

NameError: name 'all_inserts_narrative' is not defined

In [None]:
# merging scores in the feature list

df_score = pd.read_csv('../raw data/training-survey-corpus.csv')
df_score = df_score[[
    'Student ID', 'Gender', 'Game-Playing Frequency', 'Content Pre Total',
    'Normalized Learning Gain'
]].rename(
    columns={
        'Student ID': 'student_id',
        'Gender': 's_static_gender',
        'Game-Playing Frequency': 's_static_game_freq',
        'Content Pre Total': 's_static_pretest',
        'Normalized Learning Gain': 'nlg'
    })

# few nlg are NONE!
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame that was read from disk
df_score = df_score.loc[df_score['nlg'] != 'None'].copy()
df_score['nlg'] = pd.to_numeric(df_score['nlg'])


# making sure we have nlgs for all students
scored_students = df_score['student_id'].unique()
df_student_data = df_student_data.loc[
    df_student_data['student_id'].isin(scored_students)].reset_index(drop=True)
df_narrative_data = df_narrative_data.loc[
    df_narrative_data['student_id'].isin(scored_students)].reset_index(drop=True)

df_score = df_score.loc[df_score['student_id'].isin(
    df_student_data['student_id'].unique())].copy()


# splitting nlg based on median: below the median -> -100, at or above -> 100
mid_nlg = df_score['nlg'].median()
# both masks are taken from the original values before anything is
# overwritten, so the first assignment cannot influence the second
below_median = df_score['nlg'] < mid_nlg
at_or_above_median = df_score['nlg'] >= mid_nlg
df_score.loc[below_median, 'nlg'] = -100
df_score.loc[at_or_above_median, 'nlg'] = 100


# Drop score columns left over from a previous run of this cell, so that
# re-running it does not produce suffixed '_x' / '_y' duplicates.
score_columns = [column for column in df_score.columns if column != 'student_id']

df_student_data = df_student_data.drop(columns=score_columns,
                                       errors='ignore').merge(df_score,
                                                              on=['student_id'],
                                                              how='left')

df_narrative_data = df_narrative_data.drop(columns=score_columns,
                                           errors='ignore').merge(df_score,
                                                                  on=['student_id'],
                                                                  how='left')

df_narrative_data

In [None]:
# Write the three processed frames to disk as pickles.
for frame, pickle_path in (
    (df_student_data, '../processed_data/student_trajectories.pkl'),
    (df_narrative_data, '../processed_data/narrative_trajectories.pkl'),
    (df_score, '../processed_data/scores.pkl'),
):
    frame.to_pickle(pickle_path)

In [None]:
# The median split recodes 'nlg' to -100 (below the median) and 100 (at or
# above it), so the two groups are selected with those values; comparing
# against 0 / 1 matches no student at all.
x = df_score.loc[df_score['nlg'] == -100, 'student_id'].unique()
y = df_score.loc[df_score['nlg'] == 100, 'student_id'].unique()

# final score of each student: the integer after the last '-' in the
# 'detail' field of their FINALSCORE rows
l = df.loc[(df['student_id'].isin(x)) & (df['action'] == 'FINALSCORE')].apply(
    lambda z: int(z['detail'].split('-')[-1]), axis=1)
h = df.loc[(df['student_id'].isin(y)) & (df['action'] == 'FINALSCORE')].apply(
    lambda z: int(z['detail'].split('-')[-1]), axis=1)

In [None]:
# NumPy is imported here because no earlier visible cell brings `np` into scope.
import numpy as np

# (mean, median, std, max, min) of the final scores in `l`
np.mean(l), np.median(l), np.std(l), np.max(l), np.min(l)

In [None]:
# NumPy is imported here because no earlier visible cell brings `np` into scope.
import numpy as np

# (mean, median, std, max, min) of the final scores in `h`
np.mean(h), np.median(h), np.std(h), np.max(h), np.min(h)