# Preprocessing

In [None]:
import pandas as pd

# Parse the raw, pipe-delimited game log into one record per log line.
# Layout consumed below: student_id|action|time|detail|others...
# Session-start entries carry no time field, so one is inserted for them.
with open('../raw data/training-log-corpus.log', 'r') as f:
    logs = f.readlines()

last_time = 0.0
last_student = ''
action_count = 0
l = []  # parsed records; name kept in case other cells refer to it
for i, line in enumerate(logs):
    tokens = line.rstrip('\n').split('|')

    student_id = tokens[0]
    if len(student_id) != 8:
        # Abort with a real error; the previous `exit(0)` signalled success
        # to the caller even though the corpus is malformed.
        raise ValueError(
            f"ERROR: in line {i}, student id length should be 8, found {len(student_id)}!"
        )

    if len(tokens) < 2:
        raise ValueError(f"ERROR: in line {i}, no action field found!")
    action = tokens[1]

    # Starting entries do not have a time: the third field is then missing
    # or not numeric. Insert 0.0 so the remaining fields keep their slots.
    try:
        time = float(tokens[2])
    except (IndexError, ValueError):
        tokens.insert(2, 0.0)
        time = 0.0

    if last_student != student_id:
        # A new student's block begins: report the finished one (there is
        # none before the very first line) and restart the bookkeeping.
        if last_student:
            print(f"{action_count} actions found for student {last_student}")
        last_time = 0.0
        action_count = 0

    # Check that time is almost monotonic (a backwards jump under one
    # time unit is tolerated).
    if last_time - time >= 1:
        print(
            f"ERROR: in line {i}, time is not monotonic! current action {action}. total action found {action_count}. student {student_id}"
        )

    last_time = time
    last_student = student_id
    action_count += 1

    detail = tokens[3] if len(tokens) > 3 else ""
    others = "|".join(tokens[4:])

    l.append({
        'student_id': student_id,
        'action': action,
        'time': time,
        'detail': detail,
        'others': others
    })

# The loop only reports a student when the next one starts, so the last
# student has to be reported here.
if last_student:
    print(f"{action_count} actions found for student {last_student}")

df = pd.DataFrame(l,
                  columns=['student_id', 'action', 'time', 'detail', 'others'])


# Students whose log contains a RESET.
# Only the first RESET of each student is considered. The share of the
# student's rows up to and including that RESET decides what is discarded:
#   * below 30%  -> the early part, from the student's first row through RESET
#   * above 70%  -> the RESET row and everything after it
#   * otherwise  -> every row of the student
print("-- removing reset issues --")
dlt_idx = []
is_reset = df['action'] == 'RESET'
student_with_reset = df.loc[is_reset, 'student_id'].unique()
for reset_student in student_with_reset:
    is_student = df['student_id'] == reset_student
    student_rows = df.loc[is_student]
    n_rows = len(student_rows)

    # Index label of this student's first RESET row.
    reset_label = df.index[is_student & is_reset][0]
    n_until_reset = len(student_rows.loc[:reset_label])
    reset_percent = round(n_until_reset / n_rows * 100, 2)

    if reset_percent < 30.0:
        # NOTE(review): .loc label slices are inclusive, so the upper bound
        # `reset_label + 1` also selects the row right after RESET when it
        # belongs to this student -- confirm this is intended.
        doomed = student_rows.loc[student_rows.index[0]:reset_label + 1]
    elif reset_percent > 70.0:
        doomed = student_rows.loc[reset_label:]
    else:
        doomed = student_rows
    doomed_labels = list(doomed.index)

    print(
        "student: {0} | total_steps: {1} | reset_at: {2}({3}%) | dropping: {4}"
        .format(reset_student, n_rows, n_until_reset, reset_percent,
                len(doomed_labels)))
    dlt_idx.extend(doomed_labels)


df = df.drop(index=dlt_idx).reset_index(drop=True)
df

In [None]:
from collections import defaultdict

# Raw log action -> student-state feature (a per-student counter).
action_map = {
    'DIALOG': 's_dialog_turn',
    'OPEN': 's_open_door',
    'LOOKSTART': 's_view_poster',
    'PICKUP': 's_pickup_obj',
    'DROP': 's_drop_obj',
    'TESTOBJECT': 's_test_obj',
    'WORKSHEET': 's_worksheet',
    'QUIZ': 's_quiz',
    'PDAOPEN': 's_use_pda',
    'LABELING': 's_label_slide',
    'NOTES': 's_take_note',
    'BOOKREAD': 's_read_book',
    'BRYCECOMPUTER': 's_bryce_computer',
    'GAMEOVER': 's_end_game'
}

# TALK actions are segmented further (by character) for the narrative planner.
action_talk_map = {
    'cur-action-talk-bryce': 's_talk_bryce',
    'cur-action-talk-teresa': 's_talk_teresa',
    'cur-action-talk-ford': 's_talk_ford_quen_rob',
    'cur-action-talk-quentin': 's_talk_ford_quen_rob',
    'cur-action-talk-robert': 's_talk_ford_quen_rob',

    'cur-action-talk-extraa': 's_talk_others',
    'cur-action-talk-extrab': 's_talk_others',
    'cur-action-talk-elise': 's_talk_others',
    'cur-action-talk-kim': 's_talk_others',
}

# All adaptation types that are used. The mystery quiz (8 choices) and
# off-task behavior (not enough samples) are left out.
adaption_map = {
    'select-bryce-symptoms-level': 's_aes_bryce_symptoms',
    'select-teresa-symptoms-level': 's_aes_teresa_symptoms',
    'select-present-quiz': 's_aes_knowledge_quiz',
    'select-worksheet-level': 's_aes_diagnosis_feedback',
}

# Offset added to an adaptation level (1-based) to obtain the uniform
# narrative action number.
narrative_trigger_map = {
    's_aes_trigger_bryce_symptoms': -1,  # two actions [0, 1]
    's_aes_trigger_teresa_symptoms': 1,  # three actions [2, 3, 4]
    's_aes_trigger_knowledge_quiz': 4,  # two actions [5, 6]
    's_aes_trigger_diagnosis_feedback': 6  # three actions [7, 8, 9]
}

# Inverse view: uniform narrative action number -> [adaptation feature, level].
narrative_trigger_map_for_env = {
    0: ['s_aes_bryce_symptoms', 1],
    1: ['s_aes_bryce_symptoms', 2],
    2: ['s_aes_teresa_symptoms', 1],
    3: ['s_aes_teresa_symptoms', 2],
    4: ['s_aes_teresa_symptoms', 3],
    5: ['s_aes_knowledge_quiz', 1],
    6: ['s_aes_knowledge_quiz', 2],
    7: ['s_aes_diagnosis_feedback', 1],
    8: ['s_aes_diagnosis_feedback', 2],
    9: ['s_aes_diagnosis_feedback', 3]
}

# Fixed numbering of the student actions.
# The numbers used to come from iterating `set(...)` of feature names; the
# iteration order of a set of strings changes between interpreter runs
# (string hash randomization), so the action ids were not reproducible.
# The order below is the one the state vectors built later in this notebook
# assume ("0-18 position matching student action numbers").
student_action_order = [
    's_view_poster',  # 0
    's_label_slide',  # 1
    's_read_book',  # 2
    's_test_obj',  # 3
    's_bryce_computer',  # 4
    's_end_game',  # 5
    's_take_note',  # 6
    's_drop_obj',  # 7
    's_dialog_turn',  # 8
    's_open_door',  # 9
    's_worksheet',  # 10
    's_use_pda',  # 11
    's_quiz',  # 12
    's_pickup_obj',  # 13
    's_talk_ford_quen_rob',  # 14
    's_talk_bryce',  # 15
    's_talk_others',  # 16
    's_talk_teresa',  # 17
    's_worksheet_submitted',  # 18
]

# Guard against the maps above and the fixed order drifting apart.
_expected_actions = (set(action_map.values())
                     | set(action_talk_map.values())
                     | {'s_worksheet_submitted'})
if (len(student_action_order) != len(_expected_actions)
        or set(student_action_order) != _expected_actions):
    raise ValueError(
        "student_action_order must list every student action exactly once")

student_trigger_map = {
    state: number for number, state in enumerate(student_action_order)
}
# Kept for compatibility: previously the running counter ended on this value.
act_num = student_trigger_map['s_worksheet_submitted']

# Zero-initialised feature rows. Plain dicts: the former `defaultdict()` had
# no default factory and therefore behaved exactly like a dict.
states_student = list(student_trigger_map.keys()) + list(adaption_map.values())
init_row_student = dict.fromkeys(states_student, 0)

states_narrative = (list(student_trigger_map.keys())
                    + list(adaption_map.values())
                    + list(narrative_trigger_map.keys()))
init_row_narrative = dict.fromkeys(states_narrative, 0)

print(student_trigger_map)
print(states_narrative)

- Do we need SOLUTION for s_aes_mystry_solution?
- Kim's reveal adaptation points the player towards Quentin's reveal.
- Removing students who restarted; it may be worthwhile to include them later.
- 21 students did not take the pretest, so their NLG (normalized learning gain) is None. Removing them for now; it may be worthwhile to include them later.

In [None]:
from copy import deepcopy
import pdb
pd.set_option("display.max_columns", None)

# Build two trajectory tables from the cleaned log `df`:
#   * student rows   - one row per recognised student action; each row holds
#                      the running action counters and the latest adaptation
#                      level *after* that action.
#   * narrative rows - one row per recognised ADAPTATION entry; each row is a
#                      copy of the student's most recent student row plus a
#                      one-hot flag for the trigger, with the adaptation
#                      itself as the action.
skipping_action = set()
skipping_adaption = set()

# Student actions that select which narrative trigger the next adaptation is
# attributed to.
adaptive_trigger_for_action = {
    's_talk_bryce': 's_aes_trigger_bryce_symptoms',
    's_talk_teresa': 's_aes_trigger_teresa_symptoms',
    's_talk_ford_quen_rob': 's_aes_trigger_knowledge_quiz',
    's_worksheet_submitted': 's_aes_trigger_diagnosis_feedback',
}

all_inserts_student = []
all_inserts_narrative = []
# Group by the column *name*: grouping by the list ['student_id'] makes
# newer pandas yield 1-tuples as keys, which would store ('id',) as the
# student id and break the later isin()/merge on 'student_id'.
for student_id, rows in df.groupby('student_id'):
    row_insert_student = deepcopy(init_row_student)
    row_insert_student['student_id'] = student_id
    row_insert_student['step'] = 0
    row_insert_student['done'] = False

    # Rows of the current student only, so "last row" look-ups can never
    # reach into the previous student's trajectory.
    student_inserts = []
    narrative_inserts = []

    # this part is for narrative df only
    latest_adaptive_trigger = ''
    step_narrative = 0

    for _, row in rows.iterrows():
        if row['action'] == 'ADAPTATION':
            if row['detail'] not in adaption_map:
                skipping_adaption.add(row['detail'])
                continue

            # None of the selected adaptations can be a student's first
            # entry; a triggering student action must precede it. This used
            # to fail with an opaque KeyError on ''.
            if not latest_adaptive_trigger:
                raise ValueError(
                    "adaptation '{0}' for student {1} occurred before any "
                    "triggering student action".format(row['detail'],
                                                       student_id))

            # The adaptation level is the last character of `others`.
            level = int(row['others'][-1:])

            # Only the most recent adaptation stays set in the student state.
            for adapt_feature in adaption_map.values():
                row_insert_student[adapt_feature] = 0
            row_insert_student[adaption_map[row['detail']]] = level

            # this part is for narrative df only
            # NOTE(review): the trigger comes from the last triggering
            # student action, not from row['detail'] -- confirm the two
            # always agree in the corpus.
            row_insert_narrative = deepcopy(student_inserts[-1])
            for adapt_trigger_feature in narrative_trigger_map:
                row_insert_narrative[adapt_trigger_feature] = 0
            row_insert_narrative[latest_adaptive_trigger] = 1
            row_insert_narrative['step'] = step_narrative

            # converting different AES actions into uniform actions
            row_insert_narrative['action'] = (
                narrative_trigger_map[latest_adaptive_trigger] + level)
            row_insert_narrative['action_name'] = (
                latest_adaptive_trigger + '_' + str(level))
            step_narrative += 1
            narrative_inserts.append(row_insert_narrative)
            continue

        if row['action'] == 'TALK':
            char = row['others'].split('|')[0]
            if char not in action_talk_map:
                print(
                    "-- ERROR TALK UNKNOWN | action: {0} | detail: {1} | others: {2} --"
                    .format(row['action'], row['detail'], row['others']))
                break
            action = action_talk_map[char]

        elif row['action'] == 'DIALOG':
            # Only menu choices count as dialog turns.
            if row['detail'] != 'menu-choice':
                continue
            if row['others'].split('|')[-1] == 'IthinkIhaveadiagnosis':
                action = 's_worksheet_submitted'
            else:
                action = action_map[row['action']]

        elif row['action'] in action_map:
            action = action_map[row['action']]

        else:
            skipping_action.add(row['action'])
            if row['action'] == 'RESET':
                print(
                    "-- ERROR RESET happened for student {0} --".format(student_id))
            continue

        row_insert_student[action] += 1
        row_insert_student['action'] = student_trigger_map[action]
        row_insert_student['action_name'] = action
        student_inserts.append(deepcopy(row_insert_student))
        row_insert_student['step'] += 1

        if action == 's_end_game':
            break

        # this part is for narrative df only
        latest_adaptive_trigger = adaptive_trigger_for_action.get(
            action, latest_adaptive_trigger)

    # Close trajectories that never logged GAMEOVER with a synthetic end row.
    if not student_inserts or student_inserts[-1]['action_name'] != 's_end_game':
        # Was written to the misspelt key 's_eng_game', which the DataFrame
        # column selection below silently dropped.
        row_insert_student['s_end_game'] += 1
        # 'step' already points at the next free step (it is advanced after
        # every append), so it is not incremented again here.
        row_insert_student['action'] = student_trigger_map['s_end_game']
        row_insert_student['action_name'] = 's_end_game'
        student_inserts.append(deepcopy(row_insert_student))

    student_inserts[-1]['done'] = True
    if narrative_inserts:  # a student may have no recognised adaptation
        narrative_inserts[-1]['done'] = True

    all_inserts_student.extend(student_inserts)
    all_inserts_narrative.extend(narrative_inserts)

    print('-- finished student {0} --'.format(student_id))

df_student_data = pd.DataFrame(all_inserts_student,
                               columns=['student_id', 'step'] + states_student +
                               ['action', 'action_name', 'done'])

df_narrative_data = pd.DataFrame(all_inserts_narrative,
                               columns=['student_id', 'step'] + states_narrative +
                               ['action', 'action_name', 'done'])


print("Actions skipped", skipping_action)
print("Adaptions skipped", skipping_adaption)

In [None]:
# Merge survey scores into the trajectories and attach the terminal reward.

df_score = pd.read_csv('../raw data/training-survey-corpus.csv')
df_score = df_score[[
    'Student ID', 'Gender', 'Game-Playing Frequency', 'Content Pre Total',
    'Normalized Learning Gain'
]].rename(
    columns={
        'Student ID': 'student_id',
        'Gender': 's_static_gender',
        'Game-Playing Frequency': 's_static_game_freq',
        'Content Pre Total': 's_static_pretest',
        'Normalized Learning Gain': 'reward'
    })

# A few rewards are missing. Depending on the pandas version the CSV text
# "None" arrives either as the string 'None' or already parsed as NaN, so
# both forms are dropped; anything else non-numeric still raises below.
reward_missing = df_score['reward'].isna() | (df_score['reward'] == 'None')
df_score = df_score.loc[~reward_missing].copy()
df_score['reward'] = pd.to_numeric(df_score['reward'])


# Keep only students that have both a trajectory and a reward.
students_with_reward = df_score['student_id'].unique()

df_student_data = df_student_data.loc[
    df_student_data['student_id'].isin(students_with_reward)]
df_student_data = df_student_data.reset_index(drop=True)

df_narrative_data = df_narrative_data.loc[
    df_narrative_data['student_id'].isin(students_with_reward)]
df_narrative_data = df_narrative_data.reset_index(drop=True)

df_score = df_score.loc[df_score['student_id'].isin(
    df_student_data['student_id'].unique())].copy()


# Binarise the reward at the median: +100 at or above it, -100 below.
# The mask is computed once on the original values, so the result does not
# depend on the order in which the two classes are written.
mid_reward = df_score['reward'].median()
at_or_above_median = df_score['reward'] >= mid_reward
df_score['reward'] = at_or_above_median.map({True: 100.0, False: -100.0})


df_student_data = df_student_data.merge(df_score,
                                        on=['student_id'],
                                        how='left')

df_narrative_data = df_narrative_data.merge(df_score,
                                            on=['student_id'],
                                            how='left')

# The reward is paid only on the final row of each trajectory.
df_narrative_data.loc[~df_narrative_data['done'].astype(bool), 'reward'] = 0
df_student_data.loc[~df_student_data['done'].astype(bool), 'reward'] = 0

df_student_data.head(50)

In [None]:
import numpy as np
# Pack the feature columns of every row into a single numpy array ('state').

# Columns shared by the student and the narrative state, in vector order.
shared_state_columns = [
    # 0-18 position matching student action numbers
    's_view_poster',
    's_label_slide',
    's_read_book',
    's_test_obj',
    's_bryce_computer',
    's_end_game',  # 5
    's_take_note',
    's_drop_obj',
    's_dialog_turn',
    's_open_door',
    's_worksheet',
    's_use_pda',
    's_quiz',
    's_pickup_obj',
    's_talk_ford_quen_rob',  # 14
    's_talk_bryce',  # 15
    's_talk_others',  # 16
    's_talk_teresa',  # 17
    's_worksheet_submitted',  # 18

    # 19-22 position
    's_aes_bryce_symptoms',
    's_aes_teresa_symptoms',
    's_aes_knowledge_quiz',
    's_aes_diagnosis_feedback',

    # 23-25 position
    's_static_gender',
    's_static_game_freq',
    's_static_pretest',

    # 26 position (newly added)
    'step',
]

# Appended to the shared columns for the narrative state only.
narrative_extra_state_columns = [
    # 27-30 position
    's_aes_trigger_bryce_symptoms',
    's_aes_trigger_teresa_symptoms',
    's_aes_trigger_knowledge_quiz',
    's_aes_trigger_diagnosis_feedback',
]


def _row_to_state(row, column_names):
    """Return the values of `column_names` from `row` as one numpy array."""
    return np.array([row[name] for name in column_names])


df_student_data['state'] = df_student_data.apply(
    _row_to_state, axis=1, args=(shared_state_columns,))

df_narrative_data['state'] = df_narrative_data.apply(
    _row_to_state,
    axis=1,
    args=(shared_state_columns + narrative_extra_state_columns,))


# df_student_data.to_pickle('../processed_data/student_trajectories.pkl')
# df_narrative_data.to_pickle('../processed_data/narrative_trajectories.pkl')
# df_score.to_pickle('../processed_data/scores.pkl')

In [None]:
# Spot check: display the state vector of the fourth row (position 3) of the
# student trajectories.
df_student_data['state'].iloc[3]

In [None]:
# Spot check: display the state vector of the fourth row (position 3) of the
# narrative trajectories.
df_narrative_data['state'].iloc[3]

In [None]:
# Display the ordered list of feature names used as columns of the narrative
# trajectories.
states_narrative

# random

In [12]:
# Per student, count the TALK log rows whose 'others' field contains the
# Kim talk marker.
is_talk_row = df['action'] == 'TALK'
mentions_kim = df['others'].str.contains('cur-action-talk-kim')
df.loc[is_talk_row & mentions_kim].groupby(['student_id']).count()

Unnamed: 0_level_0,action,time,detail,others
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100-0001,15,15,15,15
100-0003,5,5,5,5
100-0004,1,1,1,1
100-0005,7,7,7,7
100-0006,10,10,10,10
...,...,...,...,...
100-0514,6,6,6,6
100-0515,1,1,1,1
100-0516,3,3,3,3
100-0827,5,5,5,5
