In [2]:
import os
import pandas
import copy
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
import xml.etree.cElementTree as et 




In [3]:
%%javascript
// Set the notebook output area's auto_scroll_threshold to 220.
// NOTE(review): presumably the number of output lines after which a cell's output
// becomes scrollable, so long results stay fully visible — confirm against the front end.
IPython.OutputArea.auto_scroll_threshold = 220;

<IPython.core.display.Javascript object>

In [4]:
# All possible error types made by the students.
# Each key is the 4-character signature that match_steps builds for an incorrect step:
# S_current, I_current, S_downstream, I_downstream (in that order, each "0" or "1").
_ERROR_LABEL_GROUPS = (
    ("correct", ("1100", "1101")),
    ("incorrect", ("1000",)),
    ("misapplied", ("1001",)),
    ("out of graph", ("0000", "0001", "0100", "0101")),
    ("correct repeat", ("1110",)),
    ("repeat", ("1010", "1011", "1111")),
    ("where error", ("0110", "0111")),
    ("when error", ("0011",)),
    ("wild error", ("0010",)),
)

# Flatten the groups into the signature -> label lookup used by match_steps.
ERROR_TYPES = {
    signature: label
    for label, signatures in _ERROR_LABEL_GROUPS
    for signature in signatures
}

In [5]:
def transaction_file_to_df(path):
    ''' Read in a tab-separated transaction file

    Input: path to the transaction log (tab-delimited text)
    Output: a dataframe of the log with carriage returns removed from the cell values
    and trailing whitespace stripped from the column names
    '''
    # Only '\n' ends a record here, so any '\r' stays in the parsed headers/values
    # and is cleaned up by the two steps below.
    raw = pandas.read_csv(path, sep='\t', lineterminator='\n', skip_blank_lines=True)
    without_cr = raw.replace({r'\r': ''}, regex=True)
    tidy_names = {name: name.rstrip() for name in without_cr.keys()}
    return without_cr.rename(index=int, columns=tidy_names)

In [6]:
def extract_from_brd(path):
    '''
    extract_from_brd: returns a dataframe with information of all edges (correct steps) in brd file
    Input: path to one brd (XML) file
    Output: a dataframe with one row per <edge> element. Columns match_s, match_a, match_i and
    match_actor hold the text of the edge's Selection/Action/Input matcher parameters and Actor;
    match_ID, match_source and match_dest hold the edge's uniqueID, sourceID and destID as numbers.
    Raises AttributeError if an edge lacks one of those elements.
    '''
    dfcols = ["match_s", "match_a", "match_i", "match_actor", "match_ID", "match_source", "match_dest"]
    id_cols = ['match_ID', 'match_source', 'match_dest']
    matchers = './actionLabel/matchers/'

    # Collect plain rows and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame again for every edge.
    records = []
    for e in et.parse(path).findall(".//edge"):
        records.append([
            e.find(matchers + 'Selection/matcher/matcherParameter').text,
            e.find(matchers + 'Action/matcher/matcherParameter').text,
            e.find(matchers + 'Input/matcher/matcherParameter').text,
            e.find(matchers + 'Actor').text,
            e.find('./actionLabel/uniqueID').text,
            e.find('./sourceID').text,
            e.find('./destID').text,
        ])

    df_xml = pandas.DataFrame(records, columns=dfcols)

    # Convert the identifiers once, after all edges are read. With no edges the
    # columns are left as they are, as before.
    if records:
        for col in id_cols:
            df_xml[col] = pandas.to_numeric(df_xml[col])

    return df_xml
    





In [7]:
'''
update_input, update_action, update_selection: clean the SAI of edges in brds so they match the SAI of students in log
Input: one row in the brd dataframe before cleaning
Output: one row in the brd dataframe with cleaned SAI
'''

# clean the edge's input in humans brds
def update_input(row):
    '''Return the cleaned input for one brd row: 'v' becomes the "...: false" sentence,
    'x' stays 'x', and any other value is returned as a string.'''
    brd_input = row['match_i']
    if brd_input == 'v':
        return "I need to convert these fractions before solving.: false"
    # 'x' is passed through unchanged; it is not translated to a ": true" sentence.
    return 'x' if brd_input == 'x' else str(brd_input)
    
    
# clean the edge's action in humans brd
def update_action(row):
    '''Return the row's action when it is 'UpdateTextArea' or 'ButtonPressed'; otherwise None.'''
    for known_action in ('UpdateTextArea', 'ButtonPressed'):
        if row['match_a'] == known_action:
            return known_action
    return None
        
        
# clean the edge's selection in humans brd
def update_selection(row):
    '''
    Translate the selection name of one brd row into the name used in the student log.

    Input: one row of the brd dataframe; only row['match_s'] is read
    Output: 'done' for 'done'; 'check_convert' when the character at index 10 is '8';
    otherwise 'num'/'den' (character at index 13 is '0'/'1') followed by '3'/'4'/'5'
    (character at index 10 is '4'/'5'/'6')
    Raises: ValueError when the name does not fit any of those cases. (Previously this
    surfaced as an UnboundLocalError or IndexError that did not name the selection.)
    '''
    # NOTE(review): presumably names look like 'JCommTable4.R0C0' (table digit at index 10,
    # row digit at index 13) -- confirm against the brd files.
    box_by_digit = {'4': '3', '5': '4', '6': '5'}
    pos_by_digit = {'0': 'num', '1': 'den'}

    selection = row['match_s']
    if selection == 'done':
        return 'done'

    if not isinstance(selection, str) or len(selection) < 11:
        raise ValueError("Unrecognised selection name in brd edge: %r" % (selection,))
    if selection[10] == '8':
        return 'check_convert'

    box = box_by_digit.get(selection[10])
    pos = pos_by_digit.get(selection[13]) if len(selection) > 13 else None
    if box is None or pos is None:
        raise ValueError("Unrecognised selection name in brd edge: %r" % (selection,))
    return pos + box
           


def clean_extract(extract):
    '''
    clean_extract: clean variable names in CTAT interface so they match the ones used by human students
    Input: original brd dataframe from extract_from_brd
    Output: a copy of the brd dataframe with three leading columns match_s_new, match_a_new and
    match_i_new holding the cleaned selection, action and input of every row
    '''
    cleaned = extract.copy()
    cleaners = (
        ('match_i_new', update_input),
        ('match_a_new', update_action),
        ('match_s_new', update_selection),
    )

    # Each placeholder goes in at position 0, so the final order is s, a, i.
    for column, _ in cleaners:
        cleaned.insert(0, column, np.nan)

    # Fill the placeholders row by row: selection first, then action, then input.
    for column, cleaner in reversed(cleaners):
        cleaned[column] = cleaned.apply(cleaner, axis=1)

    return cleaned



def get_correct_transaction(transaction):
    '''
    get_correct_transaction: return only student-performed SAIs that are correct
    Input: original transactions of all problems from one student
    Output: the rows whose Outcome is 'CORRECT', whose Action is not 'setVisible'
    and whose Input is not '+'
    '''
    keep = transaction.Outcome == 'CORRECT'
    keep &= transaction.Action != 'setVisible'
    keep &= transaction.Input != '+'
    return transaction[keep]



def check_transactions(df, graph):
    '''
    check_transactions: filter out invalid transactions. If transactions end incorrectly, return False
    Input:
        -df: student slice, a dataframe of a series of transactions
        -graph: graph generated from the brd dataframe (not used by this check)
    Output: False when the Outcome of the last row is 'INCORRECT' or 'HINT', True otherwise
    '''
    final_outcome = df['Outcome'].iloc[-1]
    return final_outcome not in ('INCORRECT', 'HINT')
    
    
    



In [8]:
def clean_transaction(df):
    '''
    clean_transaction: remove correct steps in transactions that are not in the brds
    Input: original student slice
    Output: the slice without the LAST row whose Input is
    'I need to convert these fractions before solving.: false' and whose Outcome is 'CORRECT';
    the slice is returned as given when there is no such row
    '''
    hits = [
        label
        for label, step in df.iterrows()
        if step['Input'] == 'I need to convert these fractions before solving.: false'
        and step['Outcome'] == 'CORRECT'
    ]

    # -1 doubles as the "nothing found" marker, so a row labelled -1 is never dropped.
    if not hits or hits[-1] == -1:
        return df
    return df.drop([hits[-1]])





In [9]:
def get_graph_from_extract(extract):
    '''
    get_graph_from_extract: return a graph created from brd dataframe
    Input: brd dataframe with columns match_ID, match_source and match_dest
    Output: a dict whose keys are the brd edges as (ID, source, dest) tuples and whose values
    are lists of the downstream neighbouring edges, i.e. the edges whose source equals the
    key's dest
    '''
    edges = set()
    graph = {}
    for _, brd_row in extract.iterrows():
        edge = (brd_row['match_ID'], brd_row['match_source'], brd_row['match_dest'])
        edges.add(edge)
        graph[edge] = []

    # An edge's successors are all edges that start at the state where it ends.
    for edge, successors in graph.items():
        successors.extend(other for other in edges if other[1] == edge[2])

    return graph


def get_first_correct_IDs(row, extract):
    '''
    Return every brd edge whose cleaned SAI equals the SAI of one student step.

    Input:
        -row: one student transaction with 'Selection', 'Action' and 'Input'
        -extract: cleaned brd dataframe (match_s_new, match_a_new, match_i_new columns)
    Output: list of (ID, source, dest) tuples, in the row order of extract
    '''
    S, A, I = row['Selection'], row['Action'], row['Input']
    return [
        (edge['match_ID'], edge['match_source'], edge['match_dest'])
        for _, edge in extract.iterrows()
        if S == edge['match_s_new'] and A == edge['match_a_new'] and I == edge['match_i_new']
    ]



def get_node_SAIs(graph, cleaned_extract):
    '''
    get_node_SAIs: return a dictionary of SAI for each unique edge of the graph
    Input:
        -graph: graph generated from the brd dataframe (keys are (ID, source, dest) tuples)
        -cleaned_extract: cleaned brd dataframe from clean_extract
    Output: a dictionary mapping each graph key that has a row in cleaned_extract to that row's
    (match_s_new, match_a_new, match_i_new); when several rows match, the last one is used
    '''
    brd_rows = [brd_row for _, brd_row in cleaned_extract.iterrows()]

    sai_by_edge = {}
    for (ID, s, d) in graph:
        matching = [
            brd_row for brd_row in brd_rows
            if ID == brd_row['match_ID'] and s == brd_row['match_source'] and d == brd_row['match_dest']
        ]
        if matching:
            last = matching[-1]
            sai_by_edge[(ID, s, d)] = (last['match_s_new'], last['match_a_new'], last['match_i_new'])
    return sai_by_edge



def find_beginnings(graph):
    ''' Find all starting nodes in a graph: the keys that appear in no neighbour list '''
    return {
        candidate
        for candidate in graph
        if not any(candidate in successors for successors in graph.values())
    }
   
    
    
def find_current_index(correct, incorrect):
    '''
    find_current_index: for student's incorrect steps, identify the next correct step that the
    student is working toward
    Input:
        -correct: list of indices for correct steps for a student slice
        -incorrect: list of indices for incorrect steps for a student slice
    Output: a dictionary with keys being incorrect indices and values being the index of the
    first correct step that is not smaller than the key. Incorrect indices larger than the last
    correct index are mapped to the last correct index. Neither input list is modified.
    '''
    res = {}

    # Trailing incorrect steps (after the last correct one) have no next correct step.
    remaining = len(incorrect)
    while remaining > 0 and incorrect[remaining - 1] > correct[-1]:
        res[incorrect[remaining - 1]] = correct[-1]
        remaining -= 1

    # Walk both lists forward; `cursor` never moves back.
    cursor = 0
    for step in incorrect[:remaining]:
        while cursor < len(correct) and step > correct[cursor]:
            cursor += 1
        res[step] = correct[cursor]
    return res



def find_last_index(correct, incorrect):
    '''
    find_last_index: for student's incorrect steps, identify the immediate last step
    that the student has done correct
    Input:
        -correct: list of indices for correct steps for a student slice
        -incorrect: list of indices for incorrect steps for a student slice
    Output: a dictionary with keys being incorrect indices and values being the index of the
    last correct step that is not larger than the key. Incorrect indices smaller than the first
    correct index get no entry. Neither input list is modified.
    '''
    # Skip the leading incorrect steps that come before any correct step.
    first_kept = 0
    while first_kept < len(incorrect) and incorrect[first_kept] < correct[0]:
        first_kept += 1

    res = {}
    newest_first = correct[::-1]
    cursor = 0
    # Walk both lists from the end; `cursor` only moves toward older correct steps.
    for step in reversed(incorrect[first_kept:]):
        while step < newest_first[cursor]:
            cursor += 1
        res[step] = newest_first[cursor]
    return res



def search(graph, start):
    '''
    search: return all downstream edges from a certain starting edge in brd graph using bfs
    Input:
        -graph: graph generated from the brd dataframe
        -start: an edge to start searching downstream with
    Output: a list that begins with start itself and continues with every edge reachable from
    it, in the order of bfs; None when start is not a key of graph
    '''
    if start not in graph:
        return None

    order = []
    frontier = [start]
    while frontier:
        current = frontier.pop(0)
        if current in order:
            continue
        order.append(current)
        # Queue the neighbours not visited yet (set difference, as before, so the order is unchanged).
        frontier += set(graph[current]) - set(order)
    return order



def check_downstream(graph, up, down):
    '''
    check_downstream: return true if an edge is in downstream
    Input:
        -graph: graph generated from the brd dataframe
        -up: the node to compare downstream with
        -down: the node to check
    Output: True if down is in the result of search(graph, up) (which includes up itself),
    False otherwise
    '''
    reachable = search(graph, up)
    return down in reachable



def first_match(df, lst, d, last_node, graph):
    '''
    first_match: for student's correct step, return first edge in brd graph that matches it
    Input:
        -df: a single row of one correct step made by student ('Selection', 'Action', 'Input')
        -lst: a list returned from search that contains the candidate edges, in the order
        in which they should be tried
        -d: dictionary returned from get_node_SAIs
        -last_node: the edge in brd that corresponds to last matched correct step made by student
        -graph: graph generated from the brd dataframe
    Output: the first edge of lst that is reachable from last_node (last_node itself counts)
    and whose SAI equals the student's SAI. If could not find one, return None
    '''
    s_c, a_c, i_c = df['Selection'], df['Action'], df['Input']

    # The edges reachable from last_node are the same for every candidate, so run the
    # bfs once here instead of once per candidate through check_downstream.
    reachable = search(graph, last_node)

    for node in lst:
        if node in reachable:
            (s_l, a_l, i_l) = d[node]
            if (s_c, a_c, i_c) == (s_l, a_l, i_l):
                return node
    return None



        


In [10]:

'''
match_steps: match one students' SAI with cleaned brd file, update the new four columns
with binary values in dataframe 
Input: 
    -one_student: all step slices of a student including all problems in correct time order
    -cleaned_extract: cleaned brd from clean_extract
    -graph: graph generated from the brd dataframe
Output: a copy of student slice dataframe with new columns of binary values 

'''

def match_steps(one_student, cleaned_extract, graph):
    '''
    Match the steps of one student slice against the brd graph and annotate a copy of the slice.

    Input:
        -one_student: dataframe of transactions; the columns 'Selection', 'Action', 'Input',
        'Outcome' and 'KC (Field)' are read
        -cleaned_extract: cleaned brd dataframe from clean_extract
        -graph: dict mapping each brd edge (ID, source, dest) to the list of its downstream
        neighbouring edges
    Output: a copy of one_student with the extra columns 'SAI', 'node', 'downstream', 'd_nodes',
    'S_current', 'I_current', 'S_downstream', 'I_downstream', 'KC_toward' and 'error_type'.
    If the correct steps cannot be matched to the graph, a message is printed and the copy is
    returned with all of those columns still None.
    '''
    match = one_student.copy()
    match['SAI'], match['node'], match['downstream'], match['d_nodes'] = None, None, None, None
    match['S_current'], match['I_current'], match['S_downstream'], match['I_downstream'] = None, None, None, None
    
    
    #Add new columns error types and KC to df
    match['KC_toward'], match['error_type'] = None, None
    
    
    correct = get_correct_transaction(match) 
    correct_queue = []
    all_first_nodes = find_beginnings(graph)
    sai_dict = get_node_SAIs(graph, cleaned_extract)
    
    # Queue of [index label, row] for every correct transaction, in dataframe order
    for _,table_row in correct.iterrows():
        correct_queue.append([_, table_row])

        
    # Match correct steps:    
    # for loop because might start with multiple nodes in brd
    # NOTE(review): can_match and correct_transaction are only assigned inside this loop, so
    # if find_beginnings returns an empty set the check after the loop fails on an unbound name.
    for begin in all_first_nodes:
        
        SAI_d = {}    # (s, a, i) -> index label of the first correct step with that SAI
        done = set()  # SAIs already matched for this starting edge
        
        last_node = begin
        correct_order = search(graph, begin)  # edges reachable from begin, bfs order
        correct_transaction = {}              # index label of a correct step -> matched edge
        correct_queue_copy = copy.deepcopy(correct_queue)
        can_match = True
        
        
        # Stops when either the correct steps or the remaining candidate edges run out
        while correct_queue_copy and correct_order:

            n = correct_queue_copy.pop(0)
            next_index, next_correct = n[0], n[1]
            
            s, a, i = next_correct['Selection'], next_correct['Action'], next_correct['Input']
            
            
            if (s,a,i) in done:
            # deal with duplications: a repeated SAI reuses the edge matched for its
            # first occurrence (last_node and correct_order are left untouched)
            
                first_index = SAI_d[(s,a,i)]
                correct_transaction[next_index] = correct_transaction[first_index]
                
                
            
            else:
                done.add((s,a,i))
                SAI_d[(s,a,i)]= next_index
    
                f_match = first_match(next_correct, correct_order, sai_dict, last_node, graph)

            
                # No remaining edge fits this step: give up on this starting edge
                if f_match == None:
                    can_match = False
                    break
            
                
                # Drop every candidate up to and including the matched edge, so later
                # steps can only match edges that come after it in correct_order
                while correct_order[0] != f_match:
                    correct_order.pop(0)
                correct_order.pop(0)
                    
                correct_transaction[next_index] = f_match
                last_node = f_match
        

        # The first starting edge that matches all processed steps wins
        if can_match == True: break
            
    
    if can_match == False:  
        print("Can not match for this step_slice", match)
        return match
    
    
    else:
        
        # Index labels of the correct steps that were matched to an edge
        index = sorted(list(correct_transaction.keys()))
       
        for i in index:
            match.at[i,'SAI'] = 'correct'
            match.at[i, 'node'] = correct_transaction[i]        
            match.at[i, 'S_current'], match.at[i, 'I_current'] = 1,1
       
        incorrect = list((match[match['Outcome'] == 'INCORRECT']).index)

        # match incorrect steps 
        if incorrect: 
    
            # incorrect index -> index of the correct step being worked toward
            cur_index = find_current_index(index, incorrect)
            # incorrect index -> index of the last correct step before it (no entry if none)
            last_index = find_last_index(index, incorrect)


            for i in incorrect:
            
                match.at[i ,'SAI'] = 'incorrect'
                
                KC = match.at[cur_index[i], 'KC (Field)'] #KC of the node working towards 
                match.at[i, 'KC_toward'] = KC


                # Find current steps for incorrect transactions:
                # the starting edges when no correct step precedes this one, otherwise
                # the successors of the edge matched by the last correct step
                if(i not in last_index.keys()):
                    match.at[i, 'node'] = list(find_beginnings(graph))

                if(i in last_index.keys()):

                    n_l = match.at[last_index[i], 'node'] 
                    match.at[i, 'node'] = graph[n_l]

                
                
                sel, inp = match.at[i, 'Selection'], match.at[i, 'Input']
                s_found_c, i_found_c = 0, 0
                
                # S_current / I_current: does any current candidate edge have the student's
                # selection / input? match_s_c keeps the last candidate whose selection matched.
                match_s_c = None 
                for c in match.at[i, 'node']:
                    (s_c, a_c, i_c) = sai_dict[c]
                    if s_c == sel: 
                        s_found_c = 1
                        match_s_c = c
                        
                    if i_c == inp: i_found_c = 1
                match.at[i, 'S_current'], match.at[i, 'I_current'] = s_found_c, i_found_c
                        
                        
                downstream = set()
                # downstream for a particular branch: the successors of the candidate whose
                # selection matched, or of all candidates when none matched
                if match_s_c != None:
                    down = graph[match_s_c]
                    for d in down: 
                        downstream.add(d)
                        
                else:
                
                    for node in match.at[i, 'node']:
                        down = graph[node]
                        for d in down: 
                            downstream.add(d)

                match.at[i, 'downstream'] = list(downstream)
                
                
                
 

                # d_nodes: the downstream edges plus everything reachable from them
                d_nodes = set()
                for n in match.at[i, 'downstream']:
                    d_nodes = d_nodes.union(search(graph, n))
     
                match.at[i, 'd_nodes'] = d_nodes
                # S_downstream / I_downstream: same test as above, over d_nodes
                s_found_d, i_found_d = 0, 0
                for dn in d_nodes:
                    (s_d, a_d, i_d) = sai_dict[dn]
                    if(sel == s_d):
                        s_found_d = 1    
                    if(inp == i_d):
                        i_found_d = 1

   
                match.at[i, 'S_downstream'], match.at[i, 'I_downstream']  = s_found_d, i_found_d
            

                # The four flags, concatenated in this order, are the key into ERROR_TYPES
                e_type = str(s_found_c) + str(i_found_c) + str(s_found_d) + str(i_found_d) 
                match.at[i, 'error_type'] = ERROR_TYPES[e_type]
                     
            
        return match

In [11]:
def original_df(one_student):
    '''
    original_df: For cases that end in incorrect or hint, return the original df with new blank columns
    Input: all step slices for one student
    Output: a copy of the input with the columns 'SAI', 'node', 'downstream', 'd_nodes',
    'S_current', 'I_current', 'S_downstream', 'I_downstream', 'KC_toward' and 'error_type'
    added and set to None
    '''
    blank = one_student.copy()
    for column in ('SAI', 'node', 'downstream', 'd_nodes',
                   'S_current', 'I_current', 'S_downstream', 'I_downstream',
                   # error types and KC
                   'KC_toward', 'error_type'):
        blank[column] = None
    return blank
    
    







In [12]:

def one_student_all_problems(df, directory, stu):
    '''
    one_student_all_problems: match all problems from brd files for one student
    Input:
        -df: a dataframe of all problems for one student
        -directory: a directory path in which to find all brd files, named <Problem Name>.xml
        -stu: the student id (not used by this function)
    Output: a new dataframe holding the rows of every problem except "InstructionSlide", with
    the columns 'SAI', 'node', 'downstream', 'd_nodes', 'S_current', 'I_current',
    'S_downstream', 'I_downstream', 'KC_toward' and 'error_type' added. Slices that end in an
    incorrect step or a hint keep those columns blank.
    '''
    new_columns = ['SAI', 'node', 'downstream', 'd_nodes',
                   'S_current', 'I_current', 'S_downstream', 'I_downstream',
                   'KC_toward', 'error_type']

    # Gather one annotated frame per problem and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the data on every call.
    pieces = []
    for problem in df['Problem Name'].unique():  # unique() returns values in order of appearance
        if problem == "InstructionSlide":
            continue  # no brd to match against; these rows are left out of the result

        stu_slice = clean_transaction(df[df['Problem Name'] == problem].copy())

        brd = extract_from_brd(os.path.join(directory, problem + ".xml"))
        graph = get_graph_from_extract(brd)

        if check_transactions(stu_slice, graph):
            pieces.append(match_steps(stu_slice, clean_extract(brd), graph))
        else:
            pieces.append(original_df(stu_slice))

    if not pieces:
        # Nothing to match: return an empty frame that still has every expected column.
        empty = pandas.DataFrame(columns=list(df.columns))
        for column in new_columns:
            empty[column] = None
        return empty

    return pandas.concat(pieces)
    
    
    
    
    
    
    

In [15]:
def generate_truth_table_iso(iso_log_path, brd_path, save_path):
    '''
    generate_truth_table_iso: take in paths to iso transactions and brd files to generate
    a new dataframe with the matching columns added, and save it as csv
    Input:
        -iso_log_path: a path for the iso transaction log
        -brd_path: a directory path in which to find the brd files whose names match the problem
        names in iso transaction log
        -save_path: a path to save the new file generated
    Output: a new dataframe of the 'fraction_arithmetic' transactions with the columns 'SAI',
    'node', 'downstream', 'd_nodes', 'S_current', 'I_current', 'S_downstream', 'I_downstream',
    'KC_toward' and 'error_type' added. The same dataframe is written to save_path.
    '''
    t = transaction_file_to_df(iso_log_path)
    t = t[t['Level (Domain)'] == 'fraction_arithmetic'].reset_index(drop=True)

    # One annotated frame per student, concatenated once at the end: DataFrame.append
    # was removed in pandas 2.0 and copied the data on every call.
    per_student = []
    for stu in t['Anon Student Id'].unique():
        annotated = one_student_all_problems(t[t['Anon Student Id'] == stu], brd_path, stu)
        if not annotated.empty:
            per_student.append(annotated)

    if per_student:
        df = pandas.concat(per_student)
    else:
        # No rows at all: still produce a file that carries every expected column.
        df = pandas.DataFrame(columns=list(t.columns))
        for column in ('SAI', 'node', 'downstream', 'd_nodes',
                       'S_current', 'I_current', 'S_downstream', 'I_downstream',
                       'KC_toward', 'error_type'):
            df[column] = None

    df.to_csv(save_path)

    return df



<br/><br/><br/><br/><br/><br/> 
### Matching SAI of iso with edges in brd files
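> To generate and save a new log file with the new binary columns 'S_current', 'I_current', 'S_downstream', and 'I_downstream' (together with the supporting columns 'SAI', 'node', 'downstream', 'd_nodes', 'KC_toward', and 'error_type'), use **generate_truth_table_iso(*iso_log_path*, *brd_path*, *save_path*)** 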
<br/><br/>
> Parameters: 
- iso_log_path - a path for the iso transactions log
- brd_path - a directory path in which to find the brd files whose names match the problem names in iso transaction log
- save_path - a path to save the new file generated

> Example in cell below: 

In [16]:


# Example run: arguments are the iso transaction log, the brd directory and the output csv.
# NOTE(review): these are absolute paths on one machine -- point them at your own files
# before running this cell.
generate_truth_table_iso("/Users/daniellaye/Desktop/data/iso.txt", "/Users/daniellaye/Desktop/brds_iso", "/Users/daniellaye/Desktop/error_analysis_iso.csv")

Unnamed: 0,Row,Sample Name,Transaction Id,Anon Student Id,Session Id,Time,Time Zone,Duration (sec),Student Response Type,Student Response Subtype,...,SAI,node,downstream,d_nodes,S_current,I_current,S_downstream,I_downstream,KC_toward,error_type
0,178,All Data,669a671ebc4d65ef13a28145a65005be,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:02:58,UTC,1,ATTEMPT,,...,correct,"(7, 1, 5)",,,1,1,,,,
1,179,All Data,28bb91ee656f9b3d513e37ee586b5ebb,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:02:59,UTC,1,ATTEMPT,,...,correct,"(9, 5, 3)",,,1,1,,,,
2,180,All Data,ef4b6202924b3b0dfb531f3cc1f0b2b1,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:00,UTC,1,ATTEMPT,,...,correct,"(5, 3, 4)",,,1,1,,,,
3,181,All Data,abc5acacbc9458cf326c2d4f0974c91f,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:01,UTC,1,ATTEMPT,,...,correct,"(7, 1, 5)",,,1,1,,,,
4,182,All Data,1d6baa77c8cef1be7fb1e1fb8c49d7e3,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:02,UTC,1,ATTEMPT,,...,correct,"(9, 5, 3)",,,1,1,,,,
5,183,All Data,33234893068b3f5af891316c9370b43e,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:03,UTC,1,ATTEMPT,,...,correct,"(5, 3, 4)",,,1,1,,,,
6,184,All Data,974acd6055c6f258768d3da1c2b0874a,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:04,UTC,1,ATTEMPT,,...,correct,"(7, 1, 5)",,,1,1,,,,
7,185,All Data,87db96a3739e3a07fcea0640664f6352,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:05,UTC,1,ATTEMPT,,...,correct,"(9, 5, 3)",,,1,1,,,,
8,186,All Data,e9ba44b451de39c89248f88f59af195c,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:06,UTC,1,ATTEMPT,,...,correct,"(5, 3, 4)",,,1,1,,,,
9,187,All Data,5fbc69e606debcf615c427d708e1b6e9,Stu_01266dfb27cc2e1a087884753dbe4f67,cef2b6fb-107d-4ddd-ad9d-f2bd5555d456,1969-12-31 19:03:07,UTC,1,ATTEMPT,,...,correct,"(7, 1, 5)",,,1,1,,,,
