In [265]:
import pandas as pd
import numpy as np
import re
import time

In [857]:
# event types
ET_EDIT = 'File.Edit'
ET_KEY = 'X-Keystroke'
# edit types
INSERT = 'Insert'
DELETE = 'Delete'

In [865]:
class Entry:
    # change_text is the text to be inserted or deleted
    # row_idx is the index of the row in the dataframe that contains the change
    # text_index is the index in the string in InsertText or DeleteText column where the change_text is found
    # insert is a boolean: true if insert, false if delete
    def __init__(self, change_text, row_idx, text_index, insert, deleted_insert_idx=-1):
        self.change_text = change_text
        self.row_idx = row_idx
        self.text_index = text_index
        self.insert = insert
        self.deleted_insert_idx = deleted_insert_idx
        
    def __str__(self):
        return 'text: {}, row_idx: {}, text_index: {}, insert: {}'.format(self.change_text, self.row_idx, self.text_index, self.insert)
    
    def isInsert(self):
        return self.insert
        
    def isDelete(self):
        return not self.insert


## Reconstruct the final submission

In [894]:
def char_diff(row):
    l = 0
    if row.InsertText == row.InsertText:
        l = len(row.InsertText)
    if row.DeleteText == row.DeleteText:
        l -= len(row.DeleteText)
    return l

def reconstruct(df, debug=False):
    # find the number of characters added or subtracted at each step
    charsAdded = df.apply(char_diff, axis=1)
    max_size = charsAdded.cumsum().max()
    # Allocate buffers
    inserts = []
    deletes = []
    deleted_inserts = []
    buf = '' # the end file
    delete_buf = '' # characters that were deleted

    # Create an error data frame holding rows that failed
    error = pd.DataFrame(columns=df.columns)
    error['XError'] = 0
    # Build the file. Iterate over each row of the data frame.
#     for row_idx,row in df[df.EventType == ET_EDIT].iterrows():
    for row_idx,row in df.iterrows():
        code_idx = int(row.SourceLocation)
        if row.EditType == INSERT:
            local_inserts = [Entry(row.InsertText[k], row_idx, k, True) for k in range(len(row.InsertText))]
            inserts = inserts[:code_idx] + local_inserts + inserts[code_idx:]
            buf = buf[:code_idx] + row.InsertText + buf[code_idx:]
        elif row.EditType == DELETE:
            di_len = len(deleted_inserts)
            deleted_inserts = deleted_inserts + inserts[code_idx:code_idx+len(row.DeleteText)]
#             if row_idx == 191:
#                 print('*** {}'.format(df.iloc[inserts[code_idx].row_idx]))
            
            inserts = inserts[:code_idx] + inserts[code_idx+len(row.DeleteText):]
            buf = buf[:code_idx] + buf[code_idx+len(row.DeleteText):]
            
            local_deletes = [Entry(row.DeleteText[k], row_idx, k, False, deleted_insert_idx=di_len+k) for k in range(len(row.DeleteText))]
            deletes = local_deletes + deletes
            delete_buf = row.DeleteText + delete_buf
        else:
            error = error.append(row)
            error.iloc[0, error.columns.get_loc('XError')] = 'Unsupported edit type'
    if debug:
        print('Errors:')
        display(error)

    return buf, delete_buf, inserts, deletes, deleted_inserts


## Create a regular expression

In [867]:
from itertools import permutations, combinations

# Returns a regular expression to match and mask.
# subjectID - the name of the student with each of first, last, etc separated by a space
# include_Anum - whether to include a regular expression to match Aggie A#
def subjectID2mask_re(subjectID, include_Anum=True, substrings=True):
    res = []
    
    # Get "First Last" and "Last First"
    id_parts = subjectID.split() # Probably just first and last name but could include a middle initial
    for i in range(len(id_parts), 0, -1):
        for comb in combinations(id_parts, i):
            perms = permutations(comb)
            for perm in perms:
                r = ' '.join(perm)
                if len(r) > 1:
                    res = res + [r]

    # Find larger substrings of each name. For example, if the name is Christensen, this will add
    # hristense, ristens, and isten. The reason for this is if the name somehow is only partial
    # in the keystrokes. For example, the student might have missed typing the last character.
    # In our working example, this would result in Christense. In this case, we would match
    # hristense and the masked keys would result in C@@@@@@@@@. Ideally we would attempt matches
    # on every substring, but there are n(n+1)/2 of those -- way too many. There are a linear
    # number of substrings using our scheme and it strikes a balance between performance and
    # quality of masking/deidentification.
    if substrings:
        for name in id_parts:
            res = res + [name[i:-i] for i in range(1,int(len(name)/2)-1)]

    # Add a RE for A#
    if include_Anum:
        res = res + ['a#?[ ]{0,3}[0-9]{8}']
    return '|'.join(res)

r = subjectID2mask_re('John Lilliputian')
print(r)
print([m[0] for m in re.finditer(r, 'John Lilliputian John illiputi a12345678 a#35354646 A# 87658765 a 56565656 a  34563456  a    97597531', flags=re.IGNORECASE)])
r = subjectID2mask_re('Alomarian Stoic Parambulator Stu')
print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])

r = subjectID2mask_re('Alomarian Stoic Parambulator Stu', substrings=False)
print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])



John Lilliputian|Lilliputian John|John|Lilliputian|illiputia|lliputi|liput|a#?[ ]{0,3}[0-9]{8}
['John Lilliputian', 'John', 'lliputi', 'a12345678', 'a#35354646', 'A# 87658765', 'a 56565656', 'a  34563456']
['Stoic Stu', 'a12341234', 'rambulat']
['Stoic Stu', 'a12341234']


## Mask a dataframe

In [868]:
REPLACE_WITH = '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
def mask_char_in_insert_text(df, row_idx, text_index, replacement_char):
    insert_col_idx = df.columns.get_loc('InsertText')
    s = df.iloc[row_idx].InsertText
    char_to_replace = s[text_index]
    df.iloc[row_idx,insert_col_idx] = s[:text_index] + replacement_char + s[text_index+1:]
    
    # Look nearby (within two rows) for a keystroke and mask it as well
    arr = np.intersect1d(df.index[(df.EventType == ET_KEY)&
                                  (df.InsertText == char_to_replace)],
                         pd.RangeIndex(row_idx-2, row_idx+3))
    df.iloc[arr,insert_col_idx] = replacement_char
    
# Returns the masked dataframe and all strings that were masked
def mask(df, program, entries, is_insert, text_to_replace, replace_with=REPLACE_WITH, deleted_inserts = None, in_place = False):
#     display(df)
    masked_strings = set()
    insert_col_idx = df.columns.get_loc('InsertText')
    delete_col_idx = df.columns.get_loc('DeleteText')
    if in_place:
        masked = df
    else:
        masked = df.copy()

    for it in re.finditer(text_to_replace, program, flags=re.IGNORECASE):
        start_i = it.start()
        text_to_replace = it[0]
        masked_strings.add(text_to_replace)

        # Iterate through each character of the input text
        for i in range(len(text_to_replace)):
            replacement_char = replace_with[i]
            char_to_replace = text_to_replace[i]
            entry = entries[start_i+i] # entry with data on the character we are about to mask
            if entry.change_text != char_to_replace:
                print('ERROR: change_text not equal to char_to_replace: {}, {}', entry, char_to_replace)
            row_idx = entry.row_idx
            text_index = entry.text_index
            if (is_insert != entry.insert):
                print('Error: {} is not equal to insert = {}'.format(entry, is_insert))
            
            if entry.insert:
                mask_char_in_insert_text(masked, row_idx, text_index, replacement_char)
            else:
                # Deletion
                # Mask the DeleteText string
                s = masked.iloc[row_idx].DeleteText
                masked.iloc[row_idx,delete_col_idx] = s[:text_index] + replacement_char + s[text_index+1:]
                # Mask the InsertText string where the text was originally inserted
                di = deleted_inserts[entry.deleted_insert_idx]
#                 s = masked.iloc[di.row_idx].InsertText
#                 if s[di.text_index] != char_to_replace:
#                     print('ERROR: s[di.text_index] != char_to_replace: {}, {}, {}', s, di, char_to_replace)
#                     print('  {}, {}'.format(row_idx, di.row_idx))
#                     print('  {}'.format(masked.iloc[di.row_idx]))
                mask_char_in_insert_text(masked, di.row_idx, di.text_index, replacement_char)
            

    return masked, masked_strings


In [869]:
def mask_df(df, mask_re, replace_with=REPLACE_WITH):
    df = df.copy().reset_index()
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)

    masked = df.copy()
    masked, ms1 = mask(masked, program, entries, True, mask_re, replace_with)
    masked, ms2 = mask(masked, deleted_text, deletes, False, mask_re, replace_with, deleted_inserts)

    return masked, ms1.union(ms2)

## Test reconstruct and mask

In [None]:
# df = pd.read_csv('data/deident.csv')#, header=None)
# df = showyourwork2progsnap2('showyourwork.log', 'John Edwards', 'Assign1')
df = pd.read_csv('test.ps2')
subjectID = df.SubjectID.unique()[0]
display(df['CodeStateSection'].unique())
df = df[(df.SubjectID == subjectID)&(df['CodeStateSection'] == 'task1.py')]
# display(df)

print(reconstruct(df)[0])

print('\n\n******* Masking ********\n')

masked, masked_strings = mask_df(df, subjectID2mask_re(subjectID))
program, deleted_text,_,_,_ = reconstruct(masked)
print(f'Masked strings: {masked_strings}')
print(program)
print('\n\n** Deleted **\n' + deleted_text)
print('\n\n** Inserts **\n', ''.join(masked[(masked.EventType == ET_EDIT)&(masked.EditType == INSERT)].InsertText))
print('\n\n** Keystrokes **\n', ''.join(masked.loc[masked.EventType == ET_KEY, 'X-Metadata']))

# temp = pd.concat([df,masked]).drop_duplicates(keep=False)
# display(temp[temp.EventID < 100].head(40))


## Add a number to each character in an ascii string

In [870]:
def add_to_ascii(s, add):
    ch = [c+add for c in bytes(s, 'utf-8')]
    return ''.join([str(chr(c)) for c in ch])

s = 'Ebof!Sbtnvttfo!.!Bttjho23!.!ubtl3/qz'
print(add_to_ascii(s, -1))

Dane Rasmussen - Assign12 - task2.py


# Manual check

### deidentified-deleted.txt
* Line 1680 (Devin Winters - Assign10 - junkW.py): steVeH@@@@@ssSlice==    e0123456789012345678WosprintBackWordspmidWords        8978"@@@@@""KevinIsTheBest"    mixbtiapplesmacks    

### interesting stuff in deidentified-deleted.txt
* Line 19169 (Kaili Pearson - Assign9 - task2.py):    blobber = blobber.BlobberbBBBBooBberobBlobberBlBlobberabBlobberBloober.ob<Fill-In>b<Fill-In>obber.displayCol<Fill-In><Fill-In>bB<Fill-In><Fill-In><Fill-In>() )b+ <Fill-In>
* Line 21227 (Heidi Anderson - Assign8 - pattern.py):     turtle.speed(0)colorGeneratecolorGeneratecolorGeneratecolorGeneratecolorGeneratecolorGeneratesetup()randomColorGenerateercolor
setup()setup() ipturtle.speed(0)turtle.
* Line 40975 (Dane Rasmussen - Assign12 - task2.py):    print(theList);si = max        startingValuestartingValueif IrangeifIwas  wthe the     1Ist??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????yn tllminimumTe theList =IstIst.theList = theList.sort()ed, then yrid 


# Deidentify all students in ps2 file

In [891]:
# Returns (df, submission, deleted) where
#  df - the deidentified data frame
#  submission - the reconstructed submission code
#  deleted - all deleted characters in reverse order
def deidentifyps2(df_all, header_offset=1):
    df_result = []
    all_masked_strings = set()
    programs = ''
    program_heads = ''
    deleted = ''
    
    df_all = df_all[df_all['CodeStateSection'].str.slice(-3) == '.py'].copy()
    df_all['uniqueID'] = df_all.SubjectID + df_all.AssignmentID + df_all['CodeStateSection']
    for ID in df_all.uniqueID.unique():
        masked = df_all[df_all.uniqueID == ID].copy().reset_index()
        
        f = masked.iloc[0]
        id_string = f'{f.SubjectID} - {f.AssignmentID} - {f["CodeStateSection"]}'
        print(id_string)
        
        try:
            # Do not sort if deidentifying ShowYourWork.
            masked.sort_values('ClientTimestamp', inplace=True)
            mask_re = subjectID2mask_re(f.SubjectID)

            masked,masked_strings = mask_df(masked, mask_re)
            if (len(masked_strings) > 0):
                all_masked_strings = all_masked_strings.union(masked_strings)
                print(f'  Masked strings: {masked_strings}')

            program, deleted_text,_,_,_ = reconstruct(masked)
            program_header = f'\n\n____{add_to_ascii(id_string, header_offset)}****\n'
            if len(program.strip()) > 0:
                programs = programs+program_header+program
            lines = program.split('\n')
            num_lines = 3 if len(lines)>3 else len(lines)
            lines = lines[:num_lines]
            lines = '\n'.join(lines)
            if len(lines.strip()) > 0:
                program_heads = program_heads+program_header+lines
                
            if len(deleted_text.strip()) > 0:
                deleted = deleted+program_header+deleted_text

            df_result.append(masked)
#             if df_result is None:
#                 df_result = masked
#             else:
#                 df_result = pd.concat([df_result, masked])
        except Exception as e:
            print(f'Failed to mask {id_string}: {str(e)}')
            traceback.print_exc()

    return pd.concat(df_result), all_masked_strings, programs, program_heads, deleted



# Convert Phanon to ProgSnap2

In [365]:
# df = pd.read_csv('data-2019/keystrokes.csv')
df = pd.read_csv('data-2019/project-events.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [738]:
df = df.rename({'Unnamed: 0':'native_index'}, axis=1)
df = df.sort_values(['user_id','timestamp','native_index'])
df['change_index'] = np.nan

df['ID'] = df.user_id.astype('str') + df.project_id.astype('str') + df.task.astype('str')
df['ID_no_task'] = df.user_id.astype('str') + df.project_id.astype('str')

Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,startLine,startCol,endLine,endCol,operation,key,elapsed,change_index,ID,ID_no_task
656714,5716878,100026,135,0,RUN,,,1550208272769,,,...,,,,,RUN,,5995.0,,1000261350,100026135
656715,5716885,100026,135,0,+input,\n,,1550208298237,,,...,67.0,41.0,67.0,41.0,+input,return,25468.0,,1000261350,100026135
656716,5716886,100026,135,0,+input,i,,1550208299364,,,...,68.0,0.0,68.0,0.0,+input,i,1127.0,,1000261350,100026135
656717,5716887,100026,135,0,+input,f,,1550208299545,,,...,68.0,1.0,68.0,1.0,+input,f,181.0,,1000261350,100026135
656718,5716888,100026,135,0,+input,,,1550208299560,,,...,68.0,2.0,68.0,2.0,+input,space,15.0,,1000261350,100026135
656719,5716889,100026,135,0,+input,u,,1550208330383,,,...,68.0,3.0,68.0,3.0,+input,u,30823.0,,1000261350,100026135
656720,5716890,100026,135,0,+input,s,,1550208330531,,,...,68.0,4.0,68.0,4.0,+input,s,148.0,,1000261350,100026135
656721,5716891,100026,135,0,+input,e,,1550208330650,,,...,68.0,5.0,68.0,5.0,+input,e,119.0,,1000261350,100026135
656722,5716892,100026,135,0,+input,r,,1550208330760,,,...,68.0,6.0,68.0,6.0,+input,r,110.0,,1000261350,100026135
656723,5716893,100026,135,0,+input,N,,1550208331788,,,...,68.0,7.0,68.0,7.0,+input,N,1028.0,,1000261350,100026135


In [596]:
def split_text(text):
    text = text.split('\n')
    if len(text) > 1:
        text = [e+'\n' for e in text[:-1]] + [text[-1]]
    return text
    
def insert(lines, text, irow, icol):
    if len(lines) == 0:
        lines = ['']
    if icol > len(lines[irow]):
        s = '"'+lines[irow].replace(' ','*').replace('\n','\\n')+'"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={s}")
    # Consider the line in the question and the line following.
    # Split into lines preceding and succeeding those two. Then
    # join the two in question.
    before = lines[:irow]
    after = []
    if len(lines) == irow+1:
        # last line
        two = lines[irow]
    elif len(lines) == irow+2:
        # second to last line
        two = ''.join(lines[irow:irow+2])
    else:
        # More than one line following
        two = ''.join(lines[irow:irow+2])
        after = lines[irow+2:]
        
    two = two[:icol] + text + two[icol:]
    if len(after) == 0:
        return before + split_text(two)
    return before + split_text(two)[:-1] + after

# print(split_text(''))
# print(split_text('\n'))
# print(split_text('abc'))
# print(split_text('abc\n'))
# print(split_text('abc\ndef'))
# print(split_text('abc\ndef\n'))
# print(split_text('abcdef\n\ndef'))
# lines = insert(['abc\n','def'], 'def', 0, 3)
# print('*'.join(lines))
# print(insert([], 'abc', 0, 0))
# print(insert([''], 'abc', 0, 0))
# print(insert([], 'abc\n', 0, 0))
# print(insert([''], 'abc\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n', ''], 'def', 0, 0))
# print(insert(['abc\n', ''], 'def', 0, 3))
# print(insert(['abc\n', ''], 'def', 1, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n','def'], 'def\n', 0, 3))


In [594]:
def add_empty_line_cond(lines):
    if len(lines) == 0:
        return ['']
    last_line = lines[-1]
    if len(last_line) > 0 and last_line[-1] == '\n':
        return ['']
    return []

#        i j
# i,j aaaddaaa
#
#        i j
# i,j aaadd
#     aaaaa    
#
#      i
# i aaadd
#   ddddddd
# j ddaaaaa
#     j
def remove_impl(lines, irow, icol, jrow, jcol):
#     if jrow == len(lines) and jcol == 0:
#         jrow = len(lines)-1
#         jcol = len(lines[-1])
    if icol > len(lines[irow]):
        s = '"'+lines[irow].replace(' ','*').replace('\n','\\n')+'"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={s}")
    if irow == jrow:
        line = lines[irow]
        line = line[0:icol] + line[jcol:]
        if len(line) == 0:
            return lines[:irow] + lines[irow+1:]
        if line[-1] != '\n' and irow < len(lines)-1:
            return lines[:irow] + [line+lines[irow+1]] + lines[irow+2:]
        return lines[:irow] + [line] + lines[irow+1:]
    else:
        line1 = lines[irow][:icol]
        line2 = lines[jrow][jcol:]
        if len(line1+line2) == 0:
            return lines[:irow] + lines[jrow+1:]
        if (line1+line2)[-1] != '\n' and jrow < len(lines)-1:
            return lines[:irow] + [line1+line2+lines[jrow+1]] + lines[jrow+2:]
        return lines[:irow] + [line1+line2] + lines[jrow+1:]
    
def remove(lines, irow, icol, jrow, jcol):
    lines = remove_impl(lines, irow, icol, jrow, jcol)
    return lines + add_empty_line_cond(lines)

# print(remove(['abc\n', ''], 0, 0, 0, 1))
# print(remove(['abc\n', ''], 0, 0, 0, 2))
# print(remove(['abc\n', ''], 0, 0, 0, 3))
# print(remove(['abc\n', ''], 0, 0, 0, 4))
# print(remove(['abc\n', ''], 0, 1, 0, 3))
# print(remove(['abc\n', 'def'], 0, 1, 1, 0))
# print(remove(['abc\n', 'def'], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 0))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 4))
# print(remove(['abc\n', 'def\n', ''], 1, 3, 1, 4))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 2, 1))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 3, 0))


In [620]:
# test[277-5:277+5]
# test[0:20]
test.head()

Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,has_error,user_terminated,startLine,startCol,endLine,endCol,operation,key,elapsed,change_index
418393,2787261,100338,131,0,+input,i,,1548708913518,,,,,0.0,0.0,0.0,0.0,+input,i,325220912.0,
418394,2787262,100338,131,0,+input,n,,1548708913611,,,,,0.0,1.0,0.0,1.0,+input,n,93.0,
418395,2787263,100338,131,0,+input,t,,1548708913675,,,,,0.0,2.0,0.0,2.0,+input,t,64.0,
418396,2787264,100338,131,0,+delete,,t,1548708913978,,,,,0.0,2.0,0.0,3.0,+delete,delete,303.0,
418397,2787265,100338,131,0,+input,v,,1548708914075,,,,,0.0,2.0,0.0,2.0,+input,v,97.0,


In [798]:
import traceback
def phanon2progsnap2(df, debug = False):
    test = df.copy()
    change_indices = []
    lines = ['']
    i = 0
    for index,row in test.iterrows():
    #     print('**',i)
        i = i + 1
        try:
            irow = row.startLine
            icol = row.startCol
            jrow = row.endLine
            jcol = row.endCol
            change_type = row.change_type
            added = row.code_added
            removed = row.code_removed
            changed = False
            if removed and removed == removed and row.change_type != 'setValue' and (len(lines)>1 or len(lines[0])>0):
#             if removed and removed == removed and (len(lines)>1 or len(lines[0])>0):
                changed = True
                irow = int(irow)
                icol = int(icol)
                jrow = int(jrow)
                jcol = int(jcol)
                lines = remove(lines, irow, icol, jrow, jcol)
            if row.change_type == 'setValue':
                lines = ['']
            if added and added == added and added != '':
                changed = True
                irow = int(irow)
                icol = int(icol)
                lines = insert(lines, added, irow, icol)
            change_index = np.nan
            if changed:
                change_index = len(''.join(lines[:irow]))+icol
            change_indices.append(change_index)
        except Exception as e:
            if debug:
                display('i={}: {}'.format(i,e))
                print(''.join(lines).replace(' ', '·'))
                display(row)
                traceback.print_exc()
                test = test[:len(change_indices)]
                break
            else:
                raise e
    test.change_index = change_indices
    test['SubjectID'] = test.user_id
    test['EventID'] = test.native_index
    test['AssignmentID'] = test.project_id
    test['CodeStateSection'] = test.task
    test['EventType'] = test.change_type
    test['InsertText'] = test.code_added
    test['DeleteText'] = test.code_removed
    test['SourceLocation'] = test.change_index
    test['ClientTimestamp'] = test.timestamp
    # array(['RUN', 'SUBMIT', 'TASK', 'setValue', '+delete', '+input', 'paste',
#        'undo', 'redo', 'cut', 'drag'], dtype=object)
    test.EventType = test.EventType.replace({'+input':'File.Edit','+delete':'File.Edit',
                                             'undo':'File.Edit','redo':'File.Edit',
                                             'cut':'File.Edit','paste':'File.Edit','drag':'File.Edit',
                                             'RUN':'Run.Program','SUBMIT':'Submit',
                                             'TASK':'X-SwitchTask',
#                                              'setValue':'File.Edit'
                                            })
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == '+input'),'EditType'] = 'Insert'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == '+delete'),'EditType'] = 'Delete'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'setValue'),'EditType'] = 'Insert'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'undo'),'EditType'] = 'Undo'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'redo'),'EditType'] = 'Redo'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'paste'),'EditType'] = 'Paste'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'cut'),'EditType'] = 'Cut'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'drag'),'EditType'] = 'Drag'
    return test

In [796]:
def clean_task_switches(df):
    # Look nearby (within two rows) for rows with large both inserts and
    # deletes and clean them
    df = df.copy()
    indices = df.index[df.change_type == 'TASK']
    for row_idx in indices:
        arr = np.intersect1d(df.index[(df.code_added.str.len() > 5)&
                                      (df.code_removed.str.len() > 5)],
                             pd.RangeIndex(row_idx-2, row_idx+3))
        df.loc[arr,'code_added'] = ''
        df.loc[arr,'code_removed'] = ''
    return df

test
clean_task_switches(test)

Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,AssignmentID,X-File,EventType,InsertText,DeleteText,CodeStateSection,ClientTimestamp,EditType,5,6
418393,2787261,100338,131,0,+input,i,,1548708913518,,,...,131,0,File.Edit,i,,0.0,1548708913518,Insert,,
418394,2787262,100338,131,0,+input,n,,1548708913611,,,...,131,0,File.Edit,n,,1.0,1548708913611,Insert,,
418395,2787263,100338,131,0,+input,t,,1548708913675,,,...,131,0,File.Edit,t,,2.0,1548708913675,Insert,,
418396,2787264,100338,131,0,+delete,,t,1548708913978,,,...,131,0,File.Edit,,t,2.0,1548708913978,Delete,,
418397,2787265,100338,131,0,+input,v,,1548708914075,,,...,131,0,File.Edit,v,,2.0,1548708914075,Insert,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423762,2791025,100338,131,0,SUBMIT,,,1548814804295,,,...,131,0,Submit,,,,1548814804295,,,
423763,2792519,100338,131,0,RUN,,,1548837114997,,,...,131,0,Run.Program,,,,1548837114997,,,
423778,2792529,100338,131,0,TASK,,,1549056835382,,,...,131,0,X-SwitchTask,,,,1549056835382,,,
423779,2792530,100338,131,0,setValue,,,1549056835386,,,...,131,0,X-SetValue,"#Jake Kinshella, Assn5, calculating future inv...","#Jake Kinshella, Assn5, calculating future inv...",0.0,1549056835386,,,


In [None]:
copy = df.copy()#[(df.user_id == 100338)|(df.user_id == 100339)|(df.user_id == 100340)].copy()
# copy = clean_task_switches(copy)
dfs = []
for ID in copy.ID.unique():
# for ID in copy.ID_no_task.unique():
    print(ID)
    subdf = copy[copy.ID == ID]
#     subdf = copy[copy.ID_no_task == ID]
    try:
        dfs.append(phanon2progsnap2(subdf, False))
    except:
        print('{} - Reconstruction failed'.format(ID))

copy = pd.concat(dfs)
copy.to_csv('phanon2ps2.csv', index=False)

In [826]:
print(len(copy))
display(copy.EventType.unique())
display(copy.groupby('EventType').count())

copy.EventType = copy.EventType.replace({'setValue':'File.Edit'})
copy.loc[(copy.EventType == 'File.Edit')&(copy.change_type == 'setValue'),'EditType'] = 'Replace'

print(len(copy))
display(copy.EventType.unique())
display(copy.groupby('EventType').count())
copy.to_csv('phanon2ps2-2.csv', index=False)

8944289


array(['File.Edit', 'X-SwitchTask', 'setValue', 'Run.Program', 'Submit',
       '*compose'], dtype=object)

Unnamed: 0_level_0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,ID_no_task,SubjectID,EventID,AssignmentID,X-File,InsertText,DeleteText,CodeStateSection,ClientTimestamp,EditType
EventType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
*compose,1470,1470,1470,1470,1470,1436,76,1470,0,0,...,1470,1470,1470,1470,1470,1436,76,1470,1470,0
File.Edit,8765085,8765085,8765085,8765085,8765085,6911497,1893690,8765085,0,0,...,8765085,8765085,8765085,8765085,8765085,6911497,1893690,8763713,8765085,8765085
Run.Program,131815,131815,131815,131815,131815,0,0,131815,64573,64573,...,131815,131815,131815,131815,131815,0,0,0,131815,0
Submit,3853,3853,3853,3853,3853,0,0,3853,0,0,...,3853,3853,3853,3853,3853,0,0,0,3853,0
X-SwitchTask,21178,21178,21178,21178,21178,0,0,21178,0,0,...,21178,21178,21178,21178,21178,0,0,0,21178,0
setValue,20888,20888,20888,20888,20888,19410,20440,20888,0,0,...,20888,20888,20888,20888,20888,19410,20440,19410,20888,0


8944289


array(['File.Edit', 'X-SwitchTask', 'Run.Program', 'Submit', '*compose'],
      dtype=object)

Unnamed: 0_level_0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,ID_no_task,SubjectID,EventID,AssignmentID,X-File,InsertText,DeleteText,CodeStateSection,ClientTimestamp,EditType
EventType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
*compose,1470,1470,1470,1470,1470,1436,76,1470,0,0,...,1470,1470,1470,1470,1470,1436,76,1470,1470,0
File.Edit,8785973,8785973,8785973,8785973,8785973,6930907,1914130,8785973,0,0,...,8785973,8785973,8785973,8785973,8785973,6930907,1914130,8783123,8785973,8785973
Run.Program,131815,131815,131815,131815,131815,0,0,131815,64573,64573,...,131815,131815,131815,131815,131815,0,0,0,131815,0
Submit,3853,3853,3853,3853,3853,0,0,3853,0,0,...,3853,3853,3853,3853,3853,0,0,0,3853,0
X-SwitchTask,21178,21178,21178,21178,21178,0,0,21178,0,0,...,21178,21178,21178,21178,21178,0,0,0,21178,0


# Convert PyPhanon and ShowYourWork to ProgSnap2

Custom Columns:
* X-File - The Python file open at the time of the event.
* X-Metadata - Data specific to a custom EventType.

Custom EventTypes:
* X-Copy - A text copy was executed. X-Metadata contains the copied text.
* X-Paste - A text paste was executed. X-Metadata contains the pasted text. There exists an accompanying File.Edit event.
* X-Action - An action was executed. X-Metadata contains the action.
* Run.Program - X-Metadata is "start" if execution was started, and the return value if it is upon execution completion.
* X-Message - X-Metadata contains the message.

In [11]:
# tool is PyPhanon or ShowYourWork
def pyphanon2progsnap2(fn, subjectID, assignmentID, tool='PP'):
    df = pd.read_csv(fn, header=None)
    cols = df.columns
    if tool == 'PP':
        cols = ['EventType', 'InsertText', 'DeleteText', 'SourceLocation', 'ClientTimestamp', 'CodeStateSection', 'version']
    else:
        cols = ['EventType', 'InsertText', 'DeleteText', 'SourceLocation', 'ClientTimestamp', 'CodeStateSection', 'version', 'XUnknown']
    df.columns = cols
    df['EditType'] = np.nan
    df['XMetadata'] = np.nan

    df.loc[(df.EventType == 'e'), 'EventType'] = 'File.Edit'
    df.loc[(df.EventType == 'File.Edit') & (~df.InsertText.isna()), 'EditType'] = 'Insert'
    df.loc[(df.EventType == 'File.Edit') & (~df.DeleteText.isna()), 'EditType'] = 'Delete'
    df.loc[(df.EventType == 'File.Edit') & (~df.InsertText.isna()) & (~df.DeleteText.isna()), 'EditType'] = 'Replace'
    df.loc[(df.EventType == 'c'), 'X-Metadata'] = df.InsertText
    df.loc[(df.EventType == 'c'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 'c'), 'EventType'] = 'X-Copy'
    df.loc[(df.EventType == 'k'), 'X-Metadata'] = df.InsertText
    df.loc[(df.EventType == 'k'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 'k'), 'EventType'] = 'X-Keystroke'
    df.loc[(df.EventType == 'p'), 'X-Metadata'] = df.InsertText
    df.loc[(df.EventType == 'p'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 'p'), 'EventType'] = 'X-Paste'
    df.loc[(df.EventType == 'a'), 'X-Metadata'] = df.InsertText
    df.loc[(df.EventType == 'a'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 'a'), 'EventType'] = 'X-Action'
    df.loc[(df.EventType == 'r'), 'X-Metadata'] = df.InsertText
    df.loc[(df.EventType == 'r')&(df.InsertText == 'Exit'), 'X-Metadata'] = df.DeleteText
    df.loc[(df.EventType == 'r'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 'r'), 'DeleteText'] = np.nan
    df.loc[(df.EventType == 'r'), 'EventType'] = 'Run.Program'
    df.loc[(df.EventType == 'f'), 'X-Metadata'] = df.InsertText
    df.loc[(df.EventType == 'f'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 'f'), 'EventType'] = 'X-Message'
    df.loc[(df.EventType == 't'), 'X-Metadata'] = df.DeleteText
    df.loc[(df.EventType == 't'), 'DeleteText'] = np.nan
    if len(df[(df.EventType == 't')&(df.InsertText != 'Attention')]) > 0:
        raise Exception('Unexpected InsertText in attention event', df[(df.EventType == 't')&(df.InsertText != 'Attention')].InsertText)
    df.loc[(df.EventType == 't'), 'InsertText'] = np.nan
    df.loc[(df.EventType == 't'), 'EventType'] = 'X-Attention'

    # Session may have a different meaning
#     df.loc[(df.EventType == 'X-Action') & (df.InsertText == 'ShowYourWork Init'), 'EventType'] = 'Session.Start'

    df['EventID'] = df.index
    # PC is PyCharm
    df['ToolInstances'] = 'PC;' + tool + ' ' + df.version.str.split(' ', expand=True).iloc[:,0]
    df['SubjectID'] = subjectID
    df['CodeStateID'] = np.nan
    df['AssignmentID'] = assignmentID
    
    if tool == 'PP':
        df = df[['EventID', 'EventType', 'SubjectID', 'ToolInstances', 'CodeStateID', 'SourceLocation',
                 'EditType', 'InsertText', 'DeleteText',
                 'ClientTimestamp', 'CodeStateSection','AssignmentID','X-Metadata'
           ]]
    else:
        df = df[['EventID', 'EventType', 'SubjectID', 'ToolInstances', 'CodeStateID', 'SourceLocation',
                 'EditType', 'InsertText', 'DeleteText',
                 'ClientTimestamp', 'CodeStateSection','X-Session','AssignmentID','X-Metadata'
           ]]
    
    return df

def showyourwork2progsnap2(fn, subjectID, assignmentID):
    return pyphanon2progsnap2(fn, subjectID, assignmentID, 'SYW')

df = pyphanon2progsnap2('./data/submissions-fall-2021/Assign11/Seeley-Adam-Assn11/.idea/phanon/phanonEditLog.csv',
                        'Adam Seeley', 'Assign11')
df.to_csv('test.ps2')

print(df.EventType.unique())
df[df.EventType == 'X-Copy']
df[df.EventType == 'X-Action']
df[df.EventType == 'X-Paste']
df[df.EventType == 'Run.Program']
df[df.EventType == 'X-Message']
df[df.EventType == 'X-Attention']




['X-Action' 'File.Edit' 'x' 'X-Paste' 'X-Keystroke' 'X-Attention' 'X-Copy'
 'Run.Program']


Unnamed: 0,EventID,EventType,SubjectID,ToolInstances,CodeStateID,CodeStateSection,EditType,InsertText,DeleteText,ClientTimestamp,X-File,AssignmentID,X-Metadata
55,55,X-Attention,Adam Seeley,PC;PP 1.1.10,,1489920,,,,1637105865531,orbianstarter.py,Assign11,Lockout
56,56,X-Attention,Adam Seeley,PC;PP 1.1.10,,1489920,,,,1637105867819,orbianstarter.py,Assign11,OFF
120,120,X-Attention,Adam Seeley,PC;PP 1.1.10,,1979736,,,,1637108310528,orbian.py,Assign11,Lockout
121,121,X-Attention,Adam Seeley,PC;PP 1.1.10,,1979736,,,,1637108318593,orbian.py,Assign11,OFF
2292,2292,X-Attention,Adam Seeley,PC;PP 1.1.10,,11885639,,,,1637266279612,task1.py,Assign11,Lockout
2293,2293,X-Attention,Adam Seeley,PC;PP 1.1.10,,11885639,,,,1637266283272,task1.py,Assign11,OFF
4583,4583,X-Attention,Adam Seeley,PC;PP 1.1.10,,91431,,,,1637343521420,orbian.py,Assign11,Lockout
4584,4584,X-Attention,Adam Seeley,PC;PP 1.1.10,,91431,,,,1637343523363,orbian.py,Assign11,OFF
5545,5545,X-Attention,Adam Seeley,PC;PP 1.1.10,,5035795,,,,1637371410533,plan1,Assign11,Lockout
5546,5546,X-Attention,Adam Seeley,PC;PP 1.1.10,,5035795,,,,1637371412515,plan1,Assign11,OFF


## Split a file path into its constituent parts

In [8]:
import os
import time
import traceback

def splitall(path):
    allparts = []
    while True:
        parts = os.path.split(path)
#         parts = path.split()
        if parts[0] == path: # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return allparts

# Pull all PyPhanon fall 2021 logs into a single ProgSnap2 data frame

In [None]:
# Walk through a directory hierarchy getting all PyPhanon log files,
# converting them to ProgSnap2, and concatenating them all together
# into a single dataframe.
def walkpyp2ps2(rootdir):
#     check_string = ''
    all_df = None
    students = set()
    participants = set()
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file == 'phanonEditLog.csv':
                parts = splitall(subdir)
                assignment = parts[3]
                name_string = parts[4]
                if parts[5] == '.idea' and parts[6] == 'phanon' and len(parts) == 7:
                    name_string = name_string.replace('_', '-')
                    name_split = name_string.split('-')
                    if len(name_split) < 2:
                        print('Skipping', subdir)
                        continue
                    last_name = name_split[0]
                    first_name = name_split[1]
#                     check_string = check_string + '\n\n*******************\n' + subdir + '\n*******************\\n\n'
                    print(os.path.join(subdir, file))

                    fn = os.path.join(subdir, file)
                    student_id = first_name+' '+last_name
                    students.add(student_id)
                    try:
                        df = pyphanon2progsnap2(fn, student_id, assignment)

                        # Everything not nan should be a string
                        null_cells = df.DeleteText.isnull()
                        df.DeleteText = df.DeleteText.astype('str').mask(null_cells, np.NaN)
                        null_cells = df.InsertText.isnull()
                        df.InsertText = df.InsertText.astype('str').mask(null_cells, np.NaN)

                        # Get rid of "No Phanon history found"
                        df = df[(df.DeleteText.isna()) | (df.DeleteText.str.find('No Phanon history found') == -1)]
                        df = df[(df.InsertText.isna()) | (df.InsertText.str.find('No Phanon history found') == -1)]

                        part = df[df.EventType == 'x']
                        for i,row in part.iterrows():
                            # Mapping to get participation status of students
                            # Participation status per email from Kaden 2/3/2022
                            # x2ps2 = {'isAdult':'InsertText', 'isParticipating':'DeleteText',
                            #           'acceptFutureStudies':'SourceLocation', 'Email':'X-File'}
                            p = (row['InsertText']=='true') and (row['DeleteText']=='true') and (row['SourceLocation']=='true')
                        df = df[df.EventType != 'x']

                        if p:
                            participants.add(student_id)
                            all_df = pd.concat([all_df, df.copy()])
                        else:
                            if student_id in participants:
                                participants.remove(student_id)
                    except UnicodeDecodeError as e:
                        print('Failed to mask: {}/{}\n   {}'.format(subdir, fn, str(e)))
                    except Exception as e:
                        print('Failed to mask: {}/{}\n   {}'.format(subdir, fn, str(e)))
                        traceback.print_exc()

    print('non-participants: {}'.format(students - participants))
    return all_df

#     with open('deidentify-output.txt', 'w') as f:
#         f.write(check_string)

#     if not all_df is None:
#         all_df.to_csv('deidentify-output.csv')                    
                    
tic = time.perf_counter()

df = walkpyp2ps2('./data/submissions-fall-2021/')
# df = walkpyp2ps2('./data/submissions-fall-2021-test/')
# df = walkpyp2ps2('./data/submissions-fall-2021/Assign11/Seeley-Adam-Assn11')
# df = walkpyp2ps2('./data/submissions-fall-2021/Assign9/Mills-Melanie-Assn9')

toc = time.perf_counter()
print(f"Walk {toc - tic:0.4f} seconds")

df.to_csv('fall2021-nominal.ps2')

df.head()
df.EventType.unique()
df.SubjectID.unique()


### Remove extraneous non-participants

In [None]:
non = set(['Wakley Gage', 'Gage Wakley', 'Buster Morris', 'Bryan Armenta', 'Beau Martin', 'Phillip Summers', 'Robert Gordon', 'Nick Corgiat', 'Adam Seeley', 'Jared Solorio', 'Lucas Porter', 'PJ Huppi', 'Abbie Cox', 'Hunter Grange', 'Melanie Mills', 'Gabriel Flores', 'Emily Triggs', 'Talia Olsen', 'Bradd Poffenberger'])
df = df[~df.SubjectID.isin(non)]
df.SubjectID.unique()
df.to_csv('fall2021-nominal.ps2')

# Pull all PyPhanon fall 2021 compile/run logs into a single data frame


In [52]:
# Walk through a directory hierarchy getting all PyPhanon log files,
# converting them to ProgSnap2, and concatenating them all together
# into a single dataframe.
def walk_pyp_runs(rootdir):
#     check_string = ''
    all_df = []
    students = set()
    participants = set()
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if file == 'phanonRunLog.csv':
                parts = splitall(subdir)
                assignment = parts[3]
                name_string = parts[4]
                if parts[5] == '.idea' and parts[6] == 'phanon' and len(parts) == 7:
                    name_string = name_string.replace('_', '-')
                    name_split = name_string.split('-')
                    if len(name_split) < 2:
                        print('Skipping', subdir)
                        continue
                    last_name = name_split[0]
                    first_name = name_split[1]

                    print(os.path.join(subdir, file))

                    fn = os.path.join(subdir, file)
                    student_id = first_name+' '+last_name
                    students.add(student_id)
                    try:
                        cols = ['Action','RunID','Path','Output','Source','OutputDestination',
                                                    'ClientTimestamp','File']
                        df = pd.read_csv(fn, names=cols, dtype=str)
                        df['SubjectID'] = student_id
                        df['AssignmentID'] = assignment
                        df = df.drop('Path', axis=1)

                        all_df.append(df.copy())
#                         all_df = pd.concat([all_df, df.copy()])
                    except UnicodeDecodeError as e:
                        print('Failed to read: {}/{}\n   {}'.format(subdir, fn, str(e)))
                    except Exception as e:
                        print('Failed to read: {}/{}\n   {}'.format(subdir, fn, str(e)))
                        traceback.print_exc()
    return pd.concat(all_df)

tic = time.perf_counter()

df = walk_pyp_runs('./data/submissions-fall-2021/')
# df = walk_pyp_runs('./data/submissions-fall-2021-test/')
# df = walk_pyp_runs('./data/submissions-fall-2021/Assign7/Anderson-Heidi-Assn7')
# df = walk_pyp_runs('./data/submissions-fall-2021/Assign9/Mills-Melanie-Assn9')

toc = time.perf_counter()
print(f"Walk {toc - tic:0.4f} seconds")

df.head()
df.to_csv('fall2021-run.csv')



### Deidentify output of run files

In [70]:
df = pd.read_csv('fall2021-run.csv', dtype=str)

In [88]:
# test = df.head(500).copy()
test = df.copy()

r = re.compile(r'File \".*/')
mask = (test.Output.notna())&(test.Output.str.contains(r))
test.loc[mask, 'Output'] = test[mask].Output.replace(r, 'File \"')

r = re.compile(r'File \".*\\')
mask = (test.Output.notna())&(test.Output.str.contains(r))
test.loc[mask, 'Output'] = test[mask].Output.replace(r, 'File \"')

test.to_csv('fall2021-run.csv')
run = test

# Deidentify 2019

In [881]:
# df_2019.to_csv('data-2019/phanon2ps2-4.csv', index=False)

In [876]:
df_2019 = pd.read_csv('data-2019/phanon2ps2-3.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [888]:
df_2019.SubjectID = df_2019.SubjectID.astype('str')
df_2019.AssignmentID = df_2019.AssignmentID.astype('str')


In [893]:
students = df_2019.SubjectID.unique()
# students = set(students[:8])
students = set(['100069', '100070'])
print('Deidentifying {}'.format(students))
df = df_2019[df_2019.SubjectID.isin(students)].copy()
df.to_csv('test.csv')

Deidentifying {'100069', '100070'}


In [895]:
students = df_2019.SubjectID.unique()
# students = set(students[:8])
students = set(['100069', '100070'])
print('Deidentifying {}'.format(students))
df = df_2019[df_2019.SubjectID.isin(students)].copy()
print(len(df))

tic = time.perf_counter()
df,ms,programs,program_heads,deleted = deidentifyps2(df, header_offset=0)
toc = time.perf_counter()
print(f"Masked {toc - tic:0.4f} seconds")

print(f'Masked strings: {ms}')

with open('deidentified-programs.txt', 'w') as f:
    f.write(programs)
with open('deidentified-program-heads.txt', 'w') as f:
    f.write(program_heads)
with open('deidentified-deleted.txt', 'w') as f:
    f.write(deleted)
# if not df is None:
#     df = df.drop(['level_0','uniqueID','index'], axis=1)
#     df.to_csv('fall2021-deidentified-2.ps2')

Deidentifying {'100069', '100070'}
34541
100069 - 195 - task1.py
Failed to mask 100069 - 195 - task1.py: cannot convert float NaN to integer
100069 - 200 - task1.py


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer
Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45j

Failed to mask 100069 - 200 - task1.py: cannot convert float NaN to integer
100069 - 204 - task1.py
Failed to mask 100069 - 204 - task1.py: cannot convert float NaN to integer
100069 - 205 - task0.py
Failed to mask 100069 - 205 - task0.py: cannot convert float NaN to integer


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer
Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45j

100069 - 205 - task1.py
Failed to mask 100069 - 205 - task1.py: cannot convert float NaN to integer
100069 - 207 - task0.py
Failed to mask 100069 - 207 - task0.py: cannot convert float NaN to integer
100069 - 207 - task1.py
Failed to mask 100069 - 207 - task1.py: cannot convert float NaN to integer
100069 - 213 - task1.py
Failed to mask 100069 - 213 - task1.py: cannot convert float NaN to integer
100069 - 213 - task2.py
Failed to mask 100069 - 213 - task2.py: cannot convert float NaN to integer
100070 - 195 - task0.py


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer
Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45j

Failed to mask 100070 - 195 - task0.py: cannot convert float NaN to integer
100070 - 195 - task1.py
Failed to mask 100070 - 195 - task1.py: cannot convert float NaN to integer
100070 - 200 - task1.py
Failed to mask 100070 - 200 - task1.py: cannot convert float NaN to integer
100070 - 200 - task0.py
Failed to mask 100070 - 200 - task0.py: cannot convert float NaN to integer
100070 - 204 - task1.py


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer
Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45j

Failed to mask 100070 - 204 - task1.py: cannot convert float NaN to integer
100070 - 205 - task0.py


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer


Failed to mask 100070 - 205 - task0.py: cannot convert float NaN to integer
100070 - 205 - task1.py
Failed to mask 100070 - 205 - task1.py: cannot convert float NaN to integer
100070 - 213 - task1.py
Failed to mask 100070 - 213 - task1.py: cannot convert float NaN to integer
100070 - 213 - task2.py


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer
Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45j

Failed to mask 100070 - 213 - task2.py: cannot convert float NaN to integer


Traceback (most recent call last):
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/1850307259.py", line 26, in deidentifyps2
    masked,masked_strings = mask_df(masked, mask_re)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/3939029229.py", line 3, in mask_df
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)
  File "/var/folders/54/q8d45jvx1md8zqt_82xjx90h0000gn/T/ipykernel_993/4268047391.py", line 26, in reconstruct
    code_idx = int(row.SourceLocation)
ValueError: cannot convert float NaN to integer


ValueError: No objects to concatenate

# Deidentify fall 2021

In [31]:
df_fall2021 = pd.read_csv('fall2021-identifiable.ps2')

In [25]:
# Fix Ben Smith Assign10 - run this code only once after compiling PyPhanon data together
df = df_fall2021.copy()
df.loc[df.SubjectID == 'Assn10 SmithBen', 'SubjectID'] = 'Ben Smith'
df[df.SubjectID == 'Assn10 SmithBen'].head()
df[(df.SubjectID == 'Ben Smith')&(df.AssignmentID == 'Assign10')].head()
df.to_csv('fall2021-identifiable.ps2')
df_fall2021 = df

In [39]:
# Fix Tommy Bolton - run this code only once after compiling PyPhanon data together
df = df_fall2021.copy()
df.loc[df.SubjectID == 'Tommy Bolton', 'SubjectID'] = 'Tommy Thomsen Bolton'
# display(df[df.SubjectID == 'Thomsen Bolton'].head())
# display(df[(df.SubjectID == 'Tommy Bolton')&(df.AssignmentID == 'Assign7')&(df['X-File'] == 'task.2.py')&(df.EventType == 'File.Edit')].head(30))
df.to_csv('fall2021-identifiable.ps2')
df_fall2021 = df

In [None]:
# Fix Gage Wakley - run this code only once after compiling PyPhanon data together
df = df_fall2021.copy()
df.loc[df.SubjectID == 'assn12 pythonProject', 'SubjectID'] = 'Gage Wakley'
display(df[df.SubjectID == 'assn12 pythonProject'].head())
display(df[(df.SubjectID == 'Gage Wakley')&(df.AssignmentID == 'Assign12')].head())
df.to_csv('fall2021-identifiable.ps2')
df_fall2021 = df

In [30]:
# Remove Gage Wakley - not a participant - run this code only once after compiling PyPhanon data together
df = df_fall2021.copy()
# display(len(df))
# display(len(df.SubjectID.unique()))
# display(df[df.SubjectID == 'Gage Wakley'].AssignmentID.unique())
df = df[df.SubjectID != 'Gage Wakley']
# display(len(df))
# display(len(df.SubjectID.unique()))
# display(df[(df.SubjectID == 'Gage Wakley')&(df.AssignmentID == 'Assign12')].head())
df.to_csv('fall2021-identifiable.ps2')
df_fall2021 = df

In [None]:
# Fix Ben Smith Assign13 - run this code only once after compiling PyPhanon data together
df = df_fall2021.copy()
df.loc[df.SubjectID == 'Assn13 SmithBen', 'SubjectID'] = 'Ben Smith'
display(df[df.SubjectID == 'Assn13 SmithBen'].head())
display(df[(df.SubjectID == 'Ben Smith')&(df.AssignmentID == 'Assign13')].head())
df.to_csv('fall2021-identifiable.ps2')
df_fall2021 = df

In [82]:
# Fix Holli Cook - run this code only once after compiling PyPhanon data together
df = df_fall2021.copy()
df.loc[df.SubjectID == 'Holli Coo', 'SubjectID'] = 'Holli Cook'
df.to_csv('fall2021-identifiable.ps2')
df_fall2021 = df

## Incorporate runs

In [19]:
# Incorporate runs
df = pd.read_csv('fall2021-deidentified.ps2')

In [22]:
df = df[(df.SubjectID == 'Heidi Anderson')&(df.AssignmentID == 'Assign7')]
df = df[df.EventType == 'Run.Program']
df = df[df['CodeStateSection'] == 'task2.py']
display(df[df.ClientTimestamp == 1634926340484])
df.head()

Unnamed: 0.2,Unnamed: 0,level_0,index,Unnamed: 0.1,Unnamed: 0.1.1,EventID,EventType,SubjectID,ToolInstances,CodeStateID,CodeStateSection,EditType,InsertText,DeleteText,ClientTimestamp,X-File,AssignmentID,X-Metadata,uniqueID
351031,587,587,629309,733619,1625,1625,Run.Program,Heidi Anderson,PC;PP 1.1.10,,,,,,1634926340484,task2.py,Assign7,Start,Heidi AndersonAssign7task2.py


Unnamed: 0.2,Unnamed: 0,level_0,index,Unnamed: 0.1,Unnamed: 0.1.1,EventID,EventType,SubjectID,ToolInstances,CodeStateID,CodeStateSection,EditType,InsertText,DeleteText,ClientTimestamp,X-File,AssignmentID,X-Metadata,uniqueID
351023,579,579,629301,733611,1617,1617,Run.Program,Heidi Anderson,PC;PP 1.1.10,,,,,,1634926321386,task2.py,Assign7,Start,Heidi AndersonAssign7task2.py
351024,580,580,629302,733612,1618,1618,Run.Program,Heidi Anderson,PC;PP 1.1.10,,,,,,1634926321393,task2.py,Assign7,1,Heidi AndersonAssign7task2.py
351031,587,587,629309,733619,1625,1625,Run.Program,Heidi Anderson,PC;PP 1.1.10,,,,,,1634926340484,task2.py,Assign7,Start,Heidi AndersonAssign7task2.py
351032,588,588,629310,733620,1626,1626,Run.Program,Heidi Anderson,PC;PP 1.1.10,,,,,,1634926340487,task2.py,Assign7,1,Heidi AndersonAssign7task2.py
351164,720,720,630185,734495,2503,2503,Run.Program,Heidi Anderson,PC;PP 1.1.10,,,,,,1634927258087,task2.py,Assign7,Start,Heidi AndersonAssign7task2.py


## Do the deidentification

In [14]:
# df_fall2021 = pd.read_csv('fall2021-identifiable.ps2')
df_fall2021 = pd.read_csv('fall2021-deidentified.ps2')

students = df_fall2021.SubjectID.unique()
# students = set(students[:8])
# students = set(['Thomas Nelson'])
print('Deidentifying {}'.format(students))
df = df_fall2021[df_fall2021.SubjectID.isin(students)].copy()

tic = time.perf_counter()
df,ms,programs,program_heads,deleted = deidentifyps2(df)
toc = time.perf_counter()
print(f"Masked {toc - tic:0.4f} seconds")

print(f'Masked strings: {ms}')

with open('deidentified-programs.txt', 'w') as f:
    f.write(programs)
with open('deidentified-program-heads.txt', 'w') as f:
    f.write(program_heads)
with open('deidentified-deleted.txt', 'w') as f:
    f.write(deleted)
if not df is None:
    df = df.drop(['level_0','uniqueID','index'], axis=1)
    df.to_csv('fall2021-deidentified-2.ps2')

Deidentifying ['Thomas Nelson' 'Joshua Wright' 'Mia Reynolds' 'Grant Sorenson'
 'Anna Taylor' 'Dane Rasmussen' 'Dallin Wilson' 'Tyler Bowden'
 'Wyatt Chadwick' 'Perry Sundstrom' 'Jacob Dayley' 'Camden Glover'
 'Benson Riley' 'Michael Scott' 'Harrison Wynn' 'Colton Martin'
 'Jake Murdock' 'Landon Reber' 'Rhett Parry' 'Ryan Gubler' 'Alex Spencer'
 'Holli Cook' 'Brooklyn Haight' 'Carli Mano' 'Harrison Beckett'
 'Matthew White' 'Devin Winters' 'Braden Christensen' 'Dillan Hart'
 'Kaili Pearson' 'Ben Smith' 'Lauren Rose' 'Reber Landon' 'Glover Camden'
 'Heidi Anderson' 'Linlea West' 'Tommy Thomsen Bolton' 'Emma Wright'
 'Brighton Ellis' 'Gabriel Erb' 'Jaron Grass' 'Laura Adams'
 'Michael Westberg' 'Abby Englund' 'Eddie Faires' 'Oscar Lopez'
 'Chadwick Wyatt']
Thomas Nelson - Assign10 - task1.py
Thomas Nelson - Assign10 - wordinator.py
Joshua Wright - Assign10 - task1.py
Joshua Wright - Assign10 - Task 2.py
Joshua Wright - Assign10 - Task 1.1.py
Joshua Wright - Assign10 - Task 1.py
Joshua Wr

Harrison Wynn - Assign11 - pattern.py
Harrison Wynn - Assign11 - scratch.py
Harrison Wynn - Assign11 - scratch_2.py
Harrison Wynn - Assign11 - scratch_3.py
Harrison Wynn - Assign11 - scratch_4.py
Harrison Wynn - Assign11 - orbian.py
Harrison Wynn - Assign11 - scratch_14.py
Benson Riley - Assign11 - task1.py
Benson Riley - Assign11 - task2.py
Benson Riley - Assign11 - Task2.py
Benson Riley - Assign11 - Task1.py
Benson Riley - Assign11 - assn7-task2-starter.py
Benson Riley - Assign11 - chessboard.py
Benson Riley - Assign11 - unit5_task1_starter.py
Benson Riley - Assign11 - wordinator.py
Benson Riley - Assign11 - orbian.py
Rhett Parry - Assign11 - task1.py
Rhett Parry - Assign11 - task2.py
Rhett Parry - Assign11 - blobber.py
Rhett Parry - Assign11 - wordinator.py
Rhett Parry - Assign11 - orbian.py
Reber Landon - Assign11 - orbian.py
Reber Landon - Assign11 - task1.py
Reber Landon - Assign11 - tt.py
Jake Murdock - Assign11 - task1.py
Jake Murdock - Assign11 - orbian.py
Joshua Wright - Assi

Dallin Wilson - Assign7 - task.py
Brooklyn Haight - Assign7 - task2.py
Brooklyn Haight - Assign7 - task1.py
Brooklyn Haight - Assign7 - chessboard.py
Brooklyn Haight - Assign7 - plan2.txt.py
Anna Taylor - Assign7 - task1.py
Anna Taylor - Assign7 - task2.py
Anna Taylor - Assign7 - chessboard.py
Anna Taylor - Assign7 - practice.py
Brighton Ellis - Assign7 - scratchBook.py
Brighton Ellis - Assign7 - task1.py
Brighton Ellis - Assign7 - task2.py
Brighton Ellis - Assign7 - primes.py
Brighton Ellis - Assign7 - scratchWork.py
Brighton Ellis - Assign7 - chessboard.py
Brighton Ellis - Assign7 - multiplication table.py
Brighton Ellis - Assign7 - junk.py
Michael Scott - Assign9 - main.py
Michael Scott - Assign9 - task2.py
Michael Scott - Assign9 - task1.py
Michael Scott - Assign9 - Chessboard.py
Michael Scott - Assign9 - pattern.py
Camden Glover - Assign9 - task1.py
Camden Glover - Assign9 - modulesTask1.py
Camden Glover - Assign9 - task2.py
Camden Glover - Assign9 - blobber.py
Camden Glover - Ass

Jacob Dayley - Assign8 - assn4-task3-starter.py
Jacob Dayley - Assign8 - task1.py
Landon Reber - Assign8 - task1.py
Landon Reber - Assign8 - task2.py
Landon Reber - Assign8 - chessboard.py
Landon Reber - Assign8 - pattern.py
Landon Reber - Assign8 - testingFile.py
Heidi Anderson - Assign8 - task1.py
Heidi Anderson - Assign8 - pattern.py
Heidi Anderson - Assign8 - main.py
Heidi Anderson - Assign8 - chessboard.py
Harrison Wynn - Assign8 - Task1.py
Harrison Wynn - Assign8 - pattern.py
Harrison Wynn - Assign8 - scratch.py
Harrison Wynn - Assign8 - scratch_2.py
Rhett Parry - Assign8 - task1.py
Rhett Parry - Assign8 - pattern.py
Ryan Gubler - Assign8 - Task1.py
Ryan Gubler - Assign8 - Task2.py
Ryan Gubler - Assign8 - task1.py
Ryan Gubler - Assign8 - task2.py
Ryan Gubler - Assign8 - chessboard.py
Ryan Gubler - Assign8 - pattern.py
Colton Martin - Assign8 - main.py
Colton Martin - Assign8 - task1.py
Colton Martin - Assign8 - starter.py
Colton Martin - Assign8 - pattern.py
Jake Murdock - Assign

Lauren Rose - Assign6 - task2.py
Wyatt Chadwick - Assign6 - messing round.py
Wyatt Chadwick - Assign6 - Task1.py
Wyatt Chadwick - Assign6 - Task2.py
Thomas Nelson - Assign6 - task1.py
Thomas Nelson - Assign6 - main.py
Thomas Nelson - Assign6 - seeddemo-1.py
Thomas Nelson - Assign6 - flukytester.py
Thomas Nelson - Assign6 - task2.py
Anna Taylor - Assign6 - ComputeExpression.py
Anna Taylor - Assign6 - main.py
Anna Taylor - Assign6 - welcome.py
Anna Taylor - Assign6 - task1.py
Anna Taylor - Assign6 - task2.py
Oscar Lopez - Assign6 - main.py
Oscar Lopez - Assign6 - Task1.py
Oscar Lopez - Assign6 - Task.2.py
Brooklyn Haight - Assign6 - task2.py
Brooklyn Haight - Assign6 - task1.py
Dallin Wilson - Assign6 - trash.py
Dallin Wilson - Assign6 - task1.py
Dallin Wilson - Assign6 - task2.py
Brighton Ellis - Assign6 - scratchBook.py
Brighton Ellis - Assign6 - task1.py
Brighton Ellis - Assign6 - task2.py
Brighton Ellis - Assign6 - scratchWork.py
Rhett Parry - Assign13 - task2.py
Rhett Parry - Assign

Lauren Rose - Assign13 - task1.py
Lauren Rose - Assign13 - task2.py
Lauren Rose - Assign13 - chessboard.py
Lauren Rose - Assign13 - main.py
Lauren Rose - Assign13 - pattern.py
Lauren Rose - Assign13 - test.py
Landon Reber - Assign12 - unit6_task1_starter.py
Landon Reber - Assign12 - task2.py
Landon Reber - Assign12 - orbian.py
Landon Reber - Assign12 - unit6_task3_starter.py
Rhett Parry - Assign12 - task1.py
Rhett Parry - Assign12 - task3.py
Rhett Parry - Assign12 - task2.py
Rhett Parry - Assign12 - card.py
Rhett Parry - Assign12 - gronkyutil.py
Rhett Parry - Assign12 - deck.py
Jake Murdock - Assign12 - task1.py
Jake Murdock - Assign12 - task2.py
Jake Murdock - Assign12 - task3.py
Joshua Wright - Assign12 - __init__.py
Joshua Wright - Assign12 - Task 1.py
Joshua Wright - Assign12 - orbian_starter.py
Joshua Wright - Assign12 - orbian.py
Carli Mano - Assign12 - task1.py
Carli Mano - Assign12 - task2.py
Carli Mano - Assign12 - unit6_task3_starter.py
Carli Mano - Assign12 - task3.py
Carli 

Masked 627.9760 seconds
Masked strings: {'Joshua'}


## Remove paths from "Creating file" X-Actions.

In [143]:
# Actions create files and give the filename. Replace those.
#   ("Creating file C:\Users\@@@@.@@@@@\Desktop\CS 1400\@@@@-@@@@@-Assn8\plan1.txt"
ddf = pd.read_csv('fall2021-deidentified.ps2')

r = re.compile(r'Creating file .*\\')
mask = (ddf.EventType == 'X-Action')&(ddf['X-Metadata'].str.contains(r))
ddf.loc[mask, 'X-Metadata'] = ddf.loc[mask,'X-Metadata'].str.replace(r, 'Creating file ')

r = re.compile(r'Creating file .*/')
mask = (ddf.EventType == 'X-Action')&(ddf['X-Metadata'].str.contains(r))
ddf.loc[mask, 'X-Metadata'] = ddf.loc[mask,'X-Metadata'].str.replace(r, 'Creating file ')

ddf.to_csv('fall2021-deidentified.ps2')


## Fix Joshua Wright

In [165]:
ddf = pd.read_csv('fall2021-deidentified.ps2')
ddf.loc[ddf.SubjectID == 'Joshus Wright', 'SubjectID'] = 'Joshua Wright'
ddf.to_csv('fall2021-deidentified.ps2')

## De-identify X-Metadata (for X-Copy and X-Paste and whatever else might need it)

In [187]:
ddf = pd.read_csv('fall2021-deidentified.ps2')

In [182]:
temp = ddf.copy()
subjects = temp.SubjectID.unique()
for subjectID in subjects:
    r = re.compile(subjectID2mask_re(subjectID, include_Anum=False), flags=re.IGNORECASE)
    mask = (temp.SubjectID == subjectID)&(temp['X-Metadata'].notna())
    temp.loc[mask, 'X-Metadata'] = temp.loc[mask, 'X-Metadata'].str.replace(r, '-@@@-')
temp.to_csv('fall2021-deidentified.ps2')


In [189]:
temp = ddf.copy()
mask = temp.SubjectID == 'Eddie Faires'
# display(temp[mask].head())
temp.loc[mask, 'CodeStateSection'] = temp.loc[mask, 'CodeStateSection'].str.replace('Faires-Eddie-','')
# temp[mask].head()
temp.to_csv('fall2021-deidentified.ps2')

## Check

In [190]:
ddf = pd.read_csv('fall2021-deidentified.ps2')
temp = ddf.copy()
temp.SubjectID = ''
temp.to_csv('test.ps2')

In [191]:
s = ''
with open('test.ps2') as f:
    lines = f.readlines()
    s = ''.join(lines)
for subjectID in ddf.SubjectID.unique():
    st = subjectID2mask_re(subjectID, include_Anum=False, substrings=False)
    for it in re.finditer(st, s, flags=re.IGNORECASE):
        print(it[0], it.start(), it.end())

anna 264153015 264153019
anna 264154496 264154500
anna 264289978 264289982
anna 264291816 264291820
anna 265886994 265886998
anna 265890555 265890559
anna 265892442 265892446
anna 265894079 265894083
white 41172131 41172136
white 41172307 41172312
white 41173471 41173476
white 41173808 41173813
white 123830943 123830948
white 179562226 179562231
white 179578147 179578152
white 180092062 180092067
West 247293977 247293981
West 247295452 247295456
West 247306427 247306431
erb 48521973 48521976
erb 48524674 48524677
erb 48868278 48868281
erb 72056451 72056454
erb 72056626 72056629
erb 72057322 72057325
erb 72057495 72057498
erb 118709638 118709641
erb 188570405 188570408
erB 212258753 212258756


In [186]:
ddf.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,EventID,EventType,...,ToolInstances,CodeStateID,CodeStateSection,EditType,InsertText,DeleteText,ClientTimestamp,X-File,AssignmentID,X-Metadata
0,0,0,0,0,1,1,1,2,2,File.Edit,...,PC;PP 1.1.10,,0.0,Insert,# @@@@@@@@@@@@@\n# CS1400 - MW1\n# Assignment ??,,1636696834375,task1.py,Assign10,
1,1,1,1,1,2,2,2,3,3,File.Edit,...,PC;PP 1.1.10,,46.0,Insert,\n,,1636696835921,task1.py,Assign10,
2,2,2,2,2,3,3,3,4,4,X-Action,...,PC;PP 1.1.10,,,,,,1636696838793,task1.py,Assign10,Enter
3,3,3,3,3,4,4,4,5,5,File.Edit,...,PC;PP 1.1.10,,47.0,Insert,\n,,1636696839138,task1.py,Assign10,
4,4,4,4,4,5,5,5,6,6,X-Action,...,PC;PP 1.1.10,,,,,,1636696840014,task1.py,Assign10,Paste


In [168]:
s = ddf.SubjectID.unique()
s.sort()
print(s)

['Abby Englund' 'Alex Spencer' 'Anna Taylor' 'Ben Smith' 'Benson Riley'
 'Braden Christensen' 'Brighton Ellis' 'Brooklyn Haight' 'Camden Glover'
 'Carli Mano' 'Chadwick Wyatt' 'Colton Martin' 'Dallin Wilson'
 'Dane Rasmussen' 'Devin Winters' 'Dillan Hart' 'Eddie Faires'
 'Emma Wright' 'Gabriel Erb' 'Glover Camden' 'Grant Sorenson'
 'Harrison Beckett' 'Harrison Wynn' 'Heidi Anderson' 'Holli Cook'
 'Jacob Dayley' 'Jake Murdock' 'Jaron Grass' 'Joshua Wright'
 'Kaili Pearson' 'Landon Reber' 'Laura Adams' 'Lauren Rose' 'Linlea West'
 'Matthew White' 'Mia Reynolds' 'Michael Scott' 'Michael Westberg'
 'Oscar Lopez' 'Perry Sundstrom' 'Reber Landon' 'Rhett Parry'
 'Ryan Gubler' 'Thomas Nelson' 'Tommy Thomsen Bolton' 'Tyler Bowden'
 'Wyatt Chadwick']


# Students

In [40]:
students = pd.read_csv('data/students.csv')
keystrokes = pd.read_csv('data/fall2021-deidentified.ps2')


In [39]:

# {'Chadwick Wyatt',
keystrokes.loc[keystrokes.SubjectID == 'Chadwick Wyatt', 'SubjectID'] = 'Wyatt Chadwick'

#  'Colton Martin',
students.loc[students.Name == 'olton James Martin', 'Name'] = 'Colton James Martin'
students.loc[students.Name == 'Colton James Martin', 'Name'] = 'Colton Martin'

#  'Dillan Hart',
students.loc[students.Name == 'Dillan S Hart', 'Name'] = 'Dillan Hart'

#  'Eddie Faires', # I think he dropped the class
print(len(keystrokes[keystrokes.SubjectID == 'Eddie Faires']))
#  'Gabriel Erb', # He may have dropped it too
print(len(keystrokes[keystrokes.SubjectID == 'Gabriel Erb']))
#  'Glover Camden',
keystrokes.loc[keystrokes.SubjectID == 'Glover Camden', 'SubjectID'] = 'Camden Glover'
#  'Harrison Beckett',
students.loc[students.Name == 'Harrison Dale Beckett', 'Name'] = 'Harrison Beckett'
#  'Joshua Wright',
students.loc[students.Name == 'Josh Wright', 'Name'] = 'Joshua Wright'
#  'Laura Adams',
students.loc[students.Name == 'Laura Lyn Adams', 'Name'] = 'Laura Adams'
#  'Linlea West',
students.loc[students.Name == 'Linlea Harrison West', 'Name'] = 'Linlea West'
#  'Oscar Lopez', # dropped
print(len(keystrokes[keystrokes.SubjectID == 'Oscar Lopez']))
#  'Reber Landon',
keystrokes.loc[keystrokes.SubjectID == 'Reber Landon', 'SubjectID'] = 'Landon Reber'
#  'Tommy Thomsen Bolton'}
keystrokes.loc[keystrokes.SubjectID == 'Tommy Thomsen Bolton', 'SubjectID'] = 'Tommy Bolton'

students.to_csv('data/students.csv')
keystrokes.to_csv('data/fall2021-deidentified.ps2')


3771
24803
9376


In [41]:
s = set(students.Name)
d = set(keystrokes.SubjectID.unique())
display(d-s)

{'Eddie Faires', 'Gabriel Erb', 'Oscar Lopez'}

## Grades

In [100]:
grades = pd.read_csv('data/grades.csv')
# print(list(grades.columns))
grades = grades[['Student', 'Assignment 7 (3477106)', 'Assignment 8 (3477107)', 'Assignment 9 (3477108)',
                 'Assignment 10 (3477097)', 'Assignment 11 (3477098)', 'Assignment 12 (3477099)',
                 'Exam 1 (Remotely Proctored) (3477077)', 'Exam 2 (Remotely Proctored) (3477088)',
                 'Exam 3 (Remotely Proctored) (3477080)', 'Final Score']]
grades.columns = ['Name', 'Assign7', 'Assign8', 'Assign9',
                 'Assign10', 'Assign11', 'Assign12',
                 'Exam1', 'Exam2', 'Exam3', 'FinalScore']
spl = grades.Name.str.split(',', expand=True)
grades.Name = spl[1] + ' ' + spl[0]
grades.Name = grades.Name.str.strip()

grades.loc[grades.Name == 'Colton James Martin', 'Name'] = 'Colton Martin'
grades.loc[grades.Name == 'Dillan S Hart', 'Name'] = 'Dillan Hart'
grades.loc[grades.Name == 'Harrison Dale Beckett', 'Name'] = 'Harrison Beckett'
grades.loc[grades.Name == 'Josh Wright', 'Name'] = 'Joshua Wright'
grades.loc[grades.Name == 'Laura Lyn Adams', 'Name'] = 'Laura Adams'
grades.loc[grades.Name == 'Linlea Harrison West', 'Name'] = 'Linlea West'
grades = grades[grades.Name.isin(set(keystrokes.SubjectID.unique()))]
# display(grades.Name)
sg = pd.merge(grades, students, how='outer', on='Name')
# sg
sg = sg[['Name', 'Assign7', 'Assign8', 'Assign9', 'Assign10', 'Assign11',
       'Assign12', 'Exam1', 'Exam2', 'Exam3', 'FinalScore', 'PRIMARY_MAJOR', 'HIGHEST_COMPOSITE_ACT', 'HS_GPA']]
sg.columns = ['SubjectID', 'Assign7', 'Assign8', 'Assign9', 'Assign10', 'Assign11',
       'Assign12', 'Exam1', 'Exam2', 'Exam3', 'FinalScore', 'major', 'HighestACT', 'HighSchoolGPA']
# display(sg)

sg = sg[sg.SubjectID.isin(set(keystrokes.SubjectID.unique()))]
# 'Eddie Faires', 'Gabriel Erb', 'Oscar Lopez'
sg = sg.append(pd.DataFrame({'SubjectID':['Eddie Faires', 'Gabriel Erb', 'Oscar Lopez']}))
# display(sg)

print(len(sg.SubjectID.unique()))
print(len(keystrokes.SubjectID.unique()))
# display(set(keystrokes.SubjectID.unique())-(set(students.Name)))
# display(set(keystrokes.SubjectID.unique())-(set(grades.Name)))
# display(set(keystrokes.SubjectID.unique()) - set(sg.SubjectID.unique()))
# display(grades.Name)

44
44


In [105]:
runs = pd.read_csv('data/fall2021-run.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [106]:
runs.columns
runs = runs[['SubjectID', 'AssignmentID', 'File', 'Action', 'RunID', 'Output', 'Source',
       'OutputDestination', 'ClientTimestamp']]


In [109]:
len(runs.SubjectID.unique())
runs.loc[runs.SubjectID == 'Chadwick Wyatt', 'SubjectID'] = 'Wyatt Chadwick'
runs.loc[runs.SubjectID == 'Glover Camden', 'SubjectID'] = 'Camden Glover'
runs.loc[runs.SubjectID == 'Reber Landon', 'SubjectID'] = 'Landon Reber'
runs.loc[runs.SubjectID == 'Tommy Thomsen Bolton', 'SubjectID'] = 'Tommy Bolton'
len(runs.SubjectID.unique())


65

In [110]:
s = runs.SubjectID.unique()

In [114]:
m = {'Thomsen Bolton':'Tommy Bolton', 'Assn10 SmithBen':'Ben Smith', 'Joshus Wright':'Joshua Wright'}
runs.SubjectID = runs.SubjectID.replace(m)
print(len(runs))
runs = runs[runs.SubjectID.isin(set(keystrokes.SubjectID.unique()))]
print(len(runs))


47613543
39658193


In [115]:
runs.to_csv('data/fall2021-run.csv')

## Replace names and write final files

In [None]:
import random
students = sg
names = list(students.SubjectID.unique())
random.shuffle(names)
m = {}
i = 1
for s in names:
    m[s] = f'Student{i}'
    i = i+1


In [122]:
name2id = m
keystrokes.SubjectID = keystrokes.SubjectID.replace(name2id)
students.SubjectID = students.SubjectID.replace(name2id)
runs.SubjectID = runs.SubjectID.replace(name2id)


In [131]:
runs = runs[['SubjectID', 'AssignmentID', 'File', 'Action', 'RunID', 'Output',
       'Source', 'OutputDestination', 'ClientTimestamp']]

In [137]:
keystrokes = keystrokes[['EventID', 'SubjectID', 'AssignmentID', 'CodeStateSection', 'EventType', 'ToolInstances', 'CodeStateID', 'SourceLocation', 'EditType',
       'InsertText', 'DeleteText', 'X-Metadata', 'ClientTimestamp']]

In [148]:
keystrokes.to_csv('data-fall2021-deidentified/keystrokes.csv')
students.to_csv('data-fall2021-deidentified/students.csv')
runs.to_csv('data-fall2021-deidentified/runs.csv')


## Sort keystrokes by subject then timestamp

In [223]:
keystrokes = pd.read_csv('data-fall2021-deidentified/keystrokes.csv')

In [225]:
keystrokes = keystrokes.sort_values(['SubjectID','ClientTimestamp'])
keystrokes.to_csv('data-fall2021-deidentified/keystrokes.csv')

## Add number of output lines from run file

In [211]:
test = runs#runs[(runs.SubjectID == 'Student38')|(runs.SubjectID == 'Student39')].copy()
# display(len(test))

In [None]:
# test2 = test.copy()
test2 = test[0:100].copy()
test2 = test2.sort_values(['SubjectID','ClientTimestamp'])
test2.File = test2.

y = test2.Action == 'o'
y = y * (y.groupby((y != y.shift()).cumsum()).cumcount() + 1)
test2['y'] = y
test2 = test2[['SubjectID', 'AssignmentID', 'File', 'Action', 'y', 'RunID', 'Output',
       'Source', 'OutputDestination', 'ClientTimestamp']]
z = test2.groupby(['SubjectID','AssignmentID','RunID']).agg({'y':'max'}).reset_index()
z = z.rename({'y':'NumOutputLines'}, axis='columns', copy=False)
display(z)
# test2 = test2[test2.Action != 'o']
test2 = pd.merge(test2, z)
# test2.loc[test2.Action == 'o', 'NumOutputLines'] = np.nan
test2 = test2[['SubjectID', 'AssignmentID', 'File', 'Action', 'RunID', 'NumOutputLines',
               'Source', 'OutputDestination', 'Output', 'ClientTimestamp']]
# print(test2.columns)
display(test2.head(20))
# test2.groupby('SubjectID').count()

runs = test2

In [197]:
runs.to_csv('data-fall2021-deidentified/runs.csv')

## Fix r@@@@m and he@@@@

In [259]:
keystrokes = pd.read_csv('data-fall2021-deidentified/keystrokes.csv')

In [275]:
keystrokes_orig = keystrokes.copy()

In [281]:
keystrokes.size
# len(keystrokes)

33421856

In [296]:
keystrokes = keystrokes_orig.copy()
cols = keystrokes.columns
ids = keystrokes.SubjectID+keystrokes.AssignmentID+keystrokes['CodeStateSection']
keystrokes['ID'] = ids
ids_to_check = ['Student15Assign7task2.py','Student15Assign7chessboard.py','Student15Assign8task1.py',
                'Student15Assign8pattern.py','Student15Assign9task2.py','Student44Assign6task1.py',
                'Student44Assign6task2.py','Student44Assign8pattern.py','Student44Assign11task1.py']
all_masked = []
for i in ids_to_check:
    df_mask = (keystrokes.ID == i)
    df = keystrokes[df_mask].copy()
    keystrokes = keystrokes[~df_mask].copy()
    masked, masked_strings = mask_df(df, 'he@@@@', replace_with='height')
    masked, masked_strings = mask_df(masked, 'r@@@@m', replace_with='random')
    all_masked.append(masked)
#     program,deleted_text,_,_,_ = reconstruct(masked)
#     if program.find('he@@@@') > -1:
#         print('h -', i)
#     if program.find('r@@@@m') > -1:
#         print('r -', i)
    
all_masked = pd.concat(all_masked)
new_keystrokes = pd.concat([keystrokes, all_masked])

# keystrokes_orig[keystrokes_orig.ID.isin(set(ids_to_check))].to_csv('old.csv')
# new_keystrokes[new_keystrokes.ID.isin(set(ids_to_check))].to_csv('new.csv')

# Remove ID column
new_keystrokes = new_keystrokes[cols]
keystrokes_orig = keystrokes_orig[cols]

diff = pd.concat([keystrokes_orig, new_keystrokes]).drop_duplicates(keep=False)
display(diff)

# display(keystrokes.head())

# # masked, masked_strings = mask_df(df, subjectID2mask_re(subjectID))
# # program, deleted_text,_,_,_ = reconstruct(masked)
# # print(f'Masked strings: {masked_strings}')

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,EventID,SubjectID,AssignmentID,X-File,EventType,ToolInstances,CodeStateID,CodeStateSection,EditType,InsertText,DeleteText,X-Metadata,ClientTimestamp,ID
195712,592218,592218,326,Student15,Assign7,task2.py,File.Edit,PC;PP 1.1.10,,53.0,Insert,'''\n This is the starter file. Only fill i...,,,1634751816649,Student15Assign7task2.py
195941,593280,593280,598,Student15,Assign7,chessboard.py,File.Edit,PC;PP 1.1.10,,92.0,Insert,drawChessboard function\n 1. Create...,,,1634950715304,Student15Assign7chessboard.py
195973,593312,593312,630,Student15,Assign7,chessboard.py,File.Edit,PC;PP 1.1.10,,92.0,Delete,,drawChessboard function\n 1. Create...,,1634950718605,Student15Assign7chessboard.py
195975,593314,593314,632,Student15,Assign7,chessboard.py,File.Edit,PC;PP 1.1.10,,92.0,Insert,drawChessboard function\n 1. Create formal ...,,,1634950728780,Student15Assign7chessboard.py
196146,593485,593485,803,Student15,Assign7,chessboard.py,File.Edit,PC;PP 1.1.10,,143.0,Insert,@,,,1634950954096,Student15Assign7chessboard.py
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,128029,128029,9,Student44,Assign11,task1.py,File.Edit,PC;PP 1.1.10,,0.0,Insert,from modules.orbian import Orbian\nfrom time i...,,,1637094270326,Student44Assign11task1.py
1976,130004,130004,4571,Student44,Assign11,task1.py,File.Edit,PC;PP 1.1.10,,4283.0,Insert,a,,,1637198201055,Student44Assign11task1.py
1978,130006,130006,4573,Student44,Assign11,task1.py,File.Edit,PC;PP 1.1.10,,4284.0,Insert,n,,,1637198201411,Student44Assign11task1.py
1980,130008,130008,4575,Student44,Assign11,task1.py,File.Edit,PC;PP 1.1.10,,4285.0,Insert,d,,,1637198201502,Student44Assign11task1.py


In [297]:
keystrokes = new_keystrokes
keystrokes = keystrokes.sort_values(['SubjectID','ClientTimestamp'])
keystrokes.to_csv('data-fall2021-deidentified/keystrokes.csv')

In [305]:
keystrokes.columns
keystrokes = keystrokes[['EventID', 'SubjectID', 'AssignmentID',
       'CodeStateSection', 'EventType', 'ToolInstances', 'CodeStateID',
       'SourceLocation', 'EditType', 'InsertText', 'DeleteText',
       'X-Metadata', 'ClientTimestamp', 'ID']]
keystrokes.to_csv('data-fall2021-deidentified/keystrokes.csv')

## Some checking

In [226]:
runs = pd.read_csv('data-fall2021-deidentified/runs.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [235]:
keystrokes = pd.read_csv('data-fall2021-deidentified/keystrokes.csv')
students = pd.read_csv('data-fall2021-deidentified/students.csv')


In [231]:
# runs[runs.RunID == 'r']
# runs.iloc[35786064-10:35786064+10]
print(len(runs))
runs = runs[runs.RunID != 'r']
print(len(runs))


39658193
39658192


In [232]:
runs.to_csv('data-fall2021-deidentified/runs.csv')

In [258]:
runs.head()

Unnamed: 0.1,Unnamed: 0,SubjectID,AssignmentID,File,Action,RunID,NumOutputLines,Source,OutputDestination,Output,ClientTimestamp
0,0,Student1,Assign6,task1.py,r,5,8.0,PythonRunner,,,1634537000000.0
1,1,Student1,Assign6,,o,5,,,stdout,Fluky Number : 155\n,1634537000000.0
2,2,Student1,Assign6,,o,5,,,stdout,Fluky Number : 168\n,1634537000000.0
3,3,Student1,Assign6,,o,5,,,stdout,Fluky Number : 357\n,1634537000000.0
4,4,Student1,Assign6,,o,5,,,stdout,Fluky Number : 580\n,1634537000000.0


In [344]:
due = pd.read_csv('data-fall2021-deidentified/due.csv')
due = due[['AssignmentID', 'Due MST', 'Due UTC', 'Timestamp']]
# due['Due UTC'] = pd.to_datetime(due['Due MST'], infer_datetime_format=True, utc=True)  
due['Timestamp'] = pd.to_datetime(due['Due UTC']).apply(lambda x: int(x.timestamp()*1000))
# due['Timestamp'] = pd.Timestamp(due['Due UTC'])#.timestamp()
due.to_csv('data-fall2021-deidentified/due.csv', index=False)

due


Unnamed: 0,AssignmentID,Due MST,Due UTC,Timestamp
0,Assign6,2021-10-18 23:59 -0600,2021-10-19 05:59:00+00:00,1634623140000
1,Assign7,2021-10-22 23:59 -0600,2021-10-23 05:59:00+00:00,1634968740000
2,Assign8,2021-10-29 23:59 -0600,2021-10-30 05:59:00+00:00,1635573540000
3,Assign9,2021-11-5 23:59 -0600,2021-11-06 05:59:00+00:00,1636178340000
4,Assign10,2021-11-12 23:59 -0700,2021-11-13 06:59:00+00:00,1636786740000
5,Assign11,2021-11-19 23:59 -0700,2021-11-20 06:59:00+00:00,1637391540000
6,Assign12,2021-12-3 23:59 -0700,2021-12-04 06:59:00+00:00,1638601140000
7,Assign13,2021-12-10 23:59 -0700,2021-12-11 06:59:00+00:00,1639205940000


In [358]:
temp = keystrokes[(keystrokes.SubjectID == 'Student15')&(keystrokes.AssignmentID == 'Assign8')&(keystrokes['CodeStateSection'] == 'task2.py')]
d = due[due.AssignmentID == 'Assign8'].Timestamp
print(d)
diff = d - temp.ClientTimestamp
diff/(1000*60*60)

2    1635573540000
Name: Timestamp, dtype: int64


2        NaN
204453   NaN
204454   NaN
204457   NaN
204458   NaN
204472   NaN
204473   NaN
204474   NaN
204475   NaN
204478   NaN
204479   NaN
204486   NaN
204487   NaN
204494   NaN
204495   NaN
204500   NaN
204501   NaN
204509   NaN
204510   NaN
204511   NaN
204517   NaN
204518   NaN
204523   NaN
204524   NaN
204529   NaN
204530   NaN
204539   NaN
dtype: float64

In [244]:
l = list(set(keystrokes.SubjectID.unique()) - set(keystrokes[keystrokes.AssignmentID == 'Assign13'].SubjectID.unique()))
l.sort()
display(l)
display(students.sort_values('SubjectID'))

['Student10',
 'Student11',
 'Student13',
 'Student14',
 'Student15',
 'Student17',
 'Student2',
 'Student20',
 'Student22',
 'Student23',
 'Student25',
 'Student31',
 'Student32',
 'Student33',
 'Student34',
 'Student35',
 'Student4',
 'Student40',
 'Student41',
 'Student44',
 'Student8']

Unnamed: 0.1,Unnamed: 0,SubjectID,Assign6,Assign7,Assign8,Assign9,Assign10,Assign11,Assign12,Assign13,Exam1,Exam2,Exam3,FinalScore,major,HighestACT,HighSchoolGPA
19,19,Student1,99.0,99.0,100.0,100.0,98.0,0.0,82.0,86.0,90.0,86.0,80.0,97.7,Computer Science,34.0,3.96
7,7,Student10,95.0,100.0,100.0,85.0,88.0,94.0,100.0,0.0,88.0,86.0,84.0,97.6,Non-Matriculated,30.0,3.68
0,0,Student11,70.0,88.0,45.0,63.0,86.0,97.0,20.0,88.0,82.0,82.0,78.0,89.58,Non-Matriculated,,
23,23,Student12,79.0,87.5,96.0,95.0,90.0,86.0,99.0,77.0,76.0,78.0,86.0,95.73,Computer Science,15.0,3.95
33,33,Student13,73.0,100.0,99.0,0.0,79.0,100.0,100.0,95.0,82.0,80.0,76.0,94.52,Computer Science,30.0,4.0
41,0,Student14,,,,,,,,,,,,,,,
9,9,Student15,93.0,86.0,100.0,92.0,83.0,93.0,76.0,66.0,84.0,82.0,84.0,96.88,Computer Science,25.0,3.83
22,22,Student16,90.0,75.0,86.0,75.0,76.0,96.0,90.0,98.0,86.0,80.0,84.0,92.77,Computer Science,30.0,3.48
3,3,Student17,89.0,96.0,100.0,94.0,91.0,100.0,89.0,0.0,88.0,84.0,80.0,96.95,Statistics,26.0,3.933
31,31,Student18,0.0,89.0,95.0,96.0,92.0,88.0,78.0,51.0,82.0,86.0,90.0,94.19,Management,33.0,3.74


# Interactive deidentification

In [None]:
from IPython.display import display
import ipywidgets as widgets
from IPython.core.display import HTML


In [None]:
def populate():
    global masked
    program, deleted_text,_,_,_ = reconstruct(masked)
#     progTextWidget = widgets.Textarea(program + '\n\n**** Deleted ****\n\n' + deleted_text, layout=widgets.Layout(width='95%', height='350px'))
    text = program + '\n\n******** Deleted ********\n\n' + deleted_text
#     text = text + '\n\n'
#     text = text + '****************************************************\n'
#     text = text + '****************************************************\n'
#     text = text + '****************************************************\n'
#     text = text + '****************************************************\n'
#     text = text + '****************************************************\n'
#     text = text + '\n\n'
#     text = text + '** Inserts **\n'
#     text = text + ''.join(masked[(masked.EventType == ET_EDIT)&(masked.EditType == INSERT)].InsertText)
#     text = text + '** Keystrokes **\n'
#     text = text + ''.join(masked[masked.EventType == ET_KEY].InsertText)
    progTextWidget.value = text

def undo_btn_eventhandler(obj):
    global masked, stack
    if len(stack) > 0:
        masked = stack[-1]
        stack = stack[:-1]
        populate()
        maskStringWidget.value = ''
        
def do_mask(text_to_replace):
    global masked, stack
#     maskStringWidget.value = to_replace
    stack = stack + [masked]
    masked = mask_df(masked, text_to_replace)
    populate()
    maskStringWidget.value = ''
    
def mask_btn_eventhandler(obj):
    do_mask(maskStringWidget.value)
#     global masked, stack
#     stack = stack + [masked]
#     text_to_replace = maskStringWidget.value
#     masked = mask_df(masked, text_to_replace)
#     populate()
#     maskStringWidget.value = ''
    

# Undo button
undo_btn = widgets.Button(description='Undo')
undo_btn.on_click(undo_btn_eventhandler)
display(undo_btn)

# Mask button
btn = widgets.Button(description='Mask')
btn.on_click(mask_btn_eventhandler)
display(btn)

# Make text in program widget monospace
display(HTML("<style>textarea, input { font-family: monospace; }</style>"))

# Mask string widget
maskStringWidget = widgets.Text('', description="Mask string")
display(maskStringWidget)

# Program text widget
progTextWidget = widgets.Textarea(layout=widgets.Layout(width='95%', height='600px'))
display(progTextWidget)

def pp_interactive_mask(fn, first_name, last_name, assign_id):
    global masked, stack
    student_id = first_name+' '+last_name
    df = pyphanon2progsnap2(fn, student_id, assign_id)
    masked = df
    stack = []
    populate()

    btn.disabled = True
    do_mask(last_name)
    do_mask(first_name)
    do_mask(last_name.lower())
    do_mask(first_name.lower())
    do_mask(last_name.upper())
    do_mask(first_name.upper())
    btn.disabled = False
    

def syw_interactive_mask(fn, student_id, assign_id):
    global masked, stack
    # df = pd.read_csv('deident.csv')#, header=None)
    df = showyourwork2progsnap2(fn, student_id, assign_id)
#     df = pyphanon2progsnap2(fn, student_id, assign_id)
    masked = df
    stack = []
    populate()


In [None]:
syw_interactive_mask('data/deident.log', 'John Edwards', 'Assign1')
