In [1]:
import pandas as pd
import numpy as np
import re
import time
import traceback

In [2]:
# event types
# ET_EDIT = 'File.Edit'
# ET_KEY = 'X-Keystroke'
# edit types
INSERT = 'Insert'
DELETE = 'Delete'

In [24]:
class Entry:
    # change_text is the text to be inserted or deleted
    # row_idx is the index of the row in the dataframe that contains the change
    # text_index is the index in the string in InsertText or DeleteText column where the change_text is found
    # insert is a boolean: true if insert, false if delete
    def __init__(self, change_text, row_idx, text_index, insert, deleted_insert_idx=-1):
        self.change_text = change_text
        self.row_idx = row_idx
        self.text_index = text_index
        self.insert = insert
        self.deleted_insert_idx = deleted_insert_idx
        
    def __str__(self):
        return 'text: {}, row_idx: {}, text_index: {}, insert: {}'.format(self.change_text, self.row_idx, self.text_index, self.insert)
    
    def isInsert(self):
        return self.insert
        
    def isDelete(self):
        return not self.insert


## Reconstruct the final submission

In [222]:
def char_diff(row):
    l = 0
    if row.InsertText == row.InsertText:
        l = len(row.InsertText)
    if row.DeleteText == row.DeleteText:
        l -= len(row.DeleteText)
    return l

def reconstruct(df, debug=False):
    # find the number of characters added or subtracted at each step
    charsAdded = df.apply(char_diff, axis=1)
    max_size = charsAdded.cumsum().max()
    # Allocate buffers
    inserts = []
    deletes = []
    deleted_inserts = []
    buf = '' # the end file
    delete_buf = '' # characters that were deleted

    # Create an error data frame holding rows that failed
    error = pd.DataFrame(columns=df.columns)
    error['XError'] = 0
    # Build the file. Iterate over each row of the data frame.
    for row_idx,row in df[df.EventType == 'File.Edit'].iterrows():
#     for row_idx,row in df.iterrows():
        code_idx = int(row.SourceLocation)
#         code_idx = -1
#         if not np.isnan(row.SourceLocation):
#             code_idx = int(row.SourceLocation)
        if row.DeleteText and row.DeleteText == row.DeleteText:
            di_len = len(deleted_inserts)
            deleted_inserts = deleted_inserts + inserts[code_idx:code_idx+len(row.DeleteText)]
            
#             # Use what's in the buffer because what's in DeleteText may not match
            deleted_text = buf[code_idx:code_idx+len(row.DeleteText)]
            if (len(deleted_text) != len(row.DeleteText)):
#                 raise Exception(f'deleted_text != row.DeleteText: {len(deleted_text)} {len(row.DeleteText)}')
                print(f'Warning: len(deleted_text) != len(row.DeleteText): {len(deleted_text)} {len(row.DeleteText)} at index {code_idx}')
            
            inserts = inserts[:code_idx] + inserts[code_idx+len(row.DeleteText):]
            buf = buf[:code_idx] + buf[code_idx+len(row.DeleteText):]
            local_deletes = [Entry(row.DeleteText[k], row_idx, k, False, deleted_insert_idx=di_len+k) for k in range(len(row.DeleteText))]
            deletes = local_deletes + deletes
            delete_buf = row.DeleteText + delete_buf

#             inserts = inserts[:code_idx] + inserts[code_idx+len(deleted_text):]
#             buf = buf[:code_idx] + buf[code_idx+len(deleted_text):]
#             local_deletes = [Entry(deleted_text, row_idx, k, False, deleted_insert_idx=di_len+k) for k in range(len(row.DeleteText))]
#             deletes = local_deletes + deletes
#             delete_buf = deleted_text + delete_buf
        if row.InsertText and row.InsertText == row.InsertText:
            local_inserts = [Entry(row.InsertText[k], row_idx, k, True) for k in range(len(row.InsertText))]
            inserts = inserts[:code_idx] + local_inserts + inserts[code_idx:]
            buf = buf[:code_idx] + row.InsertText + buf[code_idx:]
        else:
            error = error.append(row)
            error.iloc[0, error.columns.get_loc('XError')] = 'Unsupported edit type'
    if debug:
        print('Errors:')
        display(error)

    return buf, delete_buf, inserts, deletes, deleted_inserts


## Fix bug with Jake Miller

In [235]:
df = pd.read_csv('data-2019/bak/project-events.csv')


  exec(code_obj, self.user_global_ns, self.user_ns)


In [237]:
df.head()
test = df[df.user_id == 100036].copy()
test = test[(test.timestamp > 1548976440848)&(test.timestamp < 1548976584332)]
test

Unnamed: 0.1,Unnamed: 0,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,has_error,user_terminated,startLine,startCol,endLine,endCol,operation,key,elapsed
5413731,2707933,100036,131,1,TASK,,,1548976444793,,,,,,,,,TASK,,3945.0
5413732,2707931,100036,131,1,setValue,#We are now going to create a target that is s...,#We are now going to create a target that is s...,1548976444800,,,,,0.0,0.0,51.0,0.0,setValue,,7.0
5413733,2707932,100036,131,0,TASK,,,1548976451216,,,,,,,,,TASK,,6416.0
5413734,2707923,100036,131,0,setValue,#We are now going to create a target that is s...,#We are now going to create a target that is s...,1548976451222,,,,,0.0,0.0,51.0,0.0,setValue,,6.0
5413735,2707934,100036,131,0,+delete,,#We are now going to create a target that is s...,1548976456364,,,,,0.0,0.0,51.0,0.0,+delete,delete,5142.0
5413736,2707935,100036,131,0,+input,p,,1548976583889,,,,,0.0,0.0,0.0,0.0,+input,p,127525.0
5413737,2707936,100036,131,0,+input,r,,1548976584008,,,,,0.0,1.0,0.0,1.0,+input,r,119.0
5413738,2707937,100036,131,0,+input,i,,1548976584120,,,,,0.0,2.0,0.0,2.0,+input,i,112.0


In [232]:
test = df_2019[df_2019.SubjectID.isin(['100036'])].copy()
# test = df_2019[df_2019.SubjectID.isin(list(range(100030,100040)))]
# display(test.head())
# test = test[test.AssignmentID.isin(['129','131','200'])]
# test = test.sort_values(['user_id','timestamp','native_index'])

# test = test[test.CodeStateSection == 'task0.py']

# test.to_csv('test.csv', index=False)
# with open('test.csv') as f:
#     text = f.read()
# text = re.sub(re.compile('jake miller', flags=re.IGNORECASE), '@@@@@@@@@@@', text)
# # print(text[:100])
# with open('test.csv', 'w') as f:
#     f.write(text)
    
# test = pd.read_csv('test.csv')
# test.SubjectID = test.SubjectID.astype('str')
# test.AssignmentID = test.AssignmentID.astype('str')

# test = test[~(test.EditType == 'Replace')]

# If setValue has only a delete then the source location is going to be nan. Fix
# that to be 0.
# test[(test.EventType == 'File.Edit')&(test.SourceLocation.isna())]
# test[~(test.DeleteText.isna())&(test.SourceLocation.isna())]
test.loc[(test.EventType == 'File.Edit')&(test.SourceLocation.isna()), 'SourceLocation'] = 0


# display(test)

# # test[~(test.DeleteText.isna())&(test.EventType != 'File.Edit')].head()

# # test = test.iloc[:1180].copy().reset_index()
# test = test.copy().reset_index()
# # program, deleted_text, entries, deletes, deleted_inserts = reconstruct(test.iloc[:990])
# program, deleted_text, entries, deletes, deleted_inserts = reconstruct(test)
# # print(deleted_text)
# # with open('test.txt', 'w') as f:
# #     f.write(deleted_text)

# # for i in re.finditer('jake', deleted_text, flags=re.IGNORECASE):
# #     print(i)

# mask_re = 'ake m'
# replace_with = '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'

# masked = test.copy()
# # masked, ms1 = mask(masked, program, entries, True, mask_re, replace_with)
# masked, ms2 = mask(masked, deleted_text, deletes, False, mask_re, replace_with, deleted_inserts)

# program, deleted_text, entries, deletes, deleted_inserts = reconstruct(masked.iloc[:2060])
# print(program)
# masked.to_csv('test.csv')

# Everything
id2names = {'100034':['Mary Chidester'], '100036':['jake miller']}
test,ms,programs,program_heads,deleted = deidentifyps2(test, id2names=id2names, header_offset=0)

test.to_csv('test.csv')


100036 - 129 - task0.py
  Masked strings: {'Jake Miller'}
100036 - 129 - task1.py
  Masked strings: {'Jake Miller', 'Jake'}
100036 - 131 - task0.py
  Masked strings: {'Jake Miller'}
100036 - 131 - task1.py
  Masked strings: {'Jake Miller'}
100036 - 133 - task1.py
  Masked strings: {'Brim', 'A02204724', 'Andy', 'Miller'}
100036 - 135 - task0.py
  Masked strings: {'Andy', 'A02204724', 'Jake Miller', 'Jake', 'Brim'}
100036 - 135 - task1.py
  Masked strings: {'Andy', 'A02204724', 'Jake Miller', 'Brim'}
100036 - 138 - task0.py
100036 - 138 - task1.py


## Create a regular expression

In [26]:
# Version used for fall 2021 data
from itertools import permutations, combinations

# Returns a regular expression to match and mask.
# subjectID - the name of the student with each of first, last, etc separated by a space
# include_Anum - whether to include a regular expression to match Aggie A#
def subjectID2mask_re_2021(subjectID, include_Anum=True, substrings=True):
    res = []
    
    # Get "First Last" and "Last First"
    id_parts = subjectID.split() # Probably just first and last name but could include a middle initial
    for i in range(len(id_parts), 0, -1):
        for comb in combinations(id_parts, i):
            perms = permutations(comb)
            for perm in perms:
                r = ' '.join(perm)
                if len(r) > 1:
                    res = res + [r]

    # Find larger substrings of each name. For example, if the name is Christensen, this will add
    # hristense, ristens, and isten. The reason for this is if the name somehow is only partial
    # in the keystrokes. For example, the student might have missed typing the last character.
    # In our working example, this would result in Christense. In this case, we would match
    # hristense and the masked keys would result in C@@@@@@@@@. Ideally we would attempt matches
    # on every substring, but there are n(n+1)/2 of those -- way too many. There are a linear
    # number of substrings using our scheme and it strikes a balance between performance and
    # quality of masking/deidentification.
    if substrings:
        for name in id_parts:
            res = res + [name[i:-i] for i in range(1,int(len(name)/2)-1)]

    # Add a RE for A#
    if include_Anum:
        res = res + ['a#?[ ]{0,3}[0-9]{8}']
    return '|'.join(res)

r = subjectID2mask_re_2021('John Lilliputian')
print(r)
print([m[0] for m in re.finditer(r, 'John Lilliputian John illiputi a12345678 a#35354646 A# 87658765 a 56565656 a  34563456  a    97597531', flags=re.IGNORECASE)])
r = subjectID2mask_re_2021('Alomarian Stoic Parambulator Stu')
print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])

r = subjectID2mask_re_2021('Alomarian Stoic Parambulator Stu', substrings=False)
print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])



John Lilliputian|Lilliputian John|John|Lilliputian|illiputia|lliputi|liput|a#?[ ]{0,3}[0-9]{8}
['John Lilliputian', 'John', 'lliputi', 'a12345678', 'a#35354646', 'A# 87658765', 'a 56565656', 'a  34563456']
['Stoic Stu', 'a12341234', 'rambulat']
['Stoic Stu', 'a12341234']


In [33]:
from itertools import permutations, combinations

# Returns a regular expression to match and mask.
# names - names for the student
# include_Anum - whether to include a regular expression to match Aggie A#
def subjectID2mask_re_2019(names, include_Anum=True, substrings=True):
    res = names.copy()
    
    for name in names:
        for n in name.split():
            n = n.strip()
            if len(n) > 1 and n[1] != '.':
                res.append(n)
                
    res = res + ['Andy','Andrew','Brim','Chad','Mano','Mono']

    # Add a RE for A#
    if include_Anum:
        res = res + ['a#?[ ]{0,3}[0-9]{8}']
    return '|'.join(res)

r = subjectID2mask_re_2019(['John Lilliputian', 'John L'])
print(r)
r = subjectID2mask_re_2019(['John M. Edwards', 'John Edwards'])
print(r)
# print([m[0] for m in re.finditer(r, 'John Lilliputian John illiputi a12345678 a#35354646 A# 87658765 a 56565656 a  34563456  a    97597531', flags=re.IGNORECASE)])
# r = subjectID2mask_re_2019('Alomarian Stoic Parambulator Stu')
# print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])

# r = subjectID2mask_re_2019('Alomarian Stoic Parambulator Stu', substrings=False)
# print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])



John Lilliputian|John L|John|Lilliputian|John|Andy|Andrew|Brim|Chad|Mano|Mono|a#?[ ]{0,3}[0-9]{8}
John M. Edwards|John Edwards|John|Edwards|John|Edwards|Andy|Andrew|Brim|Chad|Mano|Mono|a#?[ ]{0,3}[0-9]{8}


## Mask a dataframe

In [28]:
REPLACE_WITH = '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
def mask_char_in_insert_text(df, row_idx, text_index, replacement_char):
    insert_col_idx = df.columns.get_loc('InsertText')
    s = df.iloc[row_idx].InsertText
    char_to_replace = s[text_index]
    df.iloc[row_idx,insert_col_idx] = s[:text_index] + replacement_char + s[text_index+1:]
    
    # Look nearby (within two rows) for a keystroke and mask it as well
    arr = np.intersect1d(df.index[(df.EventType == 'X-Keystroke')&
                                  (df.InsertText == char_to_replace)],
                         pd.RangeIndex(row_idx-2, row_idx+3))
    df.iloc[arr,insert_col_idx] = replacement_char
    
# Returns the masked dataframe and all strings that were masked
def mask(df, program, entries, is_insert, text_to_replace, replace_with=REPLACE_WITH, deleted_inserts = None, in_place = False):
#     display(df)
    masked_strings = set()
    insert_col_idx = df.columns.get_loc('InsertText')
    delete_col_idx = df.columns.get_loc('DeleteText')
    if in_place:
        masked = df
    else:
        masked = df.copy()

    for it in re.finditer(text_to_replace, program, flags=re.IGNORECASE):
        start_i = it.start()
        text_to_replace = it[0]
        masked_strings.add(text_to_replace)

        # Iterate through each character of the input text
        for i in range(len(text_to_replace)):
            replacement_char = replace_with[i]
            char_to_replace = text_to_replace[i]
            entry = entries[start_i+i] # entry with data on the character we are about to mask
            if entry.change_text != char_to_replace:
                print('ERROR: change_text not equal to char_to_replace: {}, {}', entry, char_to_replace)
            row_idx = entry.row_idx
            text_index = entry.text_index
            if (is_insert != entry.insert):
                print('Error: {} is not equal to insert = {}'.format(entry, is_insert))
            
            if entry.insert:
                mask_char_in_insert_text(masked, row_idx, text_index, replacement_char)
            else:
                # Deletion
                # Mask the DeleteText string
                s = masked.iloc[row_idx].DeleteText
                masked.iloc[row_idx,delete_col_idx] = s[:text_index] + replacement_char + s[text_index+1:]
                # Mask the InsertText string where the text was originally inserted
                di = deleted_inserts[entry.deleted_insert_idx]
#                 s = masked.iloc[di.row_idx].InsertText
#                 if s[di.text_index] != char_to_replace:
#                     print('ERROR: s[di.text_index] != char_to_replace: {}, {}, {}', s, di, char_to_replace)
#                     print('  {}, {}'.format(row_idx, di.row_idx))
#                     print('  {}'.format(masked.iloc[di.row_idx]))
                mask_char_in_insert_text(masked, di.row_idx, di.text_index, replacement_char)
            

    return masked, masked_strings


In [29]:
def mask_df(df, mask_re, replace_with=REPLACE_WITH):
    df = df.copy().reset_index()
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)

    masked = df.copy()
    masked, ms1 = mask(masked, program, entries, True, mask_re, replace_with)
    masked, ms2 = mask(masked, deleted_text, deletes, False, mask_re, replace_with, deleted_inserts)

    return masked, ms1.union(ms2)

## Test reconstruct and mask

In [None]:
# df = pd.read_csv('data/deident.csv')#, header=None)
# df = showyourwork2progsnap2('showyourwork.log', 'John Edwards', 'Assign1')
df = pd.read_csv('test.ps2')
subjectID = df.SubjectID.unique()[0]
display(df['CodeStateSection'].unique())
df = df[(df.SubjectID == subjectID)&(df['CodeStateSection'] == 'task1.py')]
# display(df)

print(reconstruct(df)[0])

print('\n\n******* Masking ********\n')

masked, masked_strings = mask_df(df, subjectID2mask_re(subjectID))
program, deleted_text,_,_,_ = reconstruct(masked)
print(f'Masked strings: {masked_strings}')
print(program)
print('\n\n** Deleted **\n' + deleted_text)
print('\n\n** Inserts **\n', ''.join(masked[(masked.EventType == 'File.Edit')&(masked.EditType == INSERT)].InsertText))
print('\n\n** Keystrokes **\n', ''.join(masked.loc[masked.EventType == 'X-Keystroke', 'X-Metadata']))

# temp = pd.concat([df,masked]).drop_duplicates(keep=False)
# display(temp[temp.EventID < 100].head(40))


## Add a number to each character in an ascii string

In [30]:
def add_to_ascii(s, add):
    ch = [c+add for c in bytes(s, 'utf-8')]
    return ''.join([str(chr(c)) for c in ch])

s = 'Ebof!Sbtnvttfo!.!Bttjho23!.!ubtl3/qz'
print(add_to_ascii(s, -1))

Dane Rasmussen - Assign12 - task2.py


# Deidentify all students in ps2 file

In [31]:
# Returns (df, submission, deleted) where
#  df - the deidentified data frame
#  submission - the reconstructed submission code
#  deleted - all deleted characters in reverse order
def deidentifyps2(df_all, id2names=None, header_offset=1):
    df_result = []
    all_masked_strings = set()
    programs = ''
    program_heads = ''
    deleted = ''
    
    df_all = df_all[df_all['CodeStateSection'].str.slice(-3) == '.py'].copy()
    df_all['uniqueID'] = df_all.SubjectID + df_all.AssignmentID + df_all['CodeStateSection']
    for ID in df_all.uniqueID.unique():
        masked = df_all[df_all.uniqueID == ID].copy().reset_index()
        
        f = masked.iloc[0]
        id_string = f'{f.SubjectID} - {f.AssignmentID} - {f["CodeStateSection"]}'
        print(id_string)
        
        try:
            # Do not sort if deidentifying ShowYourWork.
            masked.sort_values('ClientTimestamp', inplace=True)
            if not id2names:
                # fall 2021
                mask_re = subjectID2mask_re_2021(f.SubjectID)
            else:
                # 2019
                mask_re = subjectID2mask_re_2019(list(id2names[f.SubjectID]))

            masked,masked_strings = mask_df(masked, mask_re)
            if (len(masked_strings) > 0):
                all_masked_strings = all_masked_strings.union(masked_strings)
                print(f'  Masked strings: {masked_strings}')

            program, deleted_text,_,_,_ = reconstruct(masked)
            program_header = f'\n\n____{add_to_ascii(id_string, header_offset)}****\n'
            if len(program.strip()) > 0:
                programs = programs+program_header+program
            lines = program.split('\n')
            num_lines = 3 if len(lines)>3 else len(lines)
            lines = lines[:num_lines]
            lines = '\n'.join(lines)
            if len(lines.strip()) > 0:
                program_heads = program_heads+program_header+lines
                
            if len(deleted_text.strip()) > 0:
                deleted = deleted+program_header+deleted_text

            df_result.append(masked)
#             if df_result is None:
#                 df_result = masked
#             else:
#                 df_result = pd.concat([df_result, masked])
        except Exception as e:
            print(f'Failed to mask {id_string}: {str(e)}')
            traceback.print_exc()

    return pd.concat(df_result), all_masked_strings, programs, program_heads, deleted



# Convert Phanon to ProgSnap2

In [None]:
# df = pd.read_csv('data-2019/keystrokes.csv')
df = pd.read_csv('data-2019/project-events.csv')

In [None]:
df = df.rename({'Unnamed: 0':'native_index'}, axis=1)
df = df.sort_values(['user_id','timestamp','native_index'])
df['change_index'] = np.nan

df['ID'] = df.user_id.astype('str') + df.project_id.astype('str') + df.task.astype('str')
df['ID_no_task'] = df.user_id.astype('str') + df.project_id.astype('str')

In [None]:
def split_text(text):
    text = text.split('\n')
    if len(text) > 1:
        text = [e+'\n' for e in text[:-1]] + [text[-1]]
    return text
    
def insert(lines, text, irow, icol):
    if len(lines) == 0:
        lines = ['']
    if icol > len(lines[irow]):
        s = '"'+lines[irow].replace(' ','*').replace('\n','\\n')+'"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={s}")
    # Consider the line in the question and the line following.
    # Split into lines preceding and succeeding those two. Then
    # join the two in question.
    before = lines[:irow]
    after = []
    if len(lines) == irow+1:
        # last line
        two = lines[irow]
    elif len(lines) == irow+2:
        # second to last line
        two = ''.join(lines[irow:irow+2])
    else:
        # More than one line following
        two = ''.join(lines[irow:irow+2])
        after = lines[irow+2:]
        
    two = two[:icol] + text + two[icol:]
    if len(after) == 0:
        return before + split_text(two)
    return before + split_text(two)[:-1] + after

# print(split_text(''))
# print(split_text('\n'))
# print(split_text('abc'))
# print(split_text('abc\n'))
# print(split_text('abc\ndef'))
# print(split_text('abc\ndef\n'))
# print(split_text('abcdef\n\ndef'))
# lines = insert(['abc\n','def'], 'def', 0, 3)
# print('*'.join(lines))
# print(insert([], 'abc', 0, 0))
# print(insert([''], 'abc', 0, 0))
# print(insert([], 'abc\n', 0, 0))
# print(insert([''], 'abc\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n', ''], 'def', 0, 0))
# print(insert(['abc\n', ''], 'def', 0, 3))
# print(insert(['abc\n', ''], 'def', 1, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n','def'], 'def\n', 0, 3))


In [None]:
def add_empty_line_cond(lines):
    if len(lines) == 0:
        return ['']
    last_line = lines[-1]
    if len(last_line) > 0 and last_line[-1] == '\n':
        return ['']
    return []

#        i j
# i,j aaaddaaa
#
#        i j
# i,j aaadd
#     aaaaa    
#
#      i
# i aaadd
#   ddddddd
# j ddaaaaa
#     j
def remove_impl(lines, irow, icol, jrow, jcol):
#     if jrow == len(lines) and jcol == 0:
#         jrow = len(lines)-1
#         jcol = len(lines[-1])
    if icol > len(lines[irow]):
        s = '"'+lines[irow].replace(' ','*').replace('\n','\\n')+'"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={s}")
    if irow == jrow:
        line = lines[irow]
        line = line[0:icol] + line[jcol:]
        if len(line) == 0:
            return lines[:irow] + lines[irow+1:]
        if line[-1] != '\n' and irow < len(lines)-1:
            return lines[:irow] + [line+lines[irow+1]] + lines[irow+2:]
        return lines[:irow] + [line] + lines[irow+1:]
    else:
        line1 = lines[irow][:icol]
        line2 = lines[jrow][jcol:]
        if len(line1+line2) == 0:
            return lines[:irow] + lines[jrow+1:]
        if (line1+line2)[-1] != '\n' and jrow < len(lines)-1:
            return lines[:irow] + [line1+line2+lines[jrow+1]] + lines[jrow+2:]
        return lines[:irow] + [line1+line2] + lines[jrow+1:]
    
def remove(lines, irow, icol, jrow, jcol):
    lines = remove_impl(lines, irow, icol, jrow, jcol)
    return lines + add_empty_line_cond(lines)

# print(remove(['abc\n', ''], 0, 0, 0, 1))
# print(remove(['abc\n', ''], 0, 0, 0, 2))
# print(remove(['abc\n', ''], 0, 0, 0, 3))
# print(remove(['abc\n', ''], 0, 0, 0, 4))
# print(remove(['abc\n', ''], 0, 1, 0, 3))
# print(remove(['abc\n', 'def'], 0, 1, 1, 0))
# print(remove(['abc\n', 'def'], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 0))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 4))
# print(remove(['abc\n', 'def\n', ''], 1, 3, 1, 4))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 2, 1))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 3, 0))


In [None]:
# test[277-5:277+5]
# test[0:20]
test.head()

In [None]:
def phanon2progsnap2(df, debug = False):
    test = df.copy()
    change_indices = []
    lines = ['']
    i = 0
    for index,row in test.iterrows():
    #     print('**',i)
        i = i + 1
        try:
            irow = row.startLine
            icol = row.startCol
            jrow = row.endLine
            jcol = row.endCol
            change_type = row.change_type
            added = row.code_added
            removed = row.code_removed
            changed = False
            if removed and removed == removed and row.change_type != 'setValue' and (len(lines)>1 or len(lines[0])>0):
#             if removed and removed == removed and (len(lines)>1 or len(lines[0])>0):
                changed = True
                irow = int(irow)
                icol = int(icol)
                jrow = int(jrow)
                jcol = int(jcol)
                lines = remove(lines, irow, icol, jrow, jcol)
            if row.change_type == 'setValue':
                lines = ['']
            if added and added == added and added != '':
                changed = True
                irow = int(irow)
                icol = int(icol)
                lines = insert(lines, added, irow, icol)
            change_index = np.nan
            if changed:
                change_index = len(''.join(lines[:irow]))+icol
            change_indices.append(change_index)
        except Exception as e:
            if debug:
                display('i={}: {}'.format(i,e))
                print(''.join(lines).replace(' ', '·'))
                display(row)
                traceback.print_exc()
                test = test[:len(change_indices)]
                break
            else:
                raise e
    test.change_index = change_indices
    test['SubjectID'] = test.user_id
    test['EventID'] = test.native_index
    test['AssignmentID'] = test.project_id
    test['CodeStateSection'] = test.task
    test['EventType'] = test.change_type
    test['InsertText'] = test.code_added
    test['DeleteText'] = test.code_removed
    test['SourceLocation'] = test.change_index
    test['ClientTimestamp'] = test.timestamp
    # array(['RUN', 'SUBMIT', 'TASK', 'setValue', '+delete', '+input', 'paste',
#        'undo', 'redo', 'cut', 'drag'], dtype=object)
    test.EventType = test.EventType.replace({'+input':'File.Edit','+delete':'File.Edit',
                                             'undo':'File.Edit','redo':'File.Edit',
                                             'cut':'File.Edit','paste':'File.Edit','drag':'File.Edit',
                                             'RUN':'Run.Program','SUBMIT':'Submit',
                                             'TASK':'X-SwitchTask',
#                                              'setValue':'File.Edit'
                                            })
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == '+input'),'EditType'] = 'Insert'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == '+delete'),'EditType'] = 'Delete'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'setValue'),'EditType'] = 'Insert'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'undo'),'EditType'] = 'Undo'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'redo'),'EditType'] = 'Redo'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'paste'),'EditType'] = 'Paste'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'cut'),'EditType'] = 'Cut'
    test.loc[(test.EventType == 'File.Edit')&(test.change_type == 'drag'),'EditType'] = 'Drag'
    return test

In [None]:
def clean_task_switches(df):
    # Look nearby (within two rows) for rows with large both inserts and
    # deletes and clean them
    df = df.copy()
    indices = df.index[df.change_type == 'TASK']
    for row_idx in indices:
        arr = np.intersect1d(df.index[(df.code_added.str.len() > 5)&
                                      (df.code_removed.str.len() > 5)],
                             pd.RangeIndex(row_idx-2, row_idx+3))
        df.loc[arr,'code_added'] = ''
        df.loc[arr,'code_removed'] = ''
    return df

test
clean_task_switches(test)

In [None]:
copy = df.copy()#[(df.user_id == 100338)|(df.user_id == 100339)|(df.user_id == 100340)].copy()
# copy = clean_task_switches(copy)
dfs = []
for ID in copy.ID.unique():
# for ID in copy.ID_no_task.unique():
    print(ID)
    subdf = copy[copy.ID == ID]
#     subdf = copy[copy.ID_no_task == ID]
    try:
        dfs.append(phanon2progsnap2(subdf, False))
    except:
        print('{} - Reconstruction failed'.format(ID))

copy = pd.concat(dfs)
copy.to_csv('phanon2ps2.csv', index=False)

In [None]:
print(len(copy))
display(copy.EventType.unique())
display(copy.groupby('EventType').count())

copy.EventType = copy.EventType.replace({'setValue':'File.Edit'})
copy.loc[(copy.EventType == 'File.Edit')&(copy.change_type == 'setValue'),'EditType'] = 'Replace'

print(len(copy))
display(copy.EventType.unique())
display(copy.groupby('EventType').count())
copy.to_csv('phanon2ps2-2.csv', index=False)

# Deidentify 2019

In [None]:
# df_2019.to_csv('data-2019/phanon2ps2-4.csv', index=False)

In [18]:
df_2019 = pd.read_csv('data-2019/phanon2ps2-4.csv')
df_2019.SubjectID = df_2019.SubjectID.astype('str')
df_2019.AssignmentID = df_2019.AssignmentID.astype('str')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
lines = ''
with open('data-2019/id2name.txt') as f:
    lines = f.readlines()

id2names = {}
joined = ''.join(lines)
arr = ['100'+s for s in joined.split('100')][1:]
for entry in arr:
    earr = entry.split('\n')
    subjectID = earr[0].split()[0].strip()
    entries = set([])
    for line in earr[1:]:
        line = line.strip()
        if len(line)>0:
            entries.add(line)
    id2names[subjectID] = entries

for key in id2names.keys():
    print(key)
    for val in id2names[key]:
        print(f'  {val}')


In [109]:
df_2019.head()

Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,SubjectID,EventID,AssignmentID,CodeStateSection,EventType,InsertText,DeleteText,SourceLocation,ClientTimestamp,EditType
0,10436863,100000,195,0,+input,#,,1568242704669,,,...,100000,10436863,195,task0.py,File.Edit,#,,0.0,1568242704669,Insert
1,10436864,100000,195,0,+input,,,1568242704830,,,...,100000,10436864,195,task0.py,File.Edit,,,1.0,1568242704830,Insert
2,10436865,100000,195,0,+delete,,,1568242705198,,,...,100000,10436865,195,task0.py,File.Edit,,,1.0,1568242705198,Delete
3,10436866,100000,195,0,+input,,,1568242705450,,,...,100000,10436866,195,task0.py,File.Edit,,,1.0,1568242705450,Insert
4,10436867,100000,195,0,+input,J,,1568242705692,,,...,100000,10436867,195,task0.py,File.Edit,@,,2.0,1568242705692,Insert


## Write a single user to test.csv

In [None]:
df_2019 = pd.read_csv('data-2019/bak/phanon2ps2-4.csv')
df_2019.SubjectID = df_2019.SubjectID.astype('str')
df_2019.AssignmentID = df_2019.AssignmentID.astype('str')

## Do the deidentification

In [None]:
students = list(df_2019.SubjectID.unique())
# students = set(students[:8])
# students = ['100069', '100070', '100071', '100072', '100073']
print('Deidentifying {}'.format(students))
df = df_2019[df_2019.SubjectID.isin(set(students))].copy()

# remove invalid source location events
bad_mask = ((df.InsertText.isna())&(df.DeleteText.isna()))|~(df.SourceLocation.isna())
df = df[bad_mask]

dfs = []
n = 20
for i in range(0,len(students),n):
    tic = time.perf_counter()
    # 20 students at a time
    df_slice = df[df.SubjectID.isin(students[i:i+n])]
    df_masked,ms,programs,program_heads,deleted = deidentifyps2(df_slice, id2names=id2names, header_offset=0)
    toc = time.perf_counter()
    print(f"Masked {toc - tic:0.4f} seconds")

    print(f'Masked strings: {ms}')

    with open(f'run3/deidentified-programs{i}.txt', 'w') as f:
        f.write(programs)
    with open(f'run3/deidentified-program-heads{i}.txt', 'w') as f:
        f.write(program_heads)
    with open(f'run3/deidentified-deleted{i}.txt', 'w') as f:
        f.write(deleted)
    if not df_masked is None:
        df_masked = df_masked.drop(['level_0','uniqueID','index'], axis=1)
        dfs.append(df_masked)

pd.concat(dfs).to_csv('data-2019/phanon2ps2-7.csv', index=False)

# To fix
x means done
* setValue can have nan source location: test.loc[(test.EventType == 'File.Edit')&(test.SourceLocation.isna()), 'SourceLocation'] = 0
* 100003 Garnder
* 100030 O'Loughlin
* 100036 Jake Miller
* x 100108 - 132 - task1.py -- "i am will"
* x 100108 - 136 - task0.py -- will perfect
* x 100184 - 128 - task0.py -- jimmy

## Take out common words and symbols from "check" files

In [81]:
with open('run2/aaa-masked-strings.txt') as f:
    text = f.read()
masked = set([s.strip("' \n").lower() for s in text.split(',')])

# Words that we will pull out of the check text
# with open('words-no-names.txt') as f:
with open('wordsbig.txt') as f:
    lines = f.readlines()
words = set([w.strip() for w in lines])

# The file to reduce
# with open('run2/heads.txt') as f:
with open('run2/programs.txt') as f:
    lines = f.readlines()

# Don't pull out any words that we masked
words = words - set(masked)
words = words - set(['name', 'http', 'zoom'])

s = ''
has_body = False
header = ''
for line in lines:
    if line[:7] == '____100':
        if not has_body and len(header)>0:
            s = s[:-len(header)]
        s = s + '\n\n'+line.strip()
        header = '\n\n'+line.strip()
        has_body = False
    else:
        keep = [w.lower().strip() for w in re.split('[^a-zA-Z]', line)]
        keep = [w for w in keep if len(w)>0 and w not in words]
        for i in re.finditer('[0-9]{5}', line):
            keep.append(i[0])
        line = ' '.join(keep).strip()
        if len(line)>0:
            s = s + '\n' + line
            has_body = True
    


with open('run2/programs-min.txt', 'w') as f:
    f.write(s)

In [74]:
for i in re.finditer('[0-9]{5}', '12345612341234'):
    print(i[0])

12345
61234


In [None]:
df1 = pd.read_csv('data-2019/phanon2ps2-2.csv')
df2 = pd.read_csv('data-2019/phanon2ps2-5.csv')


In [38]:
display(df1.head())
display(df2.head())

Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,SubjectID,EventID,AssignmentID,X-File,EventType,InsertText,DeleteText,CodeStateSection,ClientTimestamp,EditType
0,10436853,100000,195,0,+delete,,\n,1568242702759,,,...,100000,10436853,195,0,File.Edit,,\n,,1568242702759,Delete
1,10436854,100000,195,0,+delete,,here,1568242702892,,,...,100000,10436854,195,0,File.Edit,,here,,1568242702892,Delete
2,10436855,100000,195,0,+delete,,go,1568242703392,,,...,100000,10436855,195,0,File.Edit,,go,,1568242703392,Delete
3,10436856,100000,195,0,+delete,,that,1568242703443,,,...,100000,10436856,195,0,File.Edit,,that,,1568242703443,Delete
4,10436857,100000,195,0,+delete,,comments,1568242703494,,,...,100000,10436857,195,0,File.Edit,,comments,,1568242703494,Delete


Unnamed: 0.1,Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,...,SubjectID,EventID,AssignmentID,CodeStateSection,EventType,InsertText,DeleteText,SourceLocation,ClientTimestamp,EditType
0,0,10436863,100000,195,0,+input,#,,1568242704669,,...,100000,10436863,195,task0.py,File.Edit,#,,0.0,1568242704669,Insert
1,1,10436864,100000,195,0,+input,,,1568242704830,,...,100000,10436864,195,task0.py,File.Edit,,,1.0,1568242704830,Insert
2,2,10436865,100000,195,0,+delete,,,1568242705198,,...,100000,10436865,195,task0.py,File.Edit,,,1.0,1568242705198,Delete
3,3,10436866,100000,195,0,+input,,,1568242705450,,...,100000,10436866,195,task0.py,File.Edit,,,1.0,1568242705450,Insert
4,4,10436867,100000,195,0,+input,J,,1568242705692,,...,100000,10436867,195,task0.py,File.Edit,@,,2.0,1568242705692,Insert


In [40]:
df2.columns
df2 = df2[['native_index', 'user_id', 'project_id', 'task',
       'change_type', 'code_added', 'code_removed', 'timestamp', 'input',
       'output', 'has_error', 'user_terminated', 'startLine', 'startCol',
       'endLine', 'endCol', 'operation', 'key', 'elapsed', 'change_index',
       'ID', 'ID_no_task', 'SubjectID', 'EventID', 'AssignmentID',
       'CodeStateSection', 'EventType', 'InsertText', 'DeleteText',
       'SourceLocation', 'ClientTimestamp', 'EditType']]

In [41]:
df2.to_csv('data-2019/phanon2ps2-6.csv', index=False)