In [1]:
import pandas as pd
import numpy as np
import re
import time
import traceback

In [2]:
# event types
# ET_EDIT = 'File.Edit'
# ET_KEY = 'X-Keystroke'
# edit types
INSERT = 'Insert'
DELETE = 'Delete'

In [3]:
class Entry:
    # change_text is the text to be inserted or deleted
    # row_idx is the index of the row in the dataframe that contains the change
    # text_index is the index in the string in InsertText or DeleteText column where the change_text is found
    # insert is a boolean: true if insert, false if delete
    def __init__(self, change_text, row_idx, text_index, insert, deleted_insert_idx=-1):
        self.change_text = change_text
        self.row_idx = row_idx
        self.text_index = text_index
        self.insert = insert
        self.deleted_insert_idx = deleted_insert_idx
        
    def __str__(self):
        return 'text: {}, row_idx: {}, text_index: {}, insert: {}'.format(self.change_text, self.row_idx, self.text_index, self.insert)
    
    def isInsert(self):
        return self.insert
        
    def isDelete(self):
        return not self.insert


## Reconstruct the final submission

In [4]:
def char_diff(row):
    l = 0
    if row.InsertText == row.InsertText:
        l = len(row.InsertText)
    if row.DeleteText == row.DeleteText:
        l -= len(row.DeleteText)
    return l

def reconstruct(df, debug=False):
    # find the number of characters added or subtracted at each step
    charsAdded = df.apply(char_diff, axis=1)
    max_size = charsAdded.cumsum().max()
    # Allocate buffers
    inserts = []
    deletes = []
    deleted_inserts = []
    buf = '' # the end file
    delete_buf = '' # characters that were deleted

    # Create an error data frame holding rows that failed
    error = pd.DataFrame(columns=df.columns)
    error['XError'] = 0
    # Build the file. Iterate over each row of the data frame.
    for row_idx,row in df[df.EventType == 'File.Edit'].iterrows():
#     for row_idx,row in df.iterrows():
        code_idx = int(row.SourceLocation)
#         code_idx = -1
#         if not np.isnan(row.SourceLocation):
#             code_idx = int(row.SourceLocation)
        if row.DeleteText and row.DeleteText == row.DeleteText:
            di_len = len(deleted_inserts)
            deleted_inserts = deleted_inserts + inserts[code_idx:code_idx+len(row.DeleteText)]
            
#             # Use what's in the buffer because what's in DeleteText may not match
            deleted_text = buf[code_idx:code_idx+len(row.DeleteText)]
            if (len(deleted_text) != len(row.DeleteText)):
#                 raise Exception(f'deleted_text != row.DeleteText: {len(deleted_text)} {len(row.DeleteText)}')
                print(f'Warning: len(deleted_text) != len(row.DeleteText): {len(deleted_text)} {len(row.DeleteText)} at index {code_idx}')
            
            inserts = inserts[:code_idx] + inserts[code_idx+len(row.DeleteText):]
            buf = buf[:code_idx] + buf[code_idx+len(row.DeleteText):]
            local_deletes = [Entry(row.DeleteText[k], row_idx, k, False, deleted_insert_idx=di_len+k) for k in range(len(row.DeleteText))]
            deletes = local_deletes + deletes
            delete_buf = row.DeleteText + delete_buf

#             inserts = inserts[:code_idx] + inserts[code_idx+len(deleted_text):]
#             buf = buf[:code_idx] + buf[code_idx+len(deleted_text):]
#             local_deletes = [Entry(deleted_text, row_idx, k, False, deleted_insert_idx=di_len+k) for k in range(len(row.DeleteText))]
#             deletes = local_deletes + deletes
#             delete_buf = deleted_text + delete_buf
        if row.InsertText and row.InsertText == row.InsertText:
            local_inserts = [Entry(row.InsertText[k], row_idx, k, True) for k in range(len(row.InsertText))]
            inserts = inserts[:code_idx] + local_inserts + inserts[code_idx:]
            buf = buf[:code_idx] + row.InsertText + buf[code_idx:]
        else:
            error = error.append(row)
            error.iloc[0, error.columns.get_loc('XError')] = 'Unsupported edit type'
    if debug:
        print('Errors:')
        display(error)

    return buf, delete_buf, inserts, deletes, deleted_inserts


## Look between timestamps

In [None]:
df = pd.read_csv('data-2019/bak/project-events.csv')


In [None]:
df.head()
test = df[df.user_id == 100036].copy()
test = test[(test.timestamp > 1548976440848)&(test.timestamp < 1548976584332)]
test

## Create a regular expression

In [5]:
# Version used for fall 2021 data
from itertools import permutations, combinations

# Returns a regular expression to match and mask.
# subjectID - the name of the student with each of first, last, etc separated by a space
# include_Anum - whether to include a regular expression to match Aggie A#
def subjectID2mask_re_2021(subjectID, include_Anum=True, substrings=True):
    res = []
    
    # Get "First Last" and "Last First"
    id_parts = subjectID.split() # Probably just first and last name but could include a middle initial
    for i in range(len(id_parts), 0, -1):
        for comb in combinations(id_parts, i):
            perms = permutations(comb)
            for perm in perms:
                r = ' '.join(perm)
                if len(r) > 1:
                    res = res + [r]

    # Find larger substrings of each name. For example, if the name is Christensen, this will add
    # hristense, ristens, and isten. The reason for this is if the name somehow is only partial
    # in the keystrokes. For example, the student might have missed typing the last character.
    # In our working example, this would result in Christense. In this case, we would match
    # hristense and the masked keys would result in C@@@@@@@@@. Ideally we would attempt matches
    # on every substring, but there are n(n+1)/2 of those -- way too many. There are a linear
    # number of substrings using our scheme and it strikes a balance between performance and
    # quality of masking/deidentification.
    if substrings:
        for name in id_parts:
            res = res + [name[i:-i] for i in range(1,int(len(name)/2)-1)]

    # Add a RE for A#
    if include_Anum:
        res = res + ['a#?[ ]{0,3}[0-9]{8}']
    return '|'.join(res)

r = subjectID2mask_re_2021('John Lilliputian')
print(r)
print([m[0] for m in re.finditer(r, 'John Lilliputian John illiputi a12345678 a#35354646 A# 87658765 a 56565656 a  34563456  a    97597531', flags=re.IGNORECASE)])
r = subjectID2mask_re_2021('Alomarian Stoic Parambulator Stu')
print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])

r = subjectID2mask_re_2021('Alomarian Stoic Parambulator Stu', substrings=False)
print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])



John Lilliputian|Lilliputian John|John|Lilliputian|illiputia|lliputi|liput|a#?[ ]{0,3}[0-9]{8}
['John Lilliputian', 'John', 'lliputi', 'a12345678', 'a#35354646', 'A# 87658765', 'a 56565656', 'a  34563456']
['Stoic Stu', 'a12341234', 'rambulat']
['Stoic Stu', 'a12341234']


In [6]:
from itertools import permutations, combinations

# Returns a regular expression to match and mask.
# names - names for the student
# include_Anum - whether to include a regular expression to match Aggie A#
def subjectID2mask_re_2019(names, include_Anum=True, substrings=True):
    res = names.copy()
    
    for name in names:
        for n in name.split():
            n = n.strip()
            if len(n) > 1 and n[1] != '.':
                res.append(n)
                
    res = res + ['Andy','Andrew','Brim','Chad','Mano','Mono']

    # Add a RE for A#
    if include_Anum:
        res = res + ['a#?[ ]{0,3}[0-9]{8}']
    return '|'.join(res)

r = subjectID2mask_re_2019(['John Lilliputian', 'John L'])
print(r)
r = subjectID2mask_re_2019(['John M. Edwards', 'John Edwards'])
print(r)
# print([m[0] for m in re.finditer(r, 'John Lilliputian John illiputi a12345678 a#35354646 A# 87658765 a 56565656 a  34563456  a    97597531', flags=re.IGNORECASE)])
# r = subjectID2mask_re_2019('Alomarian Stoic Parambulator Stu')
# print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])

# r = subjectID2mask_re_2019('Alomarian Stoic Parambulator Stu', substrings=False)
# print([m[0] for m in re.finditer(r, 'Stoic Stu\na12341234 parambulat', flags=re.IGNORECASE)])



John Lilliputian|John L|John|Lilliputian|John|Andy|Andrew|Brim|Chad|Mano|Mono|a#?[ ]{0,3}[0-9]{8}
John M. Edwards|John Edwards|John|Edwards|John|Edwards|Andy|Andrew|Brim|Chad|Mano|Mono|a#?[ ]{0,3}[0-9]{8}


## Mask a dataframe

In [7]:
REPLACE_WITH = '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
def mask_char_in_insert_text(df, row_idx, text_index, replacement_char):
    insert_col_idx = df.columns.get_loc('InsertText')
    s = df.iloc[row_idx].InsertText
    char_to_replace = s[text_index]
    df.iloc[row_idx,insert_col_idx] = s[:text_index] + replacement_char + s[text_index+1:]
    
    # Look nearby (within two rows) for a keystroke and mask it as well
    arr = np.intersect1d(df.index[(df.EventType == 'X-Keystroke')&
                                  (df.InsertText == char_to_replace)],
                         pd.RangeIndex(row_idx-2, row_idx+3))
    df.iloc[arr,insert_col_idx] = replacement_char
    
# Returns the masked dataframe and all strings that were masked
def mask(df, program, entries, is_insert, text_to_replace, replace_with=REPLACE_WITH, deleted_inserts = None, in_place = False):
#     display(df)
    masked_strings = set()
    insert_col_idx = df.columns.get_loc('InsertText')
    delete_col_idx = df.columns.get_loc('DeleteText')
    if in_place:
        masked = df
    else:
        masked = df.copy()

    for it in re.finditer(text_to_replace, program, flags=re.IGNORECASE):
        start_i = it.start()
        text_to_replace = it[0]
        masked_strings.add(text_to_replace)

        # Iterate through each character of the input text
        for i in range(len(text_to_replace)):
            replacement_char = replace_with[i]
            char_to_replace = text_to_replace[i]
            entry = entries[start_i+i] # entry with data on the character we are about to mask
            if entry.change_text != char_to_replace:
                print('ERROR: change_text not equal to char_to_replace: {}, {}', entry, char_to_replace)
            row_idx = entry.row_idx
            text_index = entry.text_index
            if (is_insert != entry.insert):
                print('Error: {} is not equal to insert = {}'.format(entry, is_insert))
            
            if entry.insert:
                mask_char_in_insert_text(masked, row_idx, text_index, replacement_char)
            else:
                # Deletion
                # Mask the DeleteText string
                s = masked.iloc[row_idx].DeleteText
                masked.iloc[row_idx,delete_col_idx] = s[:text_index] + replacement_char + s[text_index+1:]
                # Mask the InsertText string where the text was originally inserted
                di = deleted_inserts[entry.deleted_insert_idx]
#                 s = masked.iloc[di.row_idx].InsertText
#                 if s[di.text_index] != char_to_replace:
#                     print('ERROR: s[di.text_index] != char_to_replace: {}, {}, {}', s, di, char_to_replace)
#                     print('  {}, {}'.format(row_idx, di.row_idx))
#                     print('  {}'.format(masked.iloc[di.row_idx]))
                mask_char_in_insert_text(masked, di.row_idx, di.text_index, replacement_char)
            

    return masked, masked_strings


In [8]:
def mask_df(df, mask_re, replace_with=REPLACE_WITH):
    df = df.copy().reset_index()
    program, deleted_text, entries, deletes, deleted_inserts = reconstruct(df)

    masked = df.copy()
    masked, ms1 = mask(masked, program, entries, True, mask_re, replace_with)
    masked, ms2 = mask(masked, deleted_text, deletes, False, mask_re, replace_with, deleted_inserts)

    return masked, ms1.union(ms2)

## Test reconstruct and mask

In [None]:
# df = pd.read_csv('data/deident.csv')#, header=None)
# df = showyourwork2progsnap2('showyourwork.log', 'John Edwards', 'Assign1')
df = pd.read_csv('test.ps2')
subjectID = df.SubjectID.unique()[0]
display(df['CodeStateSection'].unique())
df = df[(df.SubjectID == subjectID)&(df['CodeStateSection'] == 'task1.py')]
# display(df)

print(reconstruct(df)[0])

print('\n\n******* Masking ********\n')

masked, masked_strings = mask_df(df, subjectID2mask_re(subjectID))
program, deleted_text,_,_,_ = reconstruct(masked)
print(f'Masked strings: {masked_strings}')
print(program)
print('\n\n** Deleted **\n' + deleted_text)
print('\n\n** Inserts **\n', ''.join(masked[(masked.EventType == 'File.Edit')&(masked.EditType == INSERT)].InsertText))
print('\n\n** Keystrokes **\n', ''.join(masked.loc[masked.EventType == 'X-Keystroke', 'X-Metadata']))

# temp = pd.concat([df,masked]).drop_duplicates(keep=False)
# display(temp[temp.EventID < 100].head(40))


## Add a number to each character in an ascii string

In [9]:
def add_to_ascii(s, add):
    ch = [c+add for c in bytes(s, 'utf-8')]
    return ''.join([str(chr(c)) for c in ch])

s = 'Ebof!Sbtnvttfo!.!Bttjho23!.!ubtl3/qz'
print(add_to_ascii(s, -1))

Dane Rasmussen - Assign12 - task2.py


# Deidentify all students in ps2 file

In [10]:
# Returns (df, submission, deleted) where
#  df - the deidentified data frame
#  submission - the reconstructed submission code
#  deleted - all deleted characters in reverse order
def deidentifyps2(df_all, id2names=None, header_offset=1):
    df_result = []
    all_masked_strings = set()
    programs = ''
    program_heads = ''
    deleted = ''
    
    df_all = df_all[df_all['CodeStateSection'].str.slice(-3) == '.py'].copy()
    df_all['uniqueID'] = df_all.SubjectID + df_all.AssignmentID + df_all['CodeStateSection']
    for ID in df_all.uniqueID.unique():
        masked = df_all[df_all.uniqueID == ID].copy().reset_index()
        
        f = masked.iloc[0]
        id_string = f'{f.SubjectID} - {f.AssignmentID} - {f["CodeStateSection"]}'
        print(id_string)
        
        try:
            # Do not sort if deidentifying ShowYourWork.
            masked.sort_values('ClientTimestamp', inplace=True)
            if not id2names:
                # fall 2021
                mask_re = subjectID2mask_re_2021(f.SubjectID)
            else:
                # 2019
                mask_re = subjectID2mask_re_2019(list(id2names[f.SubjectID]))

            masked,masked_strings = mask_df(masked, mask_re)
            if (len(masked_strings) > 0):
                all_masked_strings = all_masked_strings.union(masked_strings)
                print(f'  Masked strings: {masked_strings}')

            program, deleted_text,_,_,_ = reconstruct(masked)
            program_header = f'\n\n____{add_to_ascii(id_string, header_offset)}****\n'
            if len(program.strip()) > 0:
                programs = programs+program_header+program
            lines = program.split('\n')
            num_lines = 3 if len(lines)>3 else len(lines)
            lines = lines[:num_lines]
            lines = '\n'.join(lines)
            if len(lines.strip()) > 0:
                program_heads = program_heads+program_header+lines
                
            if len(deleted_text.strip()) > 0:
                deleted = deleted+program_header+deleted_text

            df_result.append(masked)
#             if df_result is None:
#                 df_result = masked
#             else:
#                 df_result = pd.concat([df_result, masked])
        except Exception as e:
            print(f'Failed to mask {id_string}: {str(e)}')
            traceback.print_exc()

    return pd.concat(df_result), all_masked_strings, programs, program_heads, deleted



# Deidentify 2019

## Read a csv

In [145]:
df_2019 = pd.read_csv('data-2019/phanon2ps2-6.csv')
df_2019.SubjectID = df_2019.SubjectID.astype('str')
df_2019.AssignmentID = df_2019.AssignmentID.astype('str')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Do the deidentification

In [33]:
students = list(df_2019.SubjectID.unique())
# students = set(students[:8])
students = ['100007']#, '100003', '100030', '100036', '100072', '100073']
# print('Deidentifying {}'.format(students))
df = df_2019[df_2019.SubjectID.isin(set(students))].copy()
df = df.drop(['index'], axis=1, errors='ignore')
# display(df.head())

# remove invalid source location events
bad_mask = ((df.InsertText.isna())&(df.DeleteText.isna()))|~(df.SourceLocation.isna())
df = df[bad_mask]

# display(df.head())

dfs = []
n = 20
for i in range(0,len(students),n):
    tic = time.perf_counter()
    # 20 students at a time
    df_slice = df[df.SubjectID.isin(students[i:i+n])]
    df_masked,ms,programs,program_heads,deleted = deidentifyps2(df_slice, id2names=id2names, header_offset=0)
    toc = time.perf_counter()
    print(f"Masked {toc - tic:0.4f} seconds")

    print(f'Masked strings: {ms}')

    with open(f'data-2019/run4/deidentified-programs{i}.txt', 'w') as f:
        f.write(programs)
    with open(f'data-2019/run4/deidentified-program-heads{i}.txt', 'w') as f:
        f.write(program_heads)
    with open(f'data-2019/run4/deidentified-deleted{i}.txt', 'w') as f:
        f.write(deleted)
    if not df_masked is None:
        df_masked = df_masked.drop(['level_0','uniqueID','index'], axis=1)
        dfs.append(df_masked)

# pd.concat(dfs).to_csv('data-2019/phanon2ps2-3.csv', index=False)
pd.concat(dfs).to_csv('../website/KeystrokePlayback/test.csv', index=False)


100007 - 128 - tasks.py
100007 - 130 - tasks.py
100007 - 132 - tasks.py
100007 - 134 - tasks.py
100007 - 136 - tasks.py
Masked 42.5549 seconds
Masked strings: set()


## Deidentify a single assignment

In [None]:
problems = [
    ('____100176 - 130 - tasks.py****','montgoomery'),
    ('____100380 - 132 - tasks.py****','zhenngxi'),
    ('____100481 - 200 - tasks.py****','harmom'),
    ('____100130 - 136 - tasks.py****','grvuer')
]

df_rest = df_2019.copy()
print(len(df_2019))
to_add = []
for header,search in problems:
    print(header,search)

    df_to_mask, df_rest = split_df_from_header(df_rest, header, return_rest=True)
    df_to_mask = df_to_mask.copy()
    student = header[4:10]
    id2names[student] = [search]

    df_masked,ms,programs,program_heads,deleted = deidentifyps2(df_to_mask, id2names=id2names, header_offset=0)
    
    to_add.append(df_masked)

    # pd.concat(dfs).to_csv('data-2019/phanon2ps2-3.csv', index=False)
    # pd.concat(dfs).to_csv('../website/KeystrokePlayback/test.csv', index=False)
    df_masked.to_csv('../website/KeystrokePlayback/test.csv', index=False)

new_df = pd.concat(to_add+[df_rest])
print(len(new_df))
# new_df.to_csv('data-2019/phanon2ps2-5.csv', index=False)

## Construct id2names

In [None]:
lines = ''
with open('data-2019/id2name.txt') as f:
    lines = f.readlines()

id2names = {}
joined = ''.join(lines)
arr = ['100'+s for s in joined.split('100')][1:]
for entry in arr:
    earr = entry.split('\n')
    subjectID = earr[0].split()[0].strip()
    entries = set([])
    for line in earr[1:]:
        line = line.strip()
        if len(line)>0:
            entries.add(line)
    id2names[subjectID] = entries

for key in id2names.keys():
    print(key)
    for val in id2names[key]:
        print(f'  {val}')


## Retrofit Phanon code_added/code_removed fields

In [134]:
df = df_2019.copy()
df.code_added = df.InsertText
df.code_removed = df.DeleteText

df.to_csv('data-2019/phanon2ps2-6.csv', index=False)


## Retrofit to Phanon format

In [146]:
df_2019.columns
df_phanon = df_2019[['user_id', 'project_id', 'task',
       'change_type', 'code_added', 'code_removed', 'timestamp', 'input',
       'output', 'has_error', 'user_terminated', 'startLine', 'startCol',
       'endLine', 'endCol', 'operation', 'key']].copy()


In [None]:
# df = pd.read_csv('data-2019/phanon-keystrokes.csv')
# df.head()

df_phanon.sort_values(['user_id','project_id','timestamp'], inplace=True)
df_phanon.head(50)

In [150]:
df_phanon.to_csv('data-2019/phanon-keystrokes.csv', index=False)

## Remove problem submissions
Look at each user and see if there are any names in inserted or deleted code, then remove in the following cell.

In [60]:
temp = df_2019#[df_2019.SubjectID.isin([str(n) for n in range(100000,100100)])]

students = list(temp.SubjectID.unique())
masks = [subjectID2mask_re_2019(list(id2names[student])) for student in students]

# ifound = df[~(df.InsertText.isna())&(df.InsertText.str.contains(mask_re, flags=re.IGNORECASE))]
# dfound = df[~(df.DeleteText.isna())&(df.DeleteText.str.contains(mask_re, flags=re.IGNORECASE))]

for student in students:
    df = temp[temp.SubjectID == student]
    mask_re = subjectID2mask_re_2019(list(id2names[student]))
    ifound = df[~(df.InsertText.isna())&(df.InsertText.str.contains(mask_re, flags=re.IGNORECASE))]
    dfound = df[~(df.DeleteText.isna())&(df.DeleteText.str.contains(mask_re, flags=re.IGNORECASE))]
    if len(ifound) > 0:
        print('insert', student, len(ifound), ifound.AssignmentID.values, mask_re)
#         display(ifound)
    if len(dfound) > 0:
        print('delete', student, len(dfound), ifound.AssignmentID.values, mask_re)

These are submissions where an identifying string somehow ended up in the InsertText or DeleteText. If this happens, remove the entire submission.

In [55]:
to_remove = set(['100007130','100068132','100084204','100087204','100116129','100183204','100206195','100208132','100212138','100241132','100287200','100326204','100336133','100387136','100393133','100404129','100416132','100417130','100417132','100418133','100425204','100427132','100435128','100435136','100444138','100452130','100461204','100482195','100507204','100511138'])
df_2019[~((df_2019.SubjectID.astype('str')+df_2019.AssignmentID.astype('str')).isin(to_remove))].to_csv('data-2019/phanon2ps2-3.csv', index=False)



In [120]:
df_2019[(df_2019.SubjectID == '100068')].AssignmentID.unique()

array(['128', '130', '134', '136'], dtype=object)

## Write a single user to test.csv

In [100]:
# Returns the dataframe matching the header and the dataframe not matching the header
def split_df_from_header(df, header, return_rest=False):
    s = header
    s = s[4:]
    s = s.split()
    student = s[0]
    assignment = s[2]
    masked = (df.SubjectID == student)&(df.AssignmentID == assignment)
    if return_rest:
        return df[masked], df[~masked]
    return df[masked], None
    

In [138]:
# df_2019[df_2019.SubjectID.isin([str(n) for n in range(100007,100008)])].to_csv('../website/KeystrokePlayback/test.csv', index=False)

# problems = [
#     ('____100176 - 130 - tasks.py****','montgoomery'),
#     ('____100380 - 132 - tasks.py****','zhenngxi'),
#     ('____100481 - 200 - tasks.py****','harmom'),
#     ('____100130 - 136 - tasks.py****','grvuer')
# ]

header = '____100103 - 207 - tasks.py****'
split_df_from_header(df_2019, header)[0].to_csv('../website/KeystrokePlayback/test.csv', index=False)
# s = s[4:]
# s = s.split()
# student = s[0]
# assignment = s[2]
# df_2019[(df_2019.SubjectID == student)&(df_2019.AssignmentID == assignment)]

# for i in range(100015, 100550, 5):
#     df_2019[df_2019.SubjectID.isin([str(n) for n in range(i,i+5)])].to_csv(f'../website/KeystrokePlayback/test{i}.csv', index=False)


# To fix


# Done
* 100176 - 130 - montgoomery
* 100380 - 132 - zhenngxi
* 100481 - 200 - harmom
* 100130 - 136 - grvuer
* 100007 130 Erin Griffin edit 4745, 5513 -- just delete the participant?
* setValue can have nan source location: test.loc[(test.EventType == 'File.Edit')&(test.SourceLocation.isna()), 'SourceLocation'] = 0
* 100003 Garnder
* 100030 O'Loughlin
* 100036 Jake Miller
* 100034 Mary Chidester
* 100108 - 132 - task1.py -- "i am will"
* 100108 - 136 - task0.py -- will perfect
* 100184 - 128 - task0.py -- jimmy

## Take out common words and symbols from "check" files

In [82]:
run = 'data-2019/run3'
# file = 'heads'
file = 'programs'
# file = 'deleted'

with open(run+'/aaa-masked-strings.txt') as f:
    text = f.read()
masked = set([s.strip("' \n").lower() for s in text.split(',')])

# Words that we will pull out of the check text
# with open('words-no-names.txt') as f:
with open('wordsbig.txt') as f:
    lines = f.readlines()
words = set([w.strip() for w in lines])

# The file to reduce
with open(run+'/'+file+'.txt') as f:
    lines = f.readlines()

# Don't pull out any words that we masked
words = words - set(masked)
words = words - set(['http', 'zoom'])

# Add some common words to pull out
words = words.union(set(['pendown','penup','fillcolor','setheading','phanon']))

header2body = {}
for line in lines:
    if line[:7] == '____100':
        header = line.strip()
        header2body[header] = set()
    else:
        keep = [w.lower().strip() for w in re.split('[^a-zA-Z]', line)]
        keep = [w for w in keep if len(w)>0 and w not in words]
        for i in re.finditer('[0-9]{5}', line):
            keep.append(i[0])
        if len(keep)>0:
            header2body[header] = header2body[header].union(set(keep))

s = ''
for header, body in header2body.items():
    if len(body) > 0:
        s = s + '\n\n' + header + '\n' + ' '.join(body)

with open(run+'/'+file+'-min.txt', 'w') as f:
    f.write(s)

In [40]:
df2.columns
df2 = df2[['native_index', 'user_id', 'project_id', 'task',
       'change_type', 'code_added', 'code_removed', 'timestamp', 'input',
       'output', 'has_error', 'user_terminated', 'startLine', 'startCol',
       'endLine', 'endCol', 'operation', 'key', 'elapsed', 'change_index',
       'ID', 'ID_no_task', 'SubjectID', 'EventID', 'AssignmentID',
       'CodeStateSection', 'EventType', 'InsertText', 'DeleteText',
       'SourceLocation', 'ClientTimestamp', 'EditType']]