In [2]:
import pandas as pd
import numpy as np
import re
import time
import traceback

# Supporting functions

In [3]:
def split_text(text):
    """Split ``text`` at newline characters, keeping each newline on the line it ends.

    The result always has at least one element; the final element is whatever
    follows the last newline (the empty string when ``text`` ends with one, or
    all of ``text`` when it contains no newline at all).
    """
    # str.split always yields at least one piece, so the unpacking cannot fail.
    *terminated, tail = text.split('\n')
    return [piece + '\n' for piece in terminated] + [tail]
    
def insert(lines, text, irow, icol):
    """Return a new list of lines with ``text`` inserted at row ``irow``, column ``icol``.

    ``lines`` is a list of strings as produced by ``split_text``; an empty list
    is treated as a single empty line. ``icol`` may equal the length of the
    row, in which case the text lands at the start of the following row.

    Raises:
        IndexError: if ``irow`` is not a valid row, or ``icol`` is greater than
            the length of that row.
    """
    if not lines:
        lines = ['']
    target = lines[irow]
    if icol > len(target):
        shown = '"' + target.replace(' ', '*').replace('\n', '\\n') + '"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={shown}")

    # Work on the target row glued to the row after it (when there is one), so
    # that a column just past the target's newline addresses the next row.
    head = lines[:irow]
    merged = ''.join(lines[irow:irow+2])
    tail = lines[irow+2:]

    pieces = split_text(merged[:icol] + text + merged[icol:])
    if not tail:
        return head + pieces
    # With rows still to come, the last piece from split_text is dropped before
    # the remaining rows are re-attached.
    return head + pieces[:-1] + tail

# print(split_text(''))
# print(split_text('\n'))
# print(split_text('abc'))
# print(split_text('abc\n'))
# print(split_text('abc\ndef'))
# print(split_text('abc\ndef\n'))
# print(split_text('abcdef\n\ndef'))
# lines = insert(['abc\n','def'], 'def', 0, 3)
# print('*'.join(lines))
# print(insert([], 'abc', 0, 0))
# print(insert([''], 'abc', 0, 0))
# print(insert([], 'abc\n', 0, 0))
# print(insert([''], 'abc\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n', ''], 'def', 0, 0))
# print(insert(['abc\n', ''], 'def', 0, 3))
# print(insert(['abc\n', ''], 'def', 1, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n','def'], 'def\n', 0, 3))


In [4]:
def add_empty_line_cond(lines):
    """Return the rows to append so that ``lines`` ends with an unterminated row.

    Gives ``['']`` when ``lines`` is empty or its last row ends with a newline,
    otherwise ``[]``.
    """
    if not lines or lines[-1].endswith('\n'):
        return ['']
    return []

#        i j
# i,j aaaddaaa
#
#        i j
# i,j aaadd
#     aaaaa    
#
#      i
# i aaadd
#   ddddddd
# j ddaaaaa
#     j
def remove_impl(lines, irow, icol, jrow, jcol):
    """Return a new list of lines with the span (irow, icol) .. (jrow, jcol) removed.

    Everything before ``icol`` on row ``irow`` is kept and joined to everything
    from ``jcol`` onwards on row ``jrow``; rows strictly between the two are
    dropped. If the joined row is empty it is dropped as well. If it no longer
    ends with a newline and another row follows, that row is appended to it.

    An end position of (len(lines), 0) is not special-cased: ``jrow`` must be
    an existing row.

    Raises:
        IndexError: if ``irow`` or ``jrow`` is not a valid row, or ``icol`` is
            greater than the length of row ``irow``.
    """
    first = lines[irow]
    if icol > len(first):
        shown = '"' + first.replace(' ', '*').replace('\n', '\\n') + '"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={shown}")

    # Same-row removal indexes through irow only.
    if irow == jrow:
        jrow = irow

    head = lines[:irow]
    kept = first[:icol] + lines[jrow][jcol:]
    if not kept:
        # Nothing survives of the affected rows.
        return head + lines[jrow+1:]
    if kept[-1] != '\n' and jrow < len(lines)-1:
        # The row's newline was removed, so the following row joins it.
        return head + [kept + lines[jrow+1]] + lines[jrow+2:]
    return head + [kept] + lines[jrow+1:]
    
def remove(lines, irow, icol, jrow, jcol):
    """Remove the span (irow, icol) .. (jrow, jcol) and keep a trailing unterminated row.

    Delegates the removal to ``remove_impl`` and then appends whatever
    ``add_empty_line_cond`` returns for the result.
    """
    remaining = remove_impl(lines, irow, icol, jrow, jcol)
    padding = add_empty_line_cond(remaining)
    return remaining + padding

# print(remove(['abc\n', ''], 0, 0, 0, 1))
# print(remove(['abc\n', ''], 0, 0, 0, 2))
# print(remove(['abc\n', ''], 0, 0, 0, 3))
# print(remove(['abc\n', ''], 0, 0, 0, 4))
# print(remove(['abc\n', ''], 0, 1, 0, 3))
# print(remove(['abc\n', 'def'], 0, 1, 1, 0))
# print(remove(['abc\n', 'def'], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 0))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 4))
# print(remove(['abc\n', 'def\n', ''], 1, 3, 1, 4))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 2, 1))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 3, 0))


# Conversion function

In [109]:
def phanon2progsnap2(df, debug = False):
    """Replay edit events in row order and return a copy of ``df`` with ProgSnap2-style columns.

    Each row is applied to an in-memory list of lines (through ``remove`` and
    ``insert``) so that the edit can be given a flat character offset, stored
    in ``change_index`` and copied to ``SourceLocation``.

    SourceLocation values written by this function:
      * offset >= 0 -- position of the edit in the reconstructed text; a
        ``setValue`` row resets the text first (0 when it adds no text).
      * NaN -- the row did not change the text.
      * -1  -- the row raised while being applied, or it lies between such a
        failure and the next ``setValue`` (the event is invalid).
      * -2  -- the first ``setValue`` after a failure: its added text is valid,
        but its removed text should be discarded.

    Args:
        df: events to convert. Columns read here: startLine, startCol, endLine,
            endCol, change_type, code_added, code_removed, timestamp, user_id,
            native_index, project_id, task.
        debug: when True, the first failing row is displayed with a traceback
            and processing stops; the result is truncated to the rows before
            it. When False, the failure is reported via ``display`` and rows
            are marked -1 until the next ``setValue``.

    Returns:
        A new DataFrame with the added columns change_index, SubjectID, EventID,
        AssignmentID, CodeStateSection, EventType, InsertText, DeleteText,
        SourceLocation, ClientTimestamp and EditType.
    """
    test = df.copy()
    change_indices = []
    lines = ['']
    skip_to_checkpoint = False
    # i is the 1-based row counter shown in debug output.
    for i, (_, row) in enumerate(test.iterrows(), start=1):
        try:
            irow = row.startLine
            icol = row.startCol
            jrow = row.endLine
            jcol = row.endCol
            change_type = row.change_type
            added = row.code_added
            removed = row.code_removed
            changed = False
            recovering = False
            change_index = np.nan
            if skip_to_checkpoint:
                if change_type == 'setValue':
                    # First checkpoint after a failure: replay resumes here,
                    # and the row is marked -2 once it has been applied.
                    recovering = True
                    skip_to_checkpoint = False
                else:
                    # SourceLocation = -1 means remove the event -- it's invalid
                    change_index = -1
            if not skip_to_checkpoint:
                if change_type == 'setValue':
                    # A setValue replaces the whole text, so start from empty.
                    lines = ['']
                    change_index = 0
                # `x == x` is False only for NaN, i.e. a missing value.
                elif removed and removed == removed and (len(lines)>1 or len(lines[0])>0):
                    changed = True
                    irow = int(irow)
                    icol = int(icol)
                    jrow = int(jrow)
                    jcol = int(jcol)
                    lines = remove(lines, irow, icol, jrow, jcol)
                if added and added == added and added != '':
                    changed = True
                    irow = int(irow)
                    icol = int(icol)
                    lines = insert(lines, added, irow, icol)
                if changed:
                    # Flat offset = characters in the rows above + column.
                    change_index = len(''.join(lines[:irow]))+icol
                if recovering:
                    # SourceLocation = -2 means drop the removed value but keep
                    # the added value. Assigned last so that the offsets
                    # computed above cannot overwrite the marker.
                    change_index = -2
            change_indices.append(change_index)
        except Exception as e:
            if debug:
                display('i={}: {}'.format(i,e))
                print(''.join(lines).replace(' ', '·'))
                display(row)
                traceback.print_exc()
                # Keep only the rows that were processed successfully.
                test = test[:len(change_indices)].copy()
                break
            else:
                change_indices.append(-1)
                display(f'timestamp={row.timestamp}: {e}')
                skip_to_checkpoint = True

    test['change_index'] = change_indices
    test['SubjectID'] = test.user_id
    test['EventID'] = test.native_index
    test['AssignmentID'] = test.project_id
    test['CodeStateSection'] = test.task
    test['EventType'] = test.change_type
    test['InsertText'] = test.code_added
    test['DeleteText'] = test.code_removed
    test['SourceLocation'] = test.change_index
    test['ClientTimestamp'] = test.timestamp

    # change_type values mapped here:
    # 'RUN', 'SUBMIT', 'TASK', 'setValue', '+delete', '+input', 'paste',
    # 'undo', 'redo', 'cut', 'drag'
    test['EventType'] = test['EventType'].replace({
        '+input':'File.Edit','+delete':'File.Edit',
        'undo':'File.Edit','redo':'File.Edit',
        'cut':'File.Edit','paste':'File.Edit','drag':'File.Edit',
        'RUN':'Run.Program','SUBMIT':'Submit',
        'TASK':'X-SwitchTask',
        'setValue':'File.Edit'
    })

    # EditType is only filled in for File.Edit rows, keyed by the original change_type.
    edit_types = {
        '+input': 'Insert',
        '+delete': 'Delete',
        'setValue': 'X-Checkpoint',
        'undo': 'Undo',
        'redo': 'Redo',
        'paste': 'Paste',
        'cut': 'Cut',
        'drag': 'Drag',
    }
    for source_type, edit_type in edit_types.items():
        selected = (test.EventType == 'File.Edit')&(test.change_type == source_type)
        test.loc[selected, 'EditType'] = edit_type
    return test

# Do the conversion

In [None]:
# df = pd.read_csv('data-2019/keystrokes.csv')
# Load the project event log that the cells below prepare and convert.
# The path is relative to the notebook's working directory.
df = pd.read_csv('data-2019/src/project-events.csv')

In [71]:
# Name the CSV's unnamed leading column, then order the events per user by
# time, using native_index to break timestamp ties.
df = (
    df.rename(columns={'Unnamed: 0': 'native_index'})
      .sort_values(by=['user_id', 'timestamp', 'native_index'])
)
df['change_index'] = np.nan

# Grouping keys are built by plain string concatenation (no separator).
user_part = df['user_id'].astype('str')
project_part = df['project_id'].astype('str')
df['ID'] = user_part + project_part + df['task'].astype('str')
df['ID_no_task'] = user_part + project_part

In [113]:

# Convert the selected users' events one group at a time and export the result.
test = df[df.user_id.isin(list(range(100036,100037)))].copy().reset_index()
# test = test[test.project_id == 133]
# test = phanon2progsnap2(test, False)
# test.CodeStateSection = 0
copy = test

# Group by user+project only (tasks are merged). `copy` is the same object as
# `test`, so this also rewrites test.ID.
copy['ID'] = copy['ID_no_task']
# copy.ID = copy.user_id

dfs = []
num_failed = 0
num_total = 0
for ID in copy.ID.unique():
    print(ID)
    num_total = num_total + 1
    subdf = copy[copy.ID == ID]
    try:
        dfs.append(phanon2progsnap2(subdf, False))
    except Exception:
        # Catch Exception rather than everything, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        print('{} - Reconstruction failed'.format(ID))
        traceback.print_exc()
        num_failed = num_failed + 1

print(f'Number of failed reconstructions: {num_failed}/{num_total}')
copy = pd.concat(dfs)
copy['CodeStateSection'] = 0

# SourceLocation == -2 marks a checkpoint reached after a failed replay: blank
# its removed text and place it at offset 0.
is_recovery_checkpoint = copy.SourceLocation == -2
copy.loc[is_recovery_checkpoint, 'code_removed'] = ''
copy.loc[is_recovery_checkpoint, 'DeleteText'] = ''
copy.loc[is_recovery_checkpoint, 'SourceLocation'] = 0

# SourceLocation == -1 marks events that could not be replayed; drop them.
copy = copy[copy.SourceLocation != -1]
copy.to_csv('../website/KeystrokePlayback/test.csv', index=False)
# copy.to_csv('phanon2ps2.csv', index=False)

100036129
100036131
100036133


'timestamp=1549389384647: list index out of range'

100036135
100036138
Number of failed reconstructions: 0/5


In [None]:
def _show_event_counts(events):
    """Print the row count, then display the distinct EventType values and per-type counts."""
    print(len(events))
    display(events.EventType.unique())
    display(events.groupby('EventType').count())

_show_event_counts(copy)

# Fold any raw 'setValue' event type into File.Edit and label those rows as Replace edits.
copy.EventType = copy.EventType.replace({'setValue':'File.Edit'})
is_set_value_edit = (copy.EventType == 'File.Edit')&(copy.change_type == 'setValue')
copy.loc[is_set_value_edit, 'EditType'] = 'Replace'

_show_event_counts(copy)
copy.to_csv('phanon2ps2-2.csv', index=False)

## Fix bug with Jake Miller

In [None]:
# Reload the event log from disk for debugging.
# NOTE(review): this replaces the prepared `df` (renamed, sorted, with the
# change_index / ID / ID_no_task columns) with the raw CSV, yet the table saved
# under the next cell still shows those columns -- presumably the cells were
# run out of order; confirm before relying on a top-to-bottom run.
df = pd.read_csv('data-2019/src/project-events.csv')


In [56]:
# Look at user 100036's events in a window around timestamp 1549389384647,
# the event reported as failing in the conversion cell's saved output.
n1 = 400000000
n2 = 10000
window_start = 1549389384647 - n1
window_end = 1549389384647 + n2

test = df[df.user_id == 100036].copy()
# test = test[test.project_id == 133]
inside_window = (test.timestamp > window_start) & (test.timestamp < window_end)
test = test[inside_window]
test

Unnamed: 0,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,output,...,startLine,startCol,endLine,endCol,operation,key,elapsed,change_index,ID,ID_no_task
5415272,2709472,100036,131,0,TASK,,,1549063050087,,,...,,,,,TASK,,81441639.0,,1000361310,100036131
5415273,2709471,100036,131,0,setValue,# this is going to calculate future investment...,#We are now going to create a target that is s...,1549063050099,,,...,0.0,0.0,61.0,0.0,setValue,,12.0,,1000361310,100036131
5415274,2709473,100036,131,0,RUN,,,1549063059375,,,...,,,,,RUN,,9276.0,,1000361310,100036131
5415275,3964866,100036,133,0,+input,,,1549389384647,,,...,2.0,6.0,2.0,6.0,+input,space,326325272.0,,1000361330,100036133
5415276,3964867,100036,133,0,+input,6,,1549389385827,,,...,2.0,7.0,2.0,7.0,+input,6,1180.0,,1000361330,100036133
5415277,3964868,100036,133,0,+delete,,e,1549389387697,,,...,1.0,7.0,1.0,8.0,+delete,delete,1870.0,,1000361330,100036133
5415278,3964869,100036,133,0,+delete,,s,1549389387932,,,...,1.0,6.0,1.0,7.0,+delete,delete,235.0,,1000361330,100036133
5415279,3964870,100036,133,0,+delete,,r,1549389388144,,,...,1.0,5.0,1.0,6.0,+delete,delete,212.0,,1000361330,100036133
5415280,3964871,100036,133,0,+delete,,u,1549389388351,,,...,1.0,4.0,1.0,5.0,+delete,delete,207.0,,1000361330,100036133
5415281,3964872,100036,133,0,+delete,,o,1549389388550,,,...,1.0,3.0,1.0,4.0,+delete,delete,199.0,,1000361330,100036133


In [19]:
# Select one subject's converted events and run the de-identification over them.
# NOTE(review): `df_2019` is not defined in any cell visible in this notebook
# (the saved output of this cell is a NameError), and `deidentifyps2` is not
# defined here either -- both presumably come from another notebook or module;
# confirm and import/load them explicitly.
test = df_2019[df_2019.SubjectID.isin(['100036'])].copy()

# test = df_2019[df_2019.SubjectID.isin(list(range(100030,100040)))]
# display(test.head())
# test = test[test.AssignmentID.isin(['129','131','200'])]
# test = test.sort_values(['user_id','timestamp','native_index'])

# test = test[test.CodeStateSection == 'task0.py']

# test.to_csv('test.csv', index=False)
# with open('test.csv') as f:
#     text = f.read()
# text = re.sub(re.compile('jake miller', flags=re.IGNORECASE), '@@@@@@@@@@@', text)
# # print(text[:100])
# with open('test.csv', 'w') as f:
#     f.write(text)
    
# test = pd.read_csv('test.csv')
# test.SubjectID = test.SubjectID.astype('str')
# test.AssignmentID = test.AssignmentID.astype('str')

# test = test[~(test.EditType == 'Replace')]

# If setValue has only a delete then the source location is going to be nan. Fix
# that to be 0.
# test[(test.EventType == 'File.Edit')&(test.SourceLocation.isna())]
# test[~(test.DeleteText.isna())&(test.SourceLocation.isna())]
test.loc[(test.EventType == 'File.Edit')&(test.SourceLocation.isna()), 'SourceLocation'] = 0


# display(test)

# # test[~(test.DeleteText.isna())&(test.EventType != 'File.Edit')].head()

# # test = test.iloc[:1180].copy().reset_index()
# test = test.copy().reset_index()
# # program, deleted_text, entries, deletes, deleted_inserts = reconstruct(test.iloc[:990])
# program, deleted_text, entries, deletes, deleted_inserts = reconstruct(test)
# # print(deleted_text)
# # with open('test.txt', 'w') as f:
# #     f.write(deleted_text)

# # for i in re.finditer('jake', deleted_text, flags=re.IGNORECASE):
# #     print(i)

# mask_re = 'ake m'
# replace_with = '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'

# masked = test.copy()
# # masked, ms1 = mask(masked, program, entries, True, mask_re, replace_with)
# masked, ms2 = mask(masked, deleted_text, deletes, False, mask_re, replace_with, deleted_inserts)

# program, deleted_text, entries, deletes, deleted_inserts = reconstruct(masked.iloc[:2060])
# print(program)
# masked.to_csv('test.csv')

# Everything
# Maps a SubjectID (as a string) to the list of names passed to deidentifyps2 for that subject.
# NOTE(review): these look like real participant names hard-coded in the
# notebook; consider loading the mapping from a file kept out of version control.
id2names = {'100034':['Mary Chidester'], '100036':['jake miller']}
test,ms,programs,program_heads,deleted = deidentifyps2(test, id2names=id2names, header_offset=0)

# Written with the default index column (no index=False), unlike the exports above.
test.to_csv('test.csv')


NameError: name 'df_2019' is not defined