In [2]:
import pandas as pd
import numpy as np
import re
import time
import traceback

# Supporting functions

In [3]:
def split_text(text):
    """Split ``text`` into pieces, keeping the newline at the end of each piece.

    Every piece except the last ends with a newline character. The last piece
    is whatever follows the final newline, which is the empty string when
    ``text`` is empty or ends with a newline. Joining the pieces gives back
    ``text``.
    """
    pieces = text.split('\n')
    # The final piece never gets a terminator put back; everything before it does.
    tail = pieces.pop()
    return [piece + '\n' for piece in pieces] + [tail]
    
def insert(lines, text, irow, icol):
    """Return a new line buffer with ``text`` inserted at row ``irow``, column ``icol``.

    ``lines`` is a list of strings; an empty list is treated as ``['']``.
    The input list is not modified.

    Raises:
        IndexError: if ``icol`` is greater than the length of row ``irow``
            (or if ``irow`` itself is not a valid index).
    """
    if not lines:
        lines = ['']
    target = lines[irow]
    if icol > len(target):
        shown = '"' + target.replace(' ', '*').replace('\n', '\\n') + '"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={shown}")

    # Work on the target row glued to the row after it (when there is one):
    # a column equal to the row's full length, newline included, then lands
    # at the start of the following row's text.
    head = lines[:irow]
    pair = ''.join(lines[irow:irow+2])
    rest = lines[irow+2:]

    pieces = split_text(pair[:icol] + text + pair[icol:])
    if rest:
        # More rows follow the pair, so the last piece of the split is
        # dropped before the untouched rows are appended.
        pieces = pieces[:-1]
    return head + pieces + rest

# print(split_text(''))
# print(split_text('\n'))
# print(split_text('abc'))
# print(split_text('abc\n'))
# print(split_text('abc\ndef'))
# print(split_text('abc\ndef\n'))
# print(split_text('abcdef\n\ndef'))
# lines = insert(['abc\n','def'], 'def', 0, 3)
# print('*'.join(lines))
# print(insert([], 'abc', 0, 0))
# print(insert([''], 'abc', 0, 0))
# print(insert([], 'abc\n', 0, 0))
# print(insert([''], 'abc\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n', ''], 'def', 0, 0))
# print(insert(['abc\n', ''], 'def', 0, 3))
# print(insert(['abc\n', ''], 'def', 1, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 0))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 0, 3))
# print(insert(['abc\n', 'ghi\n', ''], 'def\n', 1, 0))
# print(insert(['abc\n','def'], 'def\n', 0, 3))


In [4]:
def add_empty_line_cond(lines):
    """Return the suffix needed so that a line buffer ends with an open line.

    Gives ``['']`` when ``lines`` is empty or its last entry ends with a
    newline, and ``[]`` otherwise. The caller appends the result to ``lines``.
    """
    if not lines:
        return ['']
    return [''] if lines[-1].endswith('\n') else []

#        i j
# i,j aaaddaaa
#
#        i j
# i,j aaadd
#     aaaaa    
#
#      i
# i aaadd
#   ddddddd
# j ddaaaaa
#     j
def remove_impl(lines, irow, icol, jrow, jcol):
    """Delete the text between (``irow``, ``icol``) and (``jrow``, ``jcol``).

    Returns a new list of lines; ``lines`` is not modified. The start of row
    ``irow`` (up to ``icol``) is joined with the end of row ``jrow`` (from
    ``jcol``), and every row strictly between them disappears.

    Raises:
        IndexError: if ``icol`` is greater than the length of row ``irow``,
            or if ``irow`` / ``jrow`` is not a valid index into ``lines``.
    """
    first = lines[irow]
    if icol > len(first):
        shown = '"' + first.replace(' ', '*').replace('\n', '\\n') + '"'
        raise IndexError(f"Column out of range: irow={irow}, icol={icol}, line={shown}")

    # Same-row and multi-row deletions are the same operation: keep what is
    # left of the start position and what is right of the end position.
    merged = first[:icol] + lines[jrow][jcol:]
    head = lines[:irow]

    if not merged:
        # Nothing survived on the affected rows, so they vanish entirely.
        return head + lines[jrow+1:]
    if not merged.endswith('\n') and jrow < len(lines)-1:
        # The newline that ended the row was deleted: pull the next row up.
        return head + [merged + lines[jrow+1]] + lines[jrow+2:]
    return head + [merged] + lines[jrow+1:]
    
def remove(lines, irow, icol, jrow, jcol):
    """Delete a (row, col) range from ``lines`` and return the new buffer.

    The deletion itself is done by ``remove_impl``; afterwards whatever
    ``add_empty_line_cond`` returns for the result is appended to it.
    """
    remaining = remove_impl(lines, irow, icol, jrow, jcol)
    padding = add_empty_line_cond(remaining)
    return remaining + padding

# print(remove(['abc\n', ''], 0, 0, 0, 1))
# print(remove(['abc\n', ''], 0, 0, 0, 2))
# print(remove(['abc\n', ''], 0, 0, 0, 3))
# print(remove(['abc\n', ''], 0, 0, 0, 4))
# print(remove(['abc\n', ''], 0, 1, 0, 3))
# print(remove(['abc\n', 'def'], 0, 1, 1, 0))
# print(remove(['abc\n', 'def'], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 0))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 3))
# print(remove(['abc\n', 'def\n', ''], 0, 1, 1, 4))
# print(remove(['abc\n', 'def\n', ''], 1, 3, 1, 4))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 2, 1))
# print(remove(['abc\n', 'def\n', 'g'], 2, 0, 3, 0))


# Conversion function

In [5]:
def phanon2progsnap2(df, debug = False):
    """Replay editor events and add ProgSnap2-style columns to a copy of ``df``.

    All rows are replayed, in order, against ONE in-memory line buffer, so the
    caller is expected to pass a single editing stream at a time. For every
    row whose edit was applied, the (line, column) start position is turned
    into a flat character offset into the buffer.

    Replay rules, per row:
      * ``code_removed`` is deleted from the buffer unless the row is a
        ``setValue`` or the buffer is still empty.
      * ``setValue`` resets the buffer to empty before anything is inserted.
      * a non-empty ``code_added`` is inserted at (startLine, startCol).
      * rows where neither step ran get NaN as their offset. This includes a
        ``setValue`` that only removes text.

    Parameters:
        df: DataFrame with the columns startLine, startCol, endLine, endCol,
            change_type, code_added, code_removed, user_id, native_index,
            project_id, task and timestamp. It is not modified.
        debug: when True, the first replay error is displayed (message,
            buffer, offending row, traceback) and the result is truncated to
            the rows replayed before it. When False the error propagates.

    Returns:
        A copy of ``df`` with ``change_index`` plus SubjectID, EventID,
        AssignmentID, CodeStateSection, EventType, InsertText, DeleteText,
        SourceLocation, ClientTimestamp and EditType.
    """
    # New column -> source column, in output order.
    column_sources = {
        'SubjectID': 'user_id',
        'EventID': 'native_index',
        'AssignmentID': 'project_id',
        'CodeStateSection': 'task',
        'EventType': 'change_type',
        'InsertText': 'code_added',
        'DeleteText': 'code_removed',
        'SourceLocation': 'change_index',
        'ClientTimestamp': 'timestamp',
    }
    # change_type -> EventType. 'setValue' has no entry, so it passes through
    # unchanged and is not labelled File.Edit by this function.
    event_types = {
        '+input': 'File.Edit', '+delete': 'File.Edit',
        'undo': 'File.Edit', 'redo': 'File.Edit',
        'cut': 'File.Edit', 'paste': 'File.Edit', 'drag': 'File.Edit',
        'RUN': 'Run.Program', 'SUBMIT': 'Submit',
        'TASK': 'X-SwitchTask',
    }
    # change_type -> EditType, applied only to rows whose EventType is
    # File.Edit. The 'setValue' entry matches nothing while setValue stays
    # unmapped above; it is kept so behavior follows the EventType mapping.
    edit_types = {
        '+input': 'Insert', '+delete': 'Delete', 'setValue': 'Insert',
        'undo': 'Undo', 'redo': 'Redo',
        'paste': 'Paste', 'cut': 'Cut', 'drag': 'Drag',
    }

    test = df.copy()
    change_indices = []
    lines = ['']
    for i, (_, row) in enumerate(test.iterrows(), start=1):
        try:
            irow = row.startLine
            icol = row.startCol
            jrow = row.endLine
            jcol = row.endCol
            added = row.code_added
            removed = row.code_removed
            is_set_value = row.change_type == 'setValue'
            buffer_has_text = len(lines) > 1 or len(lines[0]) > 0
            changed = False
            # `x == x` is False only for NaN, i.e. a missing cell.
            if removed and removed == removed and not is_set_value and buffer_has_text:
                changed = True
                irow = int(irow)
                icol = int(icol)
                jrow = int(jrow)
                jcol = int(jcol)
                lines = remove(lines, irow, icol, jrow, jcol)
            if is_set_value:
                lines = ['']
            if added and added == added:
                changed = True
                irow = int(irow)
                icol = int(icol)
                lines = insert(lines, added, irow, icol)
            change_index = np.nan
            if changed:
                # Rows before irow are untouched by the edit, so their total
                # length plus the column is the flat offset of the edit.
                change_index = len(''.join(lines[:irow])) + icol
            change_indices.append(change_index)
        except Exception as e:
            if not debug:
                # Bare raise keeps the original traceback intact.
                raise
            display('i={}: {}'.format(i,e))
            print(''.join(lines).replace(' ', '·'))
            display(row)
            traceback.print_exc()
            # Keep only the rows that were replayed successfully; copy so the
            # column writes below do not target a slice of another frame.
            test = test[:len(change_indices)].copy()
            break

    # Item assignment creates the column when the input has none; attribute
    # assignment would only set an instance attribute in that case.
    test['change_index'] = change_indices
    for target, source in column_sources.items():
        test[target] = test[source]
    test['EventType'] = test['EventType'].replace(event_types)

    # Create EditType up front (object dtype, all missing) so it exists even
    # when no row is an edit. An existing EditType column is left in place
    # and only matching rows are overwritten.
    if 'EditType' not in test.columns:
        test['EditType'] = np.array([np.nan] * len(test), dtype=object)
    is_edit = test['EventType'] == 'File.Edit'
    for change_type, edit_type in edit_types.items():
        test.loc[is_edit & (test['change_type'] == change_type), 'EditType'] = edit_type
    return test

In [None]:
# Bad idea

# def clean_task_switches(df):
#     # Look nearby (within two rows) for rows with large both inserts and
#     # deletes and clean them
#     df = df.copy()
#     indices = df.index[df.change_type == 'TASK']
#     for row_idx in indices:
#         arr = np.intersect1d(df.index[(df.code_added.str.len() > 5)&
#                                       (df.code_removed.str.len() > 5)],
#                              pd.RangeIndex(row_idx-2, row_idx+3))
#         df.loc[arr,'code_added'] = ''
#         df.loc[arr,'code_removed'] = ''
#     return df

# test
# clean_task_switches(test)

# Do the conversion

In [None]:
# Load the event table that the cells below sort and convert. The path is
# relative to the notebook's working directory.
# df = pd.read_csv('data-2019/keystrokes.csv')
df = pd.read_csv('data-2019/src/project-events.csv')

In [11]:
# Name the CSV's unnamed first column, order events per user by time (ties
# broken by native_index), and add the bookkeeping columns used below.
# NOTE(review): ID / ID_no_task are plain string concatenations without a
# separator, so different (user, project, task) triples can in principle
# produce the same key.
df = (
    df.rename(columns={'Unnamed: 0':'native_index'})
      .sort_values(['user_id','timestamp','native_index'])
      .assign(
          change_index=np.nan,
          ID=lambda d: d.user_id.astype('str') + d.project_id.astype('str') + d.task.astype('str'),
          ID_no_task=lambda d: d.user_id.astype('str') + d.project_id.astype('str'),
      )
)

In [None]:

# Convert every ID group separately (each group is replayed against its own
# buffer inside phanon2progsnap2) and write the concatenated result.
copy = df.copy()#[(df.user_id == 100338)|(df.user_id == 100339)|(df.user_id == 100340)].copy()
# copy = clean_task_switches(copy)
dfs = []
for ID in copy.ID.unique():
# for ID in copy.ID_no_task.unique():
    print(ID)
    subdf = copy[copy.ID == ID]
#     subdf = copy[copy.ID_no_task == ID]
    try:
        dfs.append(phanon2progsnap2(subdf, False))
    except Exception as e:
        # Best effort: a group that cannot be replayed is skipped, but the
        # reason is reported. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop the loop as expected.
        print('{} - Reconstruction failed: {!r}'.format(ID, e))

copy = pd.concat(dfs)
copy.to_csv('phanon2ps2.csv', index=False)

In [None]:
# Event counts before relabelling.
print(len(copy))
display(copy['EventType'].unique())
display(copy.groupby('EventType').count())

# Fold setValue events into File.Edit and tag them with EditType 'Replace'.
copy['EventType'] = copy['EventType'].replace({'setValue':'File.Edit'})
copy.loc[(copy['EventType'] == 'File.Edit') & (copy['change_type'] == 'setValue'), 'EditType'] = 'Replace'

# Event counts after relabelling; the row count is printed again for comparison.
print(len(copy))
display(copy['EventType'].unique())
display(copy.groupby('EventType').count())
copy.to_csv('phanon2ps2-2.csv', index=False)

## Fix bug with Jake Miller

In [None]:
# Re-read the source CSV. This rebinds `df`, so the renamed/sorted frame built
# earlier in the notebook is no longer what `df` refers to from here on.
df = pd.read_csv('data-2019/src/project-events.csv')


In [None]:
# NOTE(review): this head() is not the cell's last expression, so its result
# is not displayed.
df.head()
# Narrow to one user, then to the events strictly inside a timestamp window.
test = df[df.user_id == 100036].copy()
# NOTE(review): the bounds are hardcoded; presumably epoch milliseconds — confirm.
test = test[(test.timestamp > 1548976440848)&(test.timestamp < 1548976584332)]
test

In [21]:
# Preview the first rows of `test` as it currently stands in the kernel.
test.head()

Unnamed: 0,index,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,...,startLine,startCol,endLine,endCol,operation,key,elapsed,change_index,ID,ID_no_task
0,5409573,1392776,100036,129,0,+input,#,# Don't forget comments that go here,1548367097037,,...,0.0,0.0,0.0,36.0,+input,#,,,1000361290,100036129
1,5409574,1392777,100036,129,0,+input,T,,1548367097787,,...,0.0,1.0,0.0,1.0,+input,T,750.0,,1000361290,100036129
2,5409575,1392778,100036,129,0,+input,h,,1548367098013,,...,0.0,2.0,0.0,2.0,+input,h,226.0,,1000361290,100036129
3,5409576,1392779,100036,129,0,+input,i,,1548367098424,,...,0.0,3.0,0.0,3.0,+input,i,411.0,,1000361290,100036129
4,5409577,1392780,100036,129,0,+input,s,,1548367098555,,...,0.0,4.0,0.0,4.0,+input,s,131.0,,1000361290,100036129


In [25]:
# Convert one user's events for one project and save them for inspection.
# reset_index() runs before the project filter, so the displayed index is the
# row position within that user's events, not within the project subset.
test = df[df.user_id.isin([100036])].copy().reset_index()
test = test[test.project_id == 131]
test = phanon2progsnap2(test, False)
# Overwrite CodeStateSection with 0 on every row.
test.CodeStateSection = 0
display(test)
test.to_csv('test.csv', index=False)

Unnamed: 0,index,native_index,user_id,project_id,task,change_type,code_added,code_removed,timestamp,input,...,SubjectID,EventID,AssignmentID,CodeStateSection,EventType,InsertText,DeleteText,SourceLocation,ClientTimestamp,EditType
2960,5412533,2706746,100036,131,0,+input,i,,1548967184553,,...,100036,2706746,131,0,File.Edit,i,,0.0,1548967184553,Insert
2961,5412534,2706747,100036,131,0,+input,m,,1548967185208,,...,100036,2706747,131,0,File.Edit,m,,1.0,1548967185208,Insert
2962,5412535,2706748,100036,131,0,+input,p,,1548967185460,,...,100036,2706748,131,0,File.Edit,p,,2.0,1548967185460,Insert
2963,5412536,2706749,100036,131,0,+input,o,,1548967185707,,...,100036,2706749,131,0,File.Edit,o,,3.0,1548967185707,Insert
2964,5412537,2706750,100036,131,0,+input,r,,1548967185859,,...,100036,2706750,131,0,File.Edit,r,,4.0,1548967185859,Insert
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5697,5415270,2709451,100036,131,0,SUBMIT,,,1548981552528,,...,100036,2709451,131,0,Submit,,,,1548981552528,
5698,5415271,2709470,100036,131,0,SUBMIT,,,1548981608448,,...,100036,2709470,131,0,Submit,,,,1548981608448,
5699,5415272,2709472,100036,131,0,TASK,,,1549063050087,,...,100036,2709472,131,0,X-SwitchTask,,,,1549063050087,
5700,5415273,2709471,100036,131,0,setValue,# this is going to calculate future investment...,#We are now going to create a target that is s...,1549063050099,,...,100036,2709471,131,0,setValue,# this is going to calculate future investment...,#We are now going to create a target that is s...,0.0,1549063050099,


In [19]:
# NOTE(review): `df_2019` and `deidentifyps2` are not defined anywhere in the
# visible notebook, so this cell fails on a fresh kernel (the recorded output
# is a NameError for df_2019). They need to be loaded/imported in an earlier
# cell before this one can run.
test = df_2019[df_2019.SubjectID.isin(['100036'])].copy()

# test = df_2019[df_2019.SubjectID.isin(list(range(100030,100040)))]
# display(test.head())
# test = test[test.AssignmentID.isin(['129','131','200'])]
# test = test.sort_values(['user_id','timestamp','native_index'])

# test = test[test.CodeStateSection == 'task0.py']

# test.to_csv('test.csv', index=False)
# with open('test.csv') as f:
#     text = f.read()
# text = re.sub(re.compile('jake miller', flags=re.IGNORECASE), '@@@@@@@@@@@', text)
# # print(text[:100])
# with open('test.csv', 'w') as f:
#     f.write(text)
    
# test = pd.read_csv('test.csv')
# test.SubjectID = test.SubjectID.astype('str')
# test.AssignmentID = test.AssignmentID.astype('str')

# test = test[~(test.EditType == 'Replace')]

# If setValue has only a delete then the source location is going to be nan. Fix
# that to be 0.
# test[(test.EventType == 'File.Edit')&(test.SourceLocation.isna())]
# test[~(test.DeleteText.isna())&(test.SourceLocation.isna())]
test.loc[(test.EventType == 'File.Edit')&(test.SourceLocation.isna()), 'SourceLocation'] = 0


# display(test)

# # test[~(test.DeleteText.isna())&(test.EventType != 'File.Edit')].head()

# # test = test.iloc[:1180].copy().reset_index()
# test = test.copy().reset_index()
# # program, deleted_text, entries, deletes, deleted_inserts = reconstruct(test.iloc[:990])
# program, deleted_text, entries, deletes, deleted_inserts = reconstruct(test)
# # print(deleted_text)
# # with open('test.txt', 'w') as f:
# #     f.write(deleted_text)

# # for i in re.finditer('jake', deleted_text, flags=re.IGNORECASE):
# #     print(i)

# mask_re = 'ake m'
# replace_with = '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'

# masked = test.copy()
# # masked, ms1 = mask(masked, program, entries, True, mask_re, replace_with)
# masked, ms2 = mask(masked, deleted_text, deletes, False, mask_re, replace_with, deleted_inserts)

# program, deleted_text, entries, deletes, deleted_inserts = reconstruct(masked.iloc[:2060])
# print(program)
# masked.to_csv('test.csv')

# Everything
# NOTE(review): this literal maps subject IDs to what look like real personal
# names. Consider loading it from a file kept out of version control before
# the notebook is shared.
id2names = {'100034':['Mary Chidester'], '100036':['jake miller']}
# Five values come back; only `test` is used afterwards in this cell.
test,ms,programs,program_heads,deleted = deidentifyps2(test, id2names=id2names, header_offset=0)

# NOTE(review): unlike the other to_csv calls in this notebook, this one does
# not pass index=False, so the index is written as an extra first column.
test.to_csv('test.csv')


NameError: name 'df_2019' is not defined