In [1]:
import pandas as pd

In [2]:
# Helper functions that compute which CWEs were added or removed between a
# post block version and its previous version (set / multiset differences).
def intersection_added_removed_types(row):
    """Return the distinct CWE types added and removed relative to the previous version.

    Despite the name, this computes two set differences, not an intersection.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding iterables under
            the keys 'CWE' and 'CWE_PreviousPostBlockVersion'. An empty string
            is accepted for either value because iterating it yields nothing.

    Returns:
        A tuple ``(added, removed)`` of two lists:
        ``added`` holds the distinct values found in 'CWE' but not in
        'CWE_PreviousPostBlockVersion'; ``removed`` holds the reverse.
        Each list keeps the first-appearance order of its source sequence.
    """
    current = row['CWE']
    previous = row['CWE_PreviousPostBlockVersion']
    current_types = set(current)
    previous_types = set(previous)
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # result does not depend on set iteration order (which can differ between
    # interpreter runs for string values).
    added = [cwe for cwe in dict.fromkeys(current) if cwe not in previous_types]
    removed = [cwe for cwe in dict.fromkeys(previous) if cwe not in current_types]
    return added, removed

def _multiset_difference(items, to_cancel):
    """Return a copy of ``items`` with one occurrence removed per element of ``to_cancel``.

    For every element of ``to_cancel`` the first still-present equal element of
    the copy is removed; elements of ``to_cancel`` without a match are ignored.
    The relative order of the surviving elements is preserved and neither input
    is modified.
    """
    remaining = list(items)
    for item in to_cancel:
        if item in remaining:
            remaining.remove(item)
    return remaining


def intersection_added_instances(row):
    """Return the CWE occurrences of this version not matched by the previous version.

    Duplicates count: if 'CWE' holds a value three times and
    'CWE_PreviousPostBlockVersion' holds it once, two occurrences are returned.
    Despite the name, this is a multiset difference, not an intersection.

    Args:
        row: Mapping-like object with iterables under the keys 'CWE' and
            'CWE_PreviousPostBlockVersion' (an empty string counts as empty).

    Returns:
        A new list with the unmatched entries of ``row['CWE']``.
    """
    return _multiset_difference(row['CWE'], row['CWE_PreviousPostBlockVersion'])


def intersection_removed_instances(row):
    """Return the CWE occurrences of the previous version not matched by this version.

    Mirror image of ``intersection_added_instances``: the roles of 'CWE' and
    'CWE_PreviousPostBlockVersion' are swapped.

    Args:
        row: Mapping-like object with iterables under the keys 'CWE' and
            'CWE_PreviousPostBlockVersion' (an empty string counts as empty).

    Returns:
        A new list with the unmatched entries of
        ``row['CWE_PreviousPostBlockVersion']``.
    """
    return _multiset_difference(row['CWE_PreviousPostBlockVersion'], row['CWE'])

In [3]:
# Load the exported user records; the path is relative to the working directory.
user_data_frame = pd.read_csv('data/database_export/user_records_2022_2_13.csv')

In [4]:
# Treat the value 0 in the CWE column as missing (None), so that the later
# dropna() discards it.
# The dict form is used on purpose: with some pandas versions `replace(0, None)`
# is interpreted as "no replacement value given" and fills from the previous row
# instead of inserting None.
user_data_frame['CWE'] = user_data_frame['CWE'].replace({0: None})

In [5]:
# Inspect the loaded records after the CWE clean-up.
user_data_frame

Unnamed: 0,UserId,Reputation,PostHistoryId,PostHistoryDate,PostBlockVersionId,RootPostBlockVersionId,CWE
0,246878,4475,8387438,2010-10-21 19:39:30,186554969,186554969,
1,524368,160538,17372069,2011-10-19 15:27:09,232890827,232890827,
2,446335,984,9256355,2010-12-08 15:08:47,191405918,191405918,
3,1223007,5536,28007196,2012-08-15 09:28:24,161687620,161687620,
4,3146,132983,3022798,2009-10-05 23:16:48,156415093,156415093,
...,...,...,...,...,...,...,...
877630,15168,742048,260848772,2021-12-28 06:50:14,253230485,253230464,
877631,4688321,4458,260848868,2021-12-28 06:52:22,253231779,253231779,
877632,7650887,37,260849269,2021-12-28 07:02:25,253232429,253232429,
877633,1625187,25860,260850082,2021-12-28 07:24:37,253233792,253233792,


In [6]:
# Sanity check: rows that share a PostHistoryId must also share a single
# PostHistoryDate (exactly one distinct date per PostHistoryId).
# If the data is consistent the assertion passes silently; otherwise it raises
# AssertionError.
assert user_data_frame.groupby('PostHistoryId')['PostHistoryDate'].nunique().eq(1).all()

In [7]:
# Same consistency check keyed on PostBlockVersionId: every PostBlockVersionId
# must map to exactly one distinct PostHistoryDate.
assert user_data_frame.groupby('PostBlockVersionId')['PostHistoryDate'].nunique().eq(1).all()

In [8]:
# Collapse all rows that share the same identifying columns into a single row
# whose 'CWE' entry is the list of that group's non-null CWE values (an empty
# list when the group has none).
# Author's rationale: UserId and PostBlockVersionId are in a 1:1 relationship,
# so including both in the grouping keys does not split groups.
user_data_frame = (
    user_data_frame
    .groupby(['RootPostBlockVersionId', 'PostBlockVersionId', 'UserId', 'PostHistoryId', 'PostHistoryDate'])['CWE']
    .agg(lambda cwe_values: list(cwe_values.dropna()))
    .reset_index()
)

In [9]:
# Inspect the aggregated frame: 'CWE' now holds one list per row.
user_data_frame

Unnamed: 0,RootPostBlockVersionId,PostBlockVersionId,UserId,PostHistoryId,PostHistoryDate,CWE
0,138299634,138299634,335858,154260621,2017-08-18 15:40:39,[]
1,138300281,138300281,296460,55921656,2014-01-18 07:14:14,[]
2,138303943,138303943,5389107,102310295,2015-10-21 08:50:27,[]
3,138304597,138304597,1619294,55923019,2014-01-18 08:01:30,[]
4,138304847,138304847,5439654,102309849,2015-10-21 08:44:49,[]
...,...,...,...,...,...,...
498268,276590211,276590211,134554,154258244,2017-08-18 15:04:45,[]
498269,276590215,276590215,134554,154258244,2017-08-18 15:04:45,[]
498270,276590219,276590219,134554,154258244,2017-08-18 15:04:45,[]
498271,276590221,276590221,134554,154258244,2017-08-18 15:04:45,[]


In [10]:
# Sort by RootPostBlockVersionId and PostBlockVersionId so that the versions of
# one root block are adjacent and in ascending PostBlockVersionId order.
# NOTE(review): the shift(1) in the next cell treats the preceding row of the
# same root as the "previous version" — this assumes PostBlockVersionId
# increases with each successive version; confirm against the data source.
user_data_frame = user_data_frame.sort_values(by=['RootPostBlockVersionId', 'PostBlockVersionId'])

In [11]:
# For each row, take the CWE list of the preceding row within the same
# RootPostBlockVersionId group (in current row order). The first row of each
# group has no predecessor and receives a missing value.
user_data_frame['CWE_PreviousPostBlockVersion'] = user_data_frame.groupby('RootPostBlockVersionId')['CWE'].shift(1)

In [12]:
# Rows without a previous version hold a missing value after the shift; replace
# it with '' so the helper functions can iterate it as an empty sequence.
user_data_frame['CWE_PreviousPostBlockVersion'] = user_data_frame['CWE_PreviousPostBlockVersion'].fillna('')

In [13]:
# Inspect the frame with the previous version's CWE list attached.
user_data_frame

Unnamed: 0,RootPostBlockVersionId,PostBlockVersionId,UserId,PostHistoryId,PostHistoryDate,CWE,CWE_PreviousPostBlockVersion
0,138299634,138299634,335858,154260621,2017-08-18 15:40:39,[],
1,138300281,138300281,296460,55921656,2014-01-18 07:14:14,[],
2,138303943,138303943,5389107,102310295,2015-10-21 08:50:27,[],
3,138304597,138304597,1619294,55923019,2014-01-18 08:01:30,[],
4,138304847,138304847,5439654,102309849,2015-10-21 08:44:49,[],
...,...,...,...,...,...,...,...
498268,276590211,276590211,134554,154258244,2017-08-18 15:04:45,[],
498269,276590215,276590215,134554,154258244,2017-08-18 15:04:45,[],
498270,276590219,276590219,134554,154258244,2017-08-18 15:04:45,[],
498271,276590221,276590221,134554,154258244,2017-08-18 15:04:45,[],


In [14]:
# Per row: the CWE occurrences of this version that are not matched by an
# occurrence in the previous version (duplicates are counted individually).
user_data_frame['CWE_Instances_Added'] = user_data_frame.apply(intersection_added_instances, axis=1)
# The corresponding "removed instances" column is currently disabled.
# user_data_frame['CWE_Instances_Removed'] = user_data_frame.apply(intersection_removed_instances, axis=1)

In [15]:
# The helper returns an (added, removed) pair per row; result_type="expand"
# spreads that pair over the two target columns.
user_data_frame[['CWE_Types_Added', 'CWE_Types_Removed']] = user_data_frame.apply(
    intersection_added_removed_types,
    axis=1,
    result_type="expand",
)
# The removed types are not kept for now, so that column is dropped again.
user_data_frame = user_data_frame.drop(columns='CWE_Types_Removed')

In [16]:
# The previous-version column was only needed to compute the "added" columns
# above; remove it from the frame.
user_data_frame = user_data_frame.drop(columns=['CWE_PreviousPostBlockVersion'])

In [17]:
# Derive a count column from each list-valued column (length of the list per row).
user_data_frame['Num_CWE_Types_Added'] = user_data_frame['CWE_Types_Added'].apply(len)
user_data_frame['Num_CWE_Instances_Added'] = user_data_frame['CWE_Instances_Added'].apply(len)
user_data_frame['Num_CWE'] = user_data_frame['CWE'].apply(len)

In [18]:
# Inspect the final frame with the list columns and their counts.
user_data_frame

Unnamed: 0,RootPostBlockVersionId,PostBlockVersionId,UserId,PostHistoryId,PostHistoryDate,CWE,CWE_Instances_Added,CWE_Types_Added,Num_CWE_Types_Added,Num_CWE_Instances_Added,Num_CWE
0,138299634,138299634,335858,154260621,2017-08-18 15:40:39,[],[],[],0,0,0
1,138300281,138300281,296460,55921656,2014-01-18 07:14:14,[],[],[],0,0,0
2,138303943,138303943,5389107,102310295,2015-10-21 08:50:27,[],[],[],0,0,0
3,138304597,138304597,1619294,55923019,2014-01-18 08:01:30,[],[],[],0,0,0
4,138304847,138304847,5439654,102309849,2015-10-21 08:44:49,[],[],[],0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
498268,276590211,276590211,134554,154258244,2017-08-18 15:04:45,[],[],[],0,0,0
498269,276590215,276590215,134554,154258244,2017-08-18 15:04:45,[],[],[],0,0,0
498270,276590219,276590219,134554,154258244,2017-08-18 15:04:45,[],[],[],0,0,0
498271,276590221,276590221,134554,154258244,2017-08-18 15:04:45,[],[],[],0,0,0


In [19]:
# Write the final frame to a Feather file (path relative to the working directory).
user_data_frame.to_feather('data/feather_files/UserCWEData.feather')