In [2]:
import pandas as pd
from glob import glob as g
from pathlib import Path
import re

In [3]:
def generate_uuid(row):
    try:
        uuid =  '_'.join(map(str, [row['date'].replace('-', '_'),
                row['timestamp'].replace(':', '_'),
                row['user'],
                str(row['file_ind'])]))
    except Exception as e:
        print(row['file_ind'])
        raise e
    return uuid

In [4]:
def fill_missing_annots(df):
    child_set = set(df['child'].tolist())
    temp_df = df[~df['parent'].isin(child_set)].copy()
    temp_df['child'] = temp_df['parent']
    temp_df = temp_df.drop_duplicates()
    return temp_df

In [10]:
# Set patterns
timestamp_pattern = "[[](\d\d:\d\d)[]]"
user_pattern = "[[]\d\d:\d\d[]]\s[<](.*?)[>]"
message_pattern = ".*[<].*[>](.*)"
date_pattern = "(\d\d\d\d-\d\d-\d\d)"
# Get files

# Train
filepath = '../data/'
subfolders = ["train", "test", "dev"]

raw_df_dict = {}
annot_dict = {}
for subfolder in subfolders:
    filelist_ascii = g(filepath + subfolder + '/*ascii.txt')
    #filelist_annot = g(filepath + subfolder + '/*annotation.txt')
    for filename in filelist_ascii:
        filename_annot = filename.replace('ascii', 'annotation')
        data = pd.read_csv(filename, header = None, names = ['raw'], delimiter="\t")
        data['timestamp'] = data['raw'].str.extract(timestamp_pattern, expand=True)
        data.loc[data['timestamp'].isnull(), 'timestamp'] = "System"
        data['user'] = data['raw'].str.extract(user_pattern, expand=True)
        data.loc[data['user'].isnull(), 'user'] = "System"
        data['message'] = data['raw'].str.extract(message_pattern, expand=True)
        data['file_ind'] = data.index.values
        data['file_ind'] = data['file_ind'].astype(int)
        #data['date'] = Path(filename).stem.split('_')[0]
        temp_date = re.search(date_pattern, filename).group(1)
        data['date'] = temp_date
        data['uuid'] = data.apply(lambda row: generate_uuid(row), axis=1)

        raw_df_dict[temp_date] = data.copy()

        annot_df = pd.read_csv(filename_annot, index_col=False, header=None, names=['parent', 'child'], delimiter="\s")
        annot_df = pd.concat([annot_df, fill_missing_annots(annot_df)]).drop_duplicates()
        annot_dict[temp_date] = annot_df
        merged_data = pd.merge(data, annot_df, left_on='file_ind', right_on='child', how='left')

        #merged_data['parent_uuid'] = pd.merge(merged_data[['child', 'parent', 'uuid']], merged_data[['file_ind', 'child', 'parent', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r'])['uuid_r']
        merged_data = pd.merge(merged_data, merged_data[['file_ind', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['', '_parent'])
        #merged_data['parent_uuid'] = pd.merge(merged_data[['child', 'parent', 'uuid']], merged_data[['file_ind', 'child', 'parent', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r'])['uuid_r']

        merged_data.to_csv(filename + "_annot.csv")



In [13]:
#data[data['timestamp']=="System"]['timestamp']

In [14]:
merged_data.columns.values

array(['raw', 'timestamp', 'user', 'message', 'file_ind', 'date', 'uuid',
       'parent', 'child', 'file_ind_parent', 'uuid_parent'], dtype=object)

In [15]:
# Train
filepath = '../data/train'
csvs_train = pd.DataFrame(g(filepath + '/*annot.csv'), columns=["path"])
#csvs_train["date"] = csvs_train["path"].str.extract(date_pattern, expand=True)

# Test
filepath = '../data/test'
csvs_test = pd.DataFrame(g(filepath + '/*annot.csv'), columns=["path"])
#csvs_test["date"] = csvs_test["path"].str.extract(date_pattern, expand=True)


# Dev
filepath = '../data/dev'
csvs_dev = pd.DataFrame(g(filepath + '/*annot.csv'), columns=["path"])
#csvs_dev["date"] = csvs_dev["path"].str.extract(date_pattern, expand=True)


In [16]:
#Aggregate files
rootfile = '../'

agg_train = pd.DataFrame()
for i in range(len(csvs_train.path)):
    data = pd.read_csv(csvs_train.path[i], header = 0, index_col=0)
    # data['date'] = pd.Series([csvs_train.date[i] for x in range(len(data))])
    agg_train = agg_train.append(data)
agg_train.to_csv(rootfile + "agg_train.csv")

agg_test = pd.DataFrame()
for i in range(len(csvs_test.path)):
    data = pd.read_csv(csvs_test.path[i], header = 0, index_col=0)
    # data['date'] = pd.Series([csvs_test.date[i] for x in range(len(data))])
    agg_test = agg_test.append(data)
agg_test.to_csv(rootfile + "agg_test.csv")

agg_dev = pd.DataFrame()
for i in range(len(csvs_dev.path)):
    data = pd.read_csv(csvs_dev.path[i], header = 0, index_col=0)
    # data['date'] = pd.Series([csvs_dev.date[i] for x in range(len(data))])
    agg_dev = agg_dev.append(data)
agg_dev.to_csv(rootfile + "agg_dev.csv")

In [17]:
#agg_dev
# Remove command messages
agg_dict = {
    'dev': agg_dev,
    'train': agg_train,
    'test': agg_test
}

for key in agg_dict.keys():
    agg_dict[key] = agg_dict[key][agg_dict[key]['timestamp'].notnull()]

In [18]:
agg_dict['dev']

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child,file_ind_parent,uuid_parent
0,"[12:18] <|trey|> usual, quite stable though :)",12:18,|trey|,"usual, quite stable though :)",0,2004-11-15,2004_11_15_12_18_|trey|_0,,,,
1,[12:18] <tweaked> HrdwrBoB: ok how many partit...,12:18,tweaked,HrdwrBoB: ok how many partitions should i make?,1,2004-11-15,2004_11_15_12_18_tweaked_1,,,,
2,"[12:18] <Matt|> |trey|, top in the list --> ub...",12:18,Matt|,ubuntu servers,2,2004-11-15,2004_11_15_12_18_Matt|_2,,,,
3,[12:18] <usual> a few libs and media,12:18,usual,a few libs and media,3,2004-11-15,2004_11_15_12_18_usual_3,,,,
4,[12:18] <usual> maybe some others,12:18,usual,maybe some others,4,2004-11-15,2004_11_15_12_18_usual_4,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1259,[21:57] <zacky83> who can help me on this,21:57,zacky83,who can help me on this,1245,2016-12-19,2016_12_19_21_57_zacky83_1245,1244.0,1245.0,1244.0,2016_12_19_21_57_zacky83_1244
1260,"[21:57] <Mccallum1983> can anyone assist, when...",21:57,Mccallum1983,"can anyone assist, when i try to install bitc...",1246,2016-12-19,2016_12_19_21_57_Mccallum1983_1246,1246.0,1246.0,1246.0,2016_12_19_21_57_Mccallum1983_1246
1261,[21:57] <figure002> OerHeks: still makes no se...,21:57,figure002,OerHeks: still makes no sense to me why a dae...,1247,2016-12-19,2016_12_19_21_57_figure002_1247,1242.0,1247.0,1242.0,2016_12_19_21_56_OerHeks_1242
1262,[21:58] <figure002> zacky83: did you enable th...,21:58,figure002,zacky83: did you enable the jails?,1248,2016-12-19,2016_12_19_21_58_figure002_1248,1244.0,1248.0,1244.0,2016_12_19_21_57_zacky83_1244


In [19]:
# for key in agg_dict.keys():
#     agg_dict[key]['uuid'] = agg_dict[key].apply(lambda row: generate_uuid(row), axis=1)


In [20]:
agg_dict['dev'][agg_dict['dev']['uuid'] == "2004_11_15_01_35_djtansey_685"]

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child,file_ind_parent,uuid_parent
685,[01:35] <djtansey> i have a problem re: k3b an...,01:35,djtansey,i have a problem re: k3b and am looking for s...,685,2004-11-15,2004_11_15_01_35_djtansey_685,685.0,685.0,685.0,2004_11_15_01_35_djtansey_685


In [21]:
for key in agg_dict.keys():
    agg_dict[key].to_csv(rootfile + f"data/cleaned/agg_{key}.csv")

In [206]:
data = raw_df_dict['2005-07-06']
annot_df = annot_dict['2005-07-06']
annot_df = pd.concat([annot_df, fill_missing_annots(annot_df)], ignore_index=True).drop_duplicates()
merged_data = pd.merge(data, annot_df, left_on='file_ind', right_on='child', how='left')

#merged_data['parent_uuid'] = pd.merge(merged_data[['child', 'parent', 'uuid']], merged_data[['file_ind', 'child', 'parent', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r'])['uuid_r']
merged_data = pd.merge(merged_data, merged_data[['file_ind', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['', '_parent'])
#merged_data.to_csv(filename + "_annot.csv")

In [203]:
temp = pd.merge(merged_data[['parent']], merged_data[['file_ind', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r'])
temp

Unnamed: 0,parent,file_ind,uuid
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
1205,1493.0,1493.0,2005_07_06_03_35_Nige_1493
1206,1495.0,1495.0,2005_07_06_03_35_Nige_1495
1207,1495.0,1495.0,2005_07_06_03_35_Nige_1495
1208,1498.0,1498.0,2005_07_06_03_35_FLD_1498


In [204]:
merged_data = pd.merge(merged_data, merged_data[['file_ind', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['', '_parent'])

In [207]:
merged_data

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child,file_ind_parent,uuid_parent
0,"[11:11] <Seveas> Amaranth, the US peer is prob...",11:11,Seveas,"Amaranth, the US peer is probably breaking th...",0,2005-07-06,2005_07_06_11_11_Seveas_0,,,,
1,[11:11] <Seveas> but an ES or NL peer download...,11:11,Seveas,but an ES or NL peer downloading it not,1,2005-07-06,2005_07_06_11_11_Seveas_1,,,,
2,[11:11] <monchichi> http://www.msnbc.msn.com/i...,11:11,monchichi,http://www.msnbc.msn.com/id/8419601/,2,2005-07-06,2005_07_06_11_11_monchichi_2,,,,
3,[11:11] <IceDC571> the US peer is just stupid ...,11:11,IceDC571,the US peer is just stupid for wanting to sha...,3,2005-07-06,2005_07_06_11_11_IceDC571_3,,,,
4,[11:11] <Seveas> The downloader is breaking th...,11:11,Seveas,The downloader is breaking the implicit rules...,4,2005-07-06,2005_07_06_11_11_Seveas_4,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1205,[03:35] <Nige> i am stuck with wireless networ...,03:35,Nige,i am stuck with wireless networking,1495,2005-07-06,2005_07_06_03_35_Nige_1495,1493.0,1495.0,1493.0,2005_07_06_03_35_Nige_1493
1206,[03:35] <Nige> :(,03:35,Nige,:(,1496,2005-07-06,2005_07_06_03_35_Nige_1496,1495.0,1496.0,1495.0,2005_07_06_03_35_Nige_1495
1207,[03:35] <Nige> and its driving me crazy!!!,03:35,Nige,and its driving me crazy!!!,1497,2005-07-06,2005_07_06_03_35_Nige_1497,1495.0,1497.0,1495.0,2005_07_06_03_35_Nige_1495
1208,[03:35] <FLD> does anybody know how to get dsn...,03:35,FLD,does anybody know how to get dsniff to work :<,1498,2005-07-06,2005_07_06_03_35_FLD_1498,1498.0,1498.0,1498.0,2005_07_06_03_35_FLD_1498


In [157]:
annot_df

Unnamed: 0,parent,child
0,993,1000
1,995,1000
2,1001,1001
3,1000,1002
4,998,1003
...,...,...
504,1498,1498
505,1494,1499
506,993,993
507,995,995


In [163]:
annot_df[annot_df['child']==1262]

Unnamed: 0,parent,child
265,1254,1262


In [164]:
data[data['file_ind'] == 1262]

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid
1262,"[02:59] <Vjaz> delire, Yeah, I'm no stranger t...",02:59,Vjaz,"delire, Yeah, I'm no stranger to compiling my...",1262,2005-07-06,2005_07_06_02_59_Vjaz_1262


In [188]:
merged_data = pd.merge(data, annot_df, left_on='file_ind', right_on='child', how='left')
merged_data

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child
0,"[11:11] <Seveas> Amaranth, the US peer is prob...",11:11,Seveas,"Amaranth, the US peer is probably breaking th...",0,2005-07-06,2005_07_06_11_11_Seveas_0,,
1,[11:11] <Seveas> but an ES or NL peer download...,11:11,Seveas,but an ES or NL peer downloading it not,1,2005-07-06,2005_07_06_11_11_Seveas_1,,
2,[11:11] <monchichi> http://www.msnbc.msn.com/i...,11:11,monchichi,http://www.msnbc.msn.com/id/8419601/,2,2005-07-06,2005_07_06_11_11_monchichi_2,,
3,[11:11] <IceDC571> the US peer is just stupid ...,11:11,IceDC571,the US peer is just stupid for wanting to sha...,3,2005-07-06,2005_07_06_11_11_IceDC571_3,,
4,[11:11] <Seveas> The downloader is breaking th...,11:11,Seveas,The downloader is breaking the implicit rules...,4,2005-07-06,2005_07_06_11_11_Seveas_4,,
...,...,...,...,...,...,...,...,...,...
1201,[03:35] <Nige> i am stuck with wireless networ...,03:35,Nige,i am stuck with wireless networking,1495,2005-07-06,2005_07_06_03_35_Nige_1495,1493.0,1495.0
1202,[03:35] <Nige> :(,03:35,Nige,:(,1496,2005-07-06,2005_07_06_03_35_Nige_1496,1495.0,1496.0
1203,[03:35] <Nige> and its driving me crazy!!!,03:35,Nige,and its driving me crazy!!!,1497,2005-07-06,2005_07_06_03_35_Nige_1497,1495.0,1497.0
1204,[03:35] <FLD> does anybody know how to get dsn...,03:35,FLD,does anybody know how to get dsniff to work :<,1498,2005-07-06,2005_07_06_03_35_FLD_1498,1498.0,1498.0


In [168]:
merged_data['parent_uuid'] = pd.merge(merged_data[['child', 'parent', 'uuid']], merged_data[['file_ind', 'child', 'parent', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r'])['uuid_r']

In [185]:
merged_data

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent_x,child_x,parent_uuid,parent_y,child_y
0,[04:14] <Gobbert> ziggi: what do you need help...,04:14,Gobbert,ziggi: what do you need help with?,0,2016-12-19,2016_12_19_04_14_Gobbert_0,,,,,
1,[04:14] <ziggi> i am,04:14,ziggi,i am,1,2016-12-19,2016_12_19_04_14_ziggi_1,,,,,
2,[04:15] <joshua__> boot speed was very slow,04:15,joshua__,boot speed was very slow,2,2016-12-19,2016_12_19_04_15_joshua___2,,,,,
3,[04:15] <joshua__> 2 min to boot,04:15,joshua__,2 min to boot,3,2016-12-19,2016_12_19_04_15_joshua___3,,,,,
4,[04:15] <joshua__> but windows machine was ver...,04:15,joshua__,but windows machine was very fast,4,2016-12-19,2016_12_19_04_15_joshua___4,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1205,[21:57] <zacky83> who can help me on this,21:57,zacky83,who can help me on this,1245,2016-12-19,2016_12_19_21_57_zacky83_1245,1244.0,1245.0,2016_12_19_21_55_zh19970205_1239,1244.0,1245.0
1206,"[21:57] <Mccallum1983> can anyone assist, when...",21:57,Mccallum1983,"can anyone assist, when i try to install bitc...",1246,2016-12-19,2016_12_19_21_57_Mccallum1983_1246,1246.0,1246.0,2016_12_19_21_55_zh19970205_1239,1246.0,1246.0
1207,[21:57] <figure002> OerHeks: still makes no se...,21:57,figure002,OerHeks: still makes no sense to me why a dae...,1247,2016-12-19,2016_12_19_21_57_figure002_1247,1242.0,1247.0,2016_12_19_21_55_zh19970205_1240,1242.0,1247.0
1208,[21:58] <figure002> zacky83: did you enable th...,21:58,figure002,zacky83: did you enable the jails?,1248,2016-12-19,2016_12_19_21_58_figure002_1248,1244.0,1248.0,2016_12_19_21_52_figure002_1231,1244.0,1248.0


In [189]:
merged_data[merged_data['child'] == 1262]

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child
1019,"[02:59] <Vjaz> delire, Yeah, I'm no stranger t...",02:59,Vjaz,"delire, Yeah, I'm no stranger to compiling my...",1262,2005-07-06,2005_07_06_02_59_Vjaz_1262,1254.0,1262.0


In [190]:
merged_data[merged_data['parent'] == 1262]

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child
1022,[03:00] <delire> Vjaz: it is a little. i think...,03:00,delire,Vjaz: it is a little. i think the best approa...,1265,2005-07-06,2005_07_06_03_00_delire_1265,1262.0,1265.0


In [197]:
merged_data

Unnamed: 0,raw,timestamp,user,message,file_ind,date,uuid,parent,child
0,"[11:11] <Seveas> Amaranth, the US peer is prob...",11:11,Seveas,"Amaranth, the US peer is probably breaking th...",0,2005-07-06,2005_07_06_11_11_Seveas_0,,
1,[11:11] <Seveas> but an ES or NL peer download...,11:11,Seveas,but an ES or NL peer downloading it not,1,2005-07-06,2005_07_06_11_11_Seveas_1,,
2,[11:11] <monchichi> http://www.msnbc.msn.com/i...,11:11,monchichi,http://www.msnbc.msn.com/id/8419601/,2,2005-07-06,2005_07_06_11_11_monchichi_2,,
3,[11:11] <IceDC571> the US peer is just stupid ...,11:11,IceDC571,the US peer is just stupid for wanting to sha...,3,2005-07-06,2005_07_06_11_11_IceDC571_3,,
4,[11:11] <Seveas> The downloader is breaking th...,11:11,Seveas,The downloader is breaking the implicit rules...,4,2005-07-06,2005_07_06_11_11_Seveas_4,,
...,...,...,...,...,...,...,...,...,...
1201,[03:35] <Nige> i am stuck with wireless networ...,03:35,Nige,i am stuck with wireless networking,1495,2005-07-06,2005_07_06_03_35_Nige_1495,1493.0,1495.0
1202,[03:35] <Nige> :(,03:35,Nige,:(,1496,2005-07-06,2005_07_06_03_35_Nige_1496,1495.0,1496.0
1203,[03:35] <Nige> and its driving me crazy!!!,03:35,Nige,and its driving me crazy!!!,1497,2005-07-06,2005_07_06_03_35_Nige_1497,1495.0,1497.0
1204,[03:35] <FLD> does anybody know how to get dsn...,03:35,FLD,does anybody know how to get dsniff to work :<,1498,2005-07-06,2005_07_06_03_35_FLD_1498,1498.0,1498.0


In [192]:
temp = pd.merge(merged_data[['child', 'parent', 'uuid']], merged_data[['file_ind', 'child', 'parent', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r']).copy()

In [194]:
temp[temp['child_l'] == 1262]

Unnamed: 0,child_l,parent_l,uuid_l,file_ind,child_r,parent_r,uuid_r
1020,1262.0,1254.0,2005_07_06_02_59_Vjaz_1262,1254.0,1254.0,1252.0,2005_07_06_02_58_delire_1254


In [198]:
merged_data['parent_uuid'] = pd.merge(merged_data[['child', 'parent', 'uuid']], merged_data[['file_ind', 'child', 'parent', 'uuid']], left_on='parent', right_on='file_ind', how='left', suffixes=['_l', '_r'])['uuid_r']

In [201]:
merged_data[['file_ind', 'date', 'uuid', 'parent', 'parent_uuid']]

Unnamed: 0,file_ind,date,uuid,parent,parent_uuid
0,0,2005-07-06,2005_07_06_11_11_Seveas_0,,
1,1,2005-07-06,2005_07_06_11_11_Seveas_1,,
2,2,2005-07-06,2005_07_06_11_11_monchichi_2,,
3,3,2005-07-06,2005_07_06_11_11_IceDC571_3,,
4,4,2005-07-06,2005_07_06_11_11_Seveas_4,,
...,...,...,...,...,...
1201,1495,2005-07-06,2005_07_06_03_35_Nige_1495,1493.0,2005_07_06_03_35_mindmedic_1490
1202,1496,2005-07-06,2005_07_06_03_35_Nige_1496,1495.0,2005_07_06_03_35_fdr_1492
1203,1497,2005-07-06,2005_07_06_03_35_Nige_1497,1495.0,2005_07_06_03_35_delire_1489
1204,1498,2005-07-06,2005_07_06_03_35_FLD_1498,1498.0,2005_07_06_03_34_wizo_1487


In [202]:
temp

Unnamed: 0,child_l,parent_l,uuid_l,file_ind,child_r,parent_r,uuid_r
0,,,2005_07_06_11_11_Seveas_0,,,,
1,,,2005_07_06_11_11_Seveas_1,,,,
2,,,2005_07_06_11_11_monchichi_2,,,,
3,,,2005_07_06_11_11_IceDC571_3,,,,
4,,,2005_07_06_11_11_Seveas_4,,,,
...,...,...,...,...,...,...,...
1205,1495.0,1493.0,2005_07_06_03_35_Nige_1495,1493.0,1493.0,1489.0,2005_07_06_03_35_Nige_1493
1206,1496.0,1495.0,2005_07_06_03_35_Nige_1496,1495.0,1495.0,1493.0,2005_07_06_03_35_Nige_1495
1207,1497.0,1495.0,2005_07_06_03_35_Nige_1497,1495.0,1495.0,1493.0,2005_07_06_03_35_Nige_1495
1208,1498.0,1498.0,2005_07_06_03_35_FLD_1498,1498.0,1498.0,1498.0,2005_07_06_03_35_FLD_1498


In [139]:
raw_df_dict.keys()

dict_keys(['2004-12-25', '2005-02-06', '2005-02-08', '2005-02-27', '2005-05-14', '2005-05-19', '2005-06-06', '2005-06-12', '2005-06-16', '2005-06-20', '2005-07-25', '2005-07-29', '2005-09-26', '2005-10-07', '2005-10-12', '2005-12-03', '2005-12-04', '2005-12-16', '2005-12-23', '2006-01-02', '2006-01-12', '2006-02-20', '2006-02-24', '2006-02-28', '2006-03-05', '2006-05-02', '2006-05-15', '2006-05-27', '2006-05-29', '2006-06-01', '2006-06-05', '2006-06-08', '2006-06-21', '2006-06-28', '2006-07-01', '2006-08-06', '2006-08-11', '2006-08-13', '2006-08-15', '2006-08-23', '2006-09-13', '2006-09-24', '2006-11-01', '2006-12-06', '2006-12-10', '2006-12-20', '2007-01-12', '2007-01-19', '2007-01-21', '2007-01-29', '2007-02-06', '2007-02-07', '2007-02-15', '2007-06-01', '2007-06-04', '2007-06-17', '2007-07-03', '2007-08-19', '2007-08-22', '2007-08-24', '2007-09-07', '2007-10-24', '2007-12-17', '2008-01-02', '2008-01-03', '2008-02-07', '2008-02-14', '2008-03-01', '2008-04-20', '2008-04-27', '2008-04-

In [80]:
annot_df

Unnamed: 0,parent,child
0,999,1000
1,1001,1002
2,1001,1004
3,1007,1008
4,1007,1009
...,...,...
255,1044,1047
256,1084,1084
257,1172,1172
258,999,999
