In [2]:
import os
from glob import glob as g

import pandas as pd

In [46]:
# Set patterns
timestamp_pattern = "[[](\d\d:\d\d)[]]"
user_pattern = "[[]\d\d:\d\d[]]\s[<](.*?)[>]"
message_pattern = ".*[<].*[>](.*)"
# Get files

# Train
filepath = '../data/'
subfolders = ["train", "test", "dev"]

for subfolder in subfolders:
    filelist_ascii = g(filepath + subfolder + '/*ascii.txt')
    #filelist_annot = g(filepath + subfolder + '/*annotation.txt')
    for filename in filelist_ascii:
        filename_annot = filename.replace('ascii', 'annotation')
        data = pd.read_csv(filename, header = None, names = ['raw'], delimiter="\t")
        data['timestamp'] = data['raw'].str.extract(timestamp_pattern, expand=True)
        data['user'] = data['raw'].str.extract(user_pattern, expand=True)
        data['message'] = data['raw'].str.extract(message_pattern, expand=True)
        data['file_ind'] = data.index.values
        data['file_ind'] = data['file_ind']

        annot_df = pd.read_csv(filename_annot, index_col=False, header=None, names=['parent', 'child'], delimiter="\s")
        merged_data = pd.merge(data, annot_df, left_on='file_ind', right_on='child', how='left')
        merged_data.to_csv(filename + "_annot.csv")
        #data.to_csv(filename + ".csv")



In [47]:
date_pattern = "(\d\d\d\d-\d\d-\d\d)"

# Train
filepath = '../data/train'
csvs_train = pd.DataFrame(g(filepath + '/*annot.csv'), columns=["path"])
csvs_train["date"] = csvs_train["path"].str.extract(date_pattern, expand=True)

# Test
filepath = '../data/test'
csvs_test = pd.DataFrame(g(filepath + '/*annot.csv'), columns=["path"])
csvs_test["date"] = csvs_test["path"].str.extract(date_pattern, expand=True)


# Dev
filepath = '../data/dev'
csvs_dev = pd.DataFrame(g(filepath + '/*annot.csv'), columns=["path"])
csvs_dev["date"] = csvs_dev["path"].str.extract(date_pattern, expand=True)


In [48]:
#Aggregate files
rootfile = '../'

agg_train = pd.DataFrame()
for i in range(len(csvs_train.path)):
    data = pd.read_csv(csvs_train.path[i], header = 0, index_col=0)
    data['date'] = pd.Series([csvs_train.date[i] for x in range(len(data))])
    agg_train = agg_train.append(data)
agg_train.to_csv(rootfile + "agg_train.csv")

agg_test = pd.DataFrame()
for i in range(len(csvs_test.path)):
    data = pd.read_csv(csvs_test.path[i], header = 0, index_col=0)
    data['date'] = pd.Series([csvs_test.date[i] for x in range(len(data))])
    agg_test = agg_test.append(data)
agg_test.to_csv(rootfile + "agg_test.csv")

agg_dev = pd.DataFrame()
for i in range(len(csvs_dev.path)):
    data = pd.read_csv(csvs_dev.path[i], header = 0, index_col=0)
    data['date'] = pd.Series([csvs_dev.date[i] for x in range(len(data))])
    agg_dev = agg_dev.append(data)
agg_dev.to_csv(rootfile + "agg_dev.csv")

In [49]:
agg_dev
# Remove command messages
agg_dict = {
    'dev': agg_dev,
    'train': agg_train,
    'test': agg_test
}

for key in agg_dict.keys():
    agg_dict[key] = agg_dict[key][agg_dict[key]['timestamp'].notnull()]

In [50]:
# Display the dev split stored in agg_dict.
agg_dict['dev']

Unnamed: 0,raw,timestamp,user,message,file_ind,parent,child,date
0,"[12:18] <|trey|> usual, quite stable though :)",12:18,|trey|,"usual, quite stable though :)",0,,,2004-11-15
1,[12:18] <tweaked> HrdwrBoB: ok how many partit...,12:18,tweaked,HrdwrBoB: ok how many partitions should i make?,1,,,2004-11-15
2,"[12:18] <Matt|> |trey|, top in the list --> ub...",12:18,Matt|,ubuntu servers,2,,,2004-11-15
3,[12:18] <usual> a few libs and media,12:18,usual,a few libs and media,3,,,2004-11-15
4,[12:18] <usual> maybe some others,12:18,usual,maybe some others,4,,,2004-11-15
...,...,...,...,...,...,...,...,...
1253,[21:57] <zacky83> who can help me on this,21:57,zacky83,who can help me on this,1245,1244.0,1245.0,2016-12-19
1254,"[21:57] <Mccallum1983> can anyone assist, when...",21:57,Mccallum1983,"can anyone assist, when i try to install bitc...",1246,1246.0,1246.0,2016-12-19
1255,[21:57] <figure002> OerHeks: still makes no se...,21:57,figure002,OerHeks: still makes no sense to me why a dae...,1247,1242.0,1247.0,2016-12-19
1256,[21:58] <figure002> zacky83: did you enable th...,21:58,figure002,zacky83: did you enable the jails?,1248,1244.0,1248.0,2016-12-19


In [51]:
def generate_uuid(row):
    """Build an identifier string for one message row.

    The identifier joins, with underscores: the row's 'date' with '-' replaced
    by '_', its 'timestamp' with ':' replaced by '_', its 'user', and its
    'file_ind', e.g. '2004_11_15_12_18_|trey|_0'.

    Parameters:
        row: mapping-like object supporting row['date'], row['timestamp'],
            row['user'] and row['file_ind'].

    Returns:
        The identifier as a str.

    Raises:
        Any exception hit while building the identifier is re-raised after
        printing the row's 'file_ind'.
    """
    try:
        date_part = row['date'].replace('-', '_')
        time_part = row['timestamp'].replace(':', '_')
        pieces = (date_part, time_part, row['user'], row['file_ind'])
        uuid = '_'.join(str(piece) for piece in pieces)
    except Exception:
        # Report which row failed before propagating the error.
        print(row['file_ind'])
        raise
    return uuid


In [52]:
# Add a 'uuid' column to every split.  .assign returns a new frame instead of
# writing a column into a filtered slice, which is what made pandas emit
# SettingWithCopyWarning; generate_uuid is passed directly (no lambda needed).
for key in list(agg_dict):
    agg_dict[key] = agg_dict[key].assign(
        uuid=agg_dict[key].apply(generate_uuid, axis=1)
    )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [53]:
# Display the dev split again to inspect the 'uuid' column.
agg_dict['dev']

Unnamed: 0,raw,timestamp,user,message,file_ind,parent,child,date,uuid
0,"[12:18] <|trey|> usual, quite stable though :)",12:18,|trey|,"usual, quite stable though :)",0,,,2004-11-15,2004_11_15_12_18_|trey|_0
1,[12:18] <tweaked> HrdwrBoB: ok how many partit...,12:18,tweaked,HrdwrBoB: ok how many partitions should i make?,1,,,2004-11-15,2004_11_15_12_18_tweaked_1
2,"[12:18] <Matt|> |trey|, top in the list --> ub...",12:18,Matt|,ubuntu servers,2,,,2004-11-15,2004_11_15_12_18_Matt|_2
3,[12:18] <usual> a few libs and media,12:18,usual,a few libs and media,3,,,2004-11-15,2004_11_15_12_18_usual_3
4,[12:18] <usual> maybe some others,12:18,usual,maybe some others,4,,,2004-11-15,2004_11_15_12_18_usual_4
...,...,...,...,...,...,...,...,...,...
1253,[21:57] <zacky83> who can help me on this,21:57,zacky83,who can help me on this,1245,1244.0,1245.0,2016-12-19,2016_12_19_21_57_zacky83_1245
1254,"[21:57] <Mccallum1983> can anyone assist, when...",21:57,Mccallum1983,"can anyone assist, when i try to install bitc...",1246,1246.0,1246.0,2016-12-19,2016_12_19_21_57_Mccallum1983_1246
1255,[21:57] <figure002> OerHeks: still makes no se...,21:57,figure002,OerHeks: still makes no sense to me why a dae...,1247,1242.0,1247.0,2016-12-19,2016_12_19_21_57_figure002_1247
1256,[21:58] <figure002> zacky83: did you enable th...,21:58,figure002,zacky83: did you enable the jails?,1248,1244.0,1248.0,2016-12-19,2016_12_19_21_58_figure002_1248


In [56]:
# Write each cleaned split to <rootfile>data/cleaned/agg_<split>.csv.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first.
cleaned_dir = rootfile + "data/cleaned/"
os.makedirs(cleaned_dir, exist_ok=True)
for key, frame in agg_dict.items():
    frame.to_csv(cleaned_dir + f"agg_{key}.csv")