In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
import re
from collections import Counter
from glob import glob as g

%matplotlib inline

### Create Individual CSVs

In [62]:
# Set patterns
timestamp_pattern = "[[](\d\d:\d\d)[]]"
user_pattern = "[[]\d\d:\d\d[]]\s[<](.*?)[>]"
message_pattern = ".*[<].*[>](.*)"

In [63]:
# Get files

# Train
filepath = '../data/'
subfolders = ["train", "test", "dev"]

for subfolder in subfolders:
    filelist_ascii = g(filepath + subfolder + '/*ascii.txt')
    for filename in filelist_ascii:
        data = pd.read_csv(filename, header = None, names = ['raw'], delimiter="\t")
        data['timestamp'] = data['raw'].str.extract(timestamp_pattern, expand=True)
        data['user'] = data['raw'].str.extract(user_pattern, expand=True)
        data['message'] = data['raw'].str.extract(message_pattern, expand=True)
        data['file_ind'] = data.index.values
        data['file_ind'] = data['file_ind'].astype(str)
        data.to_csv(filename + ".csv")

  regex = re.compile(pat, flags=flags)


In [64]:
data.head()

Unnamed: 0,raw,timestamp,user,message,file_ind
0,[04:14] <Gobbert> ziggi: what do you need help...,04:14,Gobbert,ziggi: what do you need help with?,0
1,[04:14] <ziggi> i am,04:14,ziggi,i am,1
2,[04:15] <joshua__> boot speed was very slow,04:15,joshua__,boot speed was very slow,2
3,[04:15] <joshua__> 2 min to boot,04:15,joshua__,2 min to boot,3
4,[04:15] <joshua__> but windows machine was ver...,04:15,joshua__,but windows machine was very fast,4


### Aggregate CSVs

In [74]:
date_pattern = "(\d\d\d\d-\d\d-\d\d)"

# Train
filepath = '../data/train'
csvs_train = pd.DataFrame(g(filepath + '/*.csv'), columns=["path"])
csvs_train["date"] = csvs_train["path"].str.extract(date_pattern, expand=True)

# Test
filepath = '../data/test'
csvs_test = pd.DataFrame(g(filepath + '/*.csv'), columns=["path"])
csvs_test["date"] = csvs_test["path"].str.extract(date_pattern, expand=True)


# Dev
filepath = '../data/dev'
csvs_dev = pd.DataFrame(g(filepath + '/*.csv'), columns=["path"])
csvs_dev["date"] = csvs_dev["path"].str.extract(date_pattern, expand=True)


In [75]:
csvs_train.head(3)

Unnamed: 0,path,date
0,../data/train\2004-12-25.train-c.ascii.txt_ann...,2004-12-25
1,../data/train\2005-02-06.train-c.ascii.txt_ann...,2005-02-06
2,../data/train\2005-02-08.train-a.ascii.txt_ann...,2005-02-08


In [67]:
#Aggregate files
rootfile = '../'

agg_train = pd.DataFrame()
for i in range(len(csvs_train.path)):
    data = pd.read_csv(csvs_train.path[i], header = 0, index_col=0)
    data['date'] = pd.Series([csvs_train.date[i] for x in range(len(data))])
    agg_train = agg_train.append(data)
agg_train.to_csv(rootfile + "agg_train.csv")
    
agg_test = pd.DataFrame()
for i in range(len(csvs_test.path)):
    data = pd.read_csv(csvs_test.path[i], header = 0, index_col=0)
    data['date'] = pd.Series([csvs_test.date[i] for x in range(len(data))])
    agg_test = agg_test.append(data)
agg_test.to_csv(rootfile + "agg_test.csv")
    
agg_dev = pd.DataFrame()
for i in range(len(csvs_dev.path)):
    data = pd.read_csv(csvs_dev.path[i], header = 0, index_col=0)
    data['date'] = pd.Series([csvs_dev.date[i] for x in range(len(data))])
    agg_dev = agg_dev.append(data)
agg_dev.to_csv(rootfile + "agg_dev.csv")

In [68]:
agg_dev

Unnamed: 0,raw,timestamp,user,message,file_ind,date
0,"[12:18] <|trey|> usual, quite stable though :)",12:18,|trey|,"usual, quite stable though :)",0,2004-11-15
1,[12:18] <tweaked> HrdwrBoB: ok how many partit...,12:18,tweaked,HrdwrBoB: ok how many partitions should i make?,1,2004-11-15
2,"[12:18] <Matt|> |trey|, top in the list --> ub...",12:18,Matt|,ubuntu servers,2,2004-11-15
3,[12:18] <usual> a few libs and media,12:18,usual,a few libs and media,3,2004-11-15
4,[12:18] <usual> maybe some others,12:18,usual,maybe some others,4,2004-11-15
...,...,...,...,...,...,...
1245,[21:57] <zacky83> who can help me on this,21:57,zacky83,who can help me on this,1245,2016-12-19
1246,"[21:57] <Mccallum1983> can anyone assist, when...",21:57,Mccallum1983,"can anyone assist, when i try to install bitc...",1246,2016-12-19
1247,[21:57] <figure002> OerHeks: still makes no se...,21:57,figure002,OerHeks: still makes no sense to me why a dae...,1247,2016-12-19
1248,[21:58] <figure002> zacky83: did you enable th...,21:58,figure002,zacky83: did you enable the jails?,1248,2016-12-19


In [69]:
# Remove command messages
agg_dict = {
    'dev':agg_dev,
    'train': agg_train,
    'test': agg_test
}

for key in agg_dict.keys():
    agg_dict[key] = agg_dict[key][agg_dict[key]['timestamp'].notnull()]

In [70]:
agg_dict['dev']

Unnamed: 0,raw,timestamp,user,message,file_ind,date
0,"[12:18] <|trey|> usual, quite stable though :)",12:18,|trey|,"usual, quite stable though :)",0,2004-11-15
1,[12:18] <tweaked> HrdwrBoB: ok how many partit...,12:18,tweaked,HrdwrBoB: ok how many partitions should i make?,1,2004-11-15
2,"[12:18] <Matt|> |trey|, top in the list --> ub...",12:18,Matt|,ubuntu servers,2,2004-11-15
3,[12:18] <usual> a few libs and media,12:18,usual,a few libs and media,3,2004-11-15
4,[12:18] <usual> maybe some others,12:18,usual,maybe some others,4,2004-11-15
...,...,...,...,...,...,...
1245,[21:57] <zacky83> who can help me on this,21:57,zacky83,who can help me on this,1245,2016-12-19
1246,"[21:57] <Mccallum1983> can anyone assist, when...",21:57,Mccallum1983,"can anyone assist, when i try to install bitc...",1246,2016-12-19
1247,[21:57] <figure002> OerHeks: still makes no se...,21:57,figure002,OerHeks: still makes no sense to me why a dae...,1247,2016-12-19
1248,[21:58] <figure002> zacky83: did you enable th...,21:58,figure002,zacky83: did you enable the jails?,1248,2016-12-19


In [71]:
def generate_uuid(row):
    try:
        uuid =  '_'.join(map(str, [row['date'].replace('-', '_'),
                row['timestamp'].replace(':', '_'),
                row['user'],
                str(row['file_ind'])]))
    except Exception as e:
        print(row['file_ind'])
        raise e
    return uuid


In [72]:
for key in agg_dict.keys():
    agg_dict[key]['uuid'] = agg_dict[key].apply(lambda row: generate_uuid(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [73]:
agg_dict['dev']['uuid']

0                2004_11_15_12_18_|trey|_0
1               2004_11_15_12_18_tweaked_1
2                 2004_11_15_12_18_Matt|_2
3                 2004_11_15_12_18_usual_3
4                 2004_11_15_12_18_usual_4
                       ...                
1245         2016_12_19_21_57_zacky83_1245
1246    2016_12_19_21_57_Mccallum1983_1246
1247       2016_12_19_21_57_figure002_1247
1248       2016_12_19_21_58_figure002_1248
1249    2016_12_19_21_59_Mccallum1983_1249
Name: uuid, Length: 11644, dtype: object