## Preprocessing of the Log File
#### We will preprocess and wrangle the messy log file and populate a clean dataframe from it. This dataframe will be used to visualise our data in our web app, built with the Dash Plotly framework.

### Import libraries

In [72]:
import pandas as pd
import numpy as np
from datetime import datetime
import nltk
from nltk.tokenize import word_tokenize
import re

### Read contents of the file to the program

In [73]:
# Location of the raw log and the number of leading lines to load.
LOG_FILE_PATH = "error.txt"
MAX_LOG_LINES = 5000

# Read at most MAX_LOG_LINES lines into one string. The `with` block closes
# the handle even if reading fails; readline() returns "" once the end of
# the file is reached, so shorter files simply yield fewer lines.
with open(LOG_FILE_PATH, "r") as file:
    contents = "".join(file.readline() for _ in range(MAX_LOG_LINES))

### Data wrangling

In [74]:
# Split the raw text on newlines: one log entry per element.
lines = contents.split('\n')

# Lines with fewer space-separated pieces than this are skipped.
MIN_PIECES_PER_ENTRY = 12
# Cleaned error messages are truncated to this many words.
MAX_WORDS_PER_MESSAGE = 5


def _clean_error_message(dirty, max_words=MAX_WORDS_PER_MESSAGE):
    """Reduce the raw tail of a log line to a short, lower-cased message.

    Only space-separated tokens made up entirely of alphanumeric characters
    and containing no digit are kept; tokens with punctuation or digits are
    dropped. Empty tokens (produced by consecutive spaces) are ignored, so
    they neither count towards ``max_words`` nor leave stray whitespace in
    the result.

    Parameters
    ----------
    dirty : str
        Everything that follows the severity field of a log line.
    max_words : int
        Maximum number of words kept in the cleaned message.

    Returns
    -------
    str
        The cleaned message, or "" when no token survives the filter.
    """
    words = [
        piece
        for piece in dirty.split(" ")
        if piece
        and piece.isalnum()
        and not any(char.isdigit() for char in piece)
    ]
    return " ".join(words[:max_words]).lower()


date_time_list = []
severity_level_list = []
dirty_list = []
error_msg_list = []
for line in lines:
    pieces = line.split(" ")
    if len(pieces) < MIN_PIECES_PER_ENTRY:
        # some lines are shorter than the minimum required length
        continue

    dirty = " ".join(pieces[6:])
    error_msg = _clean_error_message(dirty)
    if not error_msg:
        # Skip the whole entry, not just its message: every list must hold
        # exactly one item per kept entry so the columns stay aligned.
        continue

    date_time_list.append(" ".join(pieces[:5]))  # first five pieces: timestamp
    severity_level_list.append(pieces[5])        # sixth piece: severity tag
    dirty_list.append(dirty)
    error_msg_list.append(error_msg)
    
# Clean and format the dates.
MONTH_MAP = {
    # map month abbreviations to their two-digit representations
    "Jan":"01","Feb":"02","Mar":"03","Apr":"04","May":"05","Jun":"06",
    "Jul":"07","Aug":"08","Sep":"09","Oct":"10","Nov":"11","Dec":"12"
}


def _parse_log_date(date_time_str):
    """Split a log timestamp into its weekday label and calendar date.

    The string is expected to hold five space-separated pieces, optionally
    wrapped in square brackets: weekday, month name, day of month, time and
    four-digit year. The time piece is ignored.

    Parameters
    ----------
    date_time_str : str
        Timestamp text taken from the start of a log line.

    Returns
    -------
    tuple
        ``(day_of_week, date)`` where ``day_of_week`` is the first piece of
        the timestamp and ``date`` is a ``datetime.date``.

    Raises
    ------
    KeyError
        If the month abbreviation is not in ``MONTH_MAP``.
    ValueError
        If day or year do not form a valid date.
    """
    date_time_pieces = date_time_str.replace('[', '').replace(']', '').split(" ")
    day_of_week = date_time_pieces[0]
    month = MONTH_MAP[date_time_pieces[1][:3]]
    day = date_time_pieces[2]
    # Parse the full four-digit year with %Y; truncating it to two digits
    # for %y would silently discard the century.
    year = date_time_pieces[4]
    formatted_date_str = "/".join([day, month, year])
    return day_of_week, datetime.strptime(formatted_date_str, '%d/%m/%Y').date()


day_of_week_list = []
date_list = []
for date_time_str in date_time_list:
    day_of_week, log_date = _parse_log_date(date_time_str)
    day_of_week_list.append(day_of_week)
    date_list.append(log_date)

# Create the dictionary of all the wrangled log file data
# (one key per dataframe column).
data = {
    "date": date_list,
    "day_of_the_week": day_of_week_list,
    "severity_level": severity_level_list,
    "error_messages": error_msg_list
}

# Each column must describe the same log entries, so all lists need the same
# length. Fail here with an explicit message rather than later, when the
# dataframe is built.
column_lengths = {name: len(values) for name, values in data.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(
        "Wrangled columns have different lengths: " + str(column_lengths)
    )

### Extracting insight for Visualisations

In [89]:
# Build the dataframe from the wrangled columns and preview its last rows.
df = pd.DataFrame(data=data)
df.tail(n=5)

Unnamed: 0,date,day_of_the_week,error_messages,severity_level
4995,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
4996,2020-03-29,Sun,worker local already initialized,[proxy:debug]
4997,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
4998,2020-03-29,Sun,worker local already initialized,[proxy:debug]
4999,2020-03-29,Sun,worker shared already initialized,[proxy:debug]


In [76]:
# Tally how many times each distinct error message occurs and keep the
# messages and their counts as two parallel lists.
error_msg_column = df['error_messages']
unique_errors_counts = error_msg_column.value_counts()
distinct_error_msgs = list(unique_errors_counts.index.values)
occurrence_counts = unique_errors_counts.tolist()
unique_error_frequencies = dict(
    unique_error_msgs=distinct_error_msgs,
    frequencies=occurrence_counts,
)
#unique_error_frequencies

In [77]:
# count how frequently each error occurs per month
# df['date'] = pd.to_datetime(df['date'])
# gy = df.groupby(pd.Grouper(key='date', freq='Y'))['error_messages'].count().tolist()
# print(gy)
# df['Year'] = pd.DatetimeIndex(df['date']).year
# df['Year'].unique().tolist()
# df

In [88]:
# create a bunch of groupby methods for table

# sort by year
# df['Year'] = pd.DatetimeIndex(df['date']).year.astype(int)
# df['Month'] = pd.DatetimeIndex(df['date']).month.astype(int)
# df.sort_values(by='Month', ascending=False)
# df

Unnamed: 0,date,day_of_the_week,error_messages,severity_level,Year,Month
0,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
1,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
2,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
3,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
4,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
5,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
6,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
7,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
8,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
9,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error],2018,2
