## Preprocessing of the Log File
#### We will preprocess and wrangle the messy log files and cleanly populate a dataframe. This dataframe will be used to visualise our data in our web app using the Dash Plotly framework.

### Import libraries

In [28]:
import pandas as pd
import numpy as np
from datetime import datetime

### Read contents of the file to the program

In [29]:
# Read at most the first MAX_LINES lines of the log file into one string.
# The `with` block closes the file handle as soon as reading finishes (or fails),
# instead of leaving it open for the lifetime of the kernel.
MAX_LINES = 5000
with open("preprocessing/error.txt", "r") as file:
    # readline() returns "" once the end of the file is reached, so shorter
    # files simply contribute fewer lines.
    contents = "".join(file.readline() for _ in range(MAX_LINES))

### Data wrangling

In [30]:
# Break the raw text into individual log entries, one per newline-separated line
lines = contents.split('\n')

# Parallel lists: one element per log entry that passes the length check below
date_time_list, severity_level_list, dirty_list = [], [], []
for entry in lines:
    tokens = entry.split(" ")
    if len(tokens) <= 11:
        # too few space-separated tokens to be a usable entry; skip it
        continue
    # tokens 0-4 form the date/time stamp, token 5 the severity level,
    # and everything from token 6 onwards is the (still dirty) message
    date_time_list.append(" ".join(tokens[:5]))
    severity_level_list.append(tokens[5])
    dirty_list.append(" ".join(tokens[6:]))

# clean the error messages
# For each raw message keep only the purely alphabetic tokens (a token is dropped
# if it contains a digit or any non-alphanumeric character), follow each kept
# token with a single space, and lowercase the result.
#
# Exactly one entry is appended per item in dirty_list -- even when no token
# survives the filter, in which case the entry is an empty string. Skipping such
# rows would leave error_msg_list shorter than the date/severity lists built from
# the same log lines, so the columns would no longer line up row-for-row and could
# not be combined into a single dataframe.
error_msg_list = []
error_msg = ""
for dirty in dirty_list:
    kept_pieces = [
        dirt_piece
        for dirt_piece in dirty.split(" ")
        if all(char.isalnum() and not char.isdigit() for char in dirt_piece)
    ]
    error_msg = "".join(kept_piece + " " for kept_piece in kept_pieces)
    error_msg_list.append(error_msg.lower())

# Clean and format the dates: record the weekday name and build a date object
MONTH_MAP = {
    # three-letter month abbreviation -> two-digit month number
    "Jan":"01","Feb":"02","Mar":"03","Apr":"04","May":"05","Jun":"06",
    "Jul":"07","Aug":"08","Sep":"09","Oct":"10","Nov":"11","Dec":"12"
}
day_of_week_list = []
date_list = []
for raw_stamp in date_time_list:
    # remove every square bracket, then split the stamp into its fields
    stamp_fields = raw_stamp.replace('[','').replace(']','').split(" ")
    day_of_week_list.append(stamp_fields[0])
    day_part = stamp_fields[2]
    month_part = MONTH_MAP[stamp_fields[1][:3]]
    # only the characters after the first two of the year field are kept
    year_part = stamp_fields[4][2:]
    # parse the "dd/mm/yy" string and store just the date component
    parsed_stamp = datetime.strptime("/".join([day_part, month_part, year_part]), '%d/%m/%y')
    date_list.append(parsed_stamp.date())

# Gather the wrangled columns into a single mapping keyed by column name
data = dict(
    date_time=date_list,
    day_of_the_week=day_of_week_list,
    severity_level=severity_level_list,
    error_messages=error_msg_list,
)

### Extracting insight for Visualisations

In [31]:
# Build the dataframe from the column mapping and preview its first five rows
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,date_time,day_of_the_week,error_messages,severity_level
0,2018-02-12,Mon,cannot serve directory no matching directoryin...,[autoindex:error]
1,2018-02-12,Mon,cannot serve directory no matching directoryin...,[autoindex:error]
2,2018-02-12,Mon,cannot serve directory no matching directoryin...,[autoindex:error]
3,2018-02-12,Mon,cannot serve directory no matching directoryin...,[autoindex:error]
4,2018-02-12,Mon,cannot serve directory no matching directoryin...,[autoindex:error]


In [41]:
# Count how many times each distinct error message occurs
series = df['error_messages'].value_counts()

# Parallel lists: the distinct messages and their occurrence counts
unique_errors_lst = series.index.tolist()
freq_lst = series.tolist()

all_data = dict(err=unique_errors_lst, freq=freq_lst)

all_data

{'err': ['worker shared already initialized ',
  'worker local already initialized ',
  'php  undefined email in on line ',
  'php  undefined password in on line ',
  'php  failed to open no such file or directory in on line ',
  'php fatal  failed opening required in on line ',
  'cannot serve directory no matching directoryindex and directory index forbidden by options directive ',
  'user authentication failure for password mismatch ',
  'script not found or unable to stat ',
  'php  undefined conn in on line ',
  'php fatal  uncaught call to a member function on null in  thrown in on line ',
  'php  undefined in on line ',
  'php  access denied for user in on line ',
  'user authorization failure for  ',
  'php  undefined submit in on line ',
  'php  use of undefined constant assumed in on line ',
  'php  expects parameter to be null given in on line ',
  'php  trying to destroy uninitialized session in on line ',
  'php  undefined url in on line ',
  'user not ',
  'php  array to 