## Preprocessing of the Log File
#### We will preprocess and wrangle the messy log files and cleanly populate a dataframe. This dataframe will be used to visualise our data in our web app, which is built with the Dash Plotly framework.

### Import libraries

In [162]:
import pandas as pd
import numpy as np
from datetime import datetime
import nltk
from nltk.tokenize import word_tokenize
import re

### Read the contents of the file into the program

In [163]:
# Upper bound on how many log lines are pulled into memory for this analysis.
MAX_LOG_LINES = 5000

# Read at most the first MAX_LOG_LINES lines of the log file. The context
# manager closes the file handle as soon as reading finishes (or fails), so
# the notebook no longer leaks an open descriptor on every run.
# readline() returns "" once the end of the file is reached, so shorter files
# simply contribute nothing for the remaining iterations.
with open("preprocessing/error.txt", "r") as file:
    contents = "".join(file.readline() for _ in range(MAX_LOG_LINES))

### Data wrangling

In [164]:
# Break the raw text into individual log entries, one per line.
lines = contents.split('\n')

# Parallel lists, one element per accepted log entry:
date_time_list = []       # the first five space-separated tokens, re-joined
severity_level_list = []  # the sixth token
dirty_list = []           # every remaining token, re-joined (raw message)

for entry in lines:
    tokens = entry.split(" ")
    # Entries with 11 tokens or fewer are too short to hold all the fields
    # we need, so they are ignored.
    if len(tokens) <= 11:
        continue
    date_time_list.append(" ".join(tokens[:5]))
    severity_level_list.append(tokens[5])
    dirty_list.append(" ".join(tokens[6:]))

# clean the error messages
#
# For every raw message we keep only the words made up entirely of
# alphanumeric, non-digit characters (this drops paths, IPs, pids, punctuation
# and so on), then cut the result down to its first five space-separated
# pieces and lower-case it.
#
# An entry whose cleaned message comes out empty is dropped from ALL of the
# parallel lists (timestamps, severities, raw messages), not only from
# error_msg_list. Skipping it in error_msg_list alone would leave that list
# shorter than the others, so rows would pair with the wrong message and the
# dataframe construction would fail on unequal column lengths.
error_msg_list = []
error_msg = ""
aligned_date_times = []
aligned_severities = []
aligned_dirty = []
for date_time_entry, severity_entry, dirty in zip(
    date_time_list, severity_level_list, dirty_list
):
    kept_words = [
        word
        for word in dirty.split(" ")
        if all(char.isalnum() and not char.isdigit() for char in word)
    ]
    # The extra "" mirrors the trailing space that word-by-word accumulation
    # leaves behind, so short messages keep their historical trailing space
    # (e.g. 'user not ') and downstream consumers see unchanged strings.
    error_msg = " ".join((kept_words + [""])[:5])
    if not error_msg:
        continue
    error_msg_list.append(error_msg.lower())
    aligned_date_times.append(date_time_entry)
    aligned_severities.append(severity_entry)
    aligned_dirty.append(dirty)

# Re-bind the parallel lists so every list describes exactly the same entries.
date_time_list = aligned_date_times
severity_level_list = aligned_severities
dirty_list = aligned_dirty
    
# clean and format the dates
MONTH_MAP = {
    # map month abbreviations to their two-digit representations
    "Jan":"01","Feb":"02","Mar":"03","Apr":"04","May":"05","Jun":"06",
    "Jul":"07","Aug":"08","Sep":"09","Oct":"10","Nov":"11","Dec":"12"
}
day_of_week_list = []
date_list = []
for date_time_str in date_time_list:
    # Strip the square brackets wrapped around the timestamp, then split it
    # into its pieces: [0] day of week, [1] month name, [2] day of month,
    # [3] (unused here), [4] year.
    date_time_str = date_time_str.replace('[', '').replace(']', '')
    date_time_pieces = date_time_str.split(" ")
    day_of_week_list.append(date_time_pieces[0])
    # Keep the full year and parse it with %Y. Truncating it to two digits and
    # parsing with %y forces strptime to guess the century, which silently
    # maps e.g. 1950 to 2050.
    formatted_date_str = "/".join([
        date_time_pieces[2],
        MONTH_MAP[date_time_pieces[1][:3]],
        date_time_pieces[4],
    ])
    # convert date string to actual date object and append to date list
    date_list.append(datetime.strptime(formatted_date_str, '%d/%m/%Y').date())

# NOTE: no separate pass is needed to prune over-long error messages: every
# message is already cut down to at most five space-separated pieces when it
# is cleaned. A disabled draft of such a pass (kept as a bare string literal
# that was evaluated and discarded on every run) has been removed.

# create the dictionary of all the wrangled log file data
# Each value is a list with one element per log entry; the keys become the
# dataframe's column names.
data = {
    "date_time": date_list,
    "day_of_the_week": day_of_week_list,
    "severity_level": severity_level_list,
    "error_messages": error_msg_list
}

### Extracting insights for visualisations

In [165]:
# Turn the column-oriented dictionary into a dataframe: each key becomes a
# column and each list supplies that column's values.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,date_time,day_of_the_week,error_messages,severity_level
0,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
1,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
2,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
3,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
4,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
5,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
6,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
7,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
8,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]
9,2018-02-12,Mon,cannot serve directory no matching,[autoindex:error]


In [166]:
# Tally how many times each distinct error message occurs.
unique_errors_counts = df['error_messages'].value_counts()

# Split the tally into two parallel lists (message text and its count) so the
# result is a plain dictionary of lists.
error_labels = list(unique_errors_counts.index)
error_totals = unique_errors_counts.tolist()
unique_error_frequencies = {
    "unique_error_msgs": error_labels,
    "frequencies": error_totals,
}
unique_error_frequencies

{'unique_error_msgs': ['worker shared already initialized ',
  'worker local already initialized ',
  'php  undefined email in',
  'php  undefined password in',
  'php  failed to open',
  'php fatal  failed opening',
  'cannot serve directory no matching',
  'user authentication failure for password',
  'script not found or unable',
  'php  undefined conn in',
  'php fatal  uncaught call',
  'php  undefined in on',
  'php  access denied for',
  'php  expects parameter to',
  'php  undefined submit in',
  'php  use of undefined',
  'user authorization failure for ',
  'php  undefined url in',
  'php  trying to destroy',
  'php  array to string',
  'php  undefined ffffff in',
  'user not ',
  'php parse  syntax unexpected'],
 'frequencies': [2301,
  2300,
  141,
  121,
  24,
  24,
  17,
  11,
  11,
  9,
  8,
  7,
  6,
  3,
  3,
  3,
  3,
  2,
  2,
  1,
  1,
  1,
  1]}