## Preprocessing of the Log File
#### We will preprocess and wrangle the messy log files and cleanly populate a dataframe. This dataframe will be used to visualise our data in our web app using the Dash Plotly framework.

### Import libraries

In [61]:
import pandas as pd
import numpy as np
from datetime import datetime
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

### Read the contents of the file into the program

In [62]:
# Load the first MAX_LOG_LINES lines of the log file into a single string.
# The `with` block closes the file handle as soon as reading is done
# (the handle was previously left open for the life of the kernel).
MAX_LOG_LINES = 5000
with open("preprocessing/demo_log.txt", "r") as file:
    # readline() returns "" once the end of the file is reached, so a file
    # shorter than MAX_LOG_LINES simply contributes empty strings.
    head_lines = [file.readline() for _ in range(MAX_LOG_LINES)]
contents = "".join(head_lines)

### Data wrangling

In [69]:
# Split the raw text on newlines so that each element is one log entry.
lines = contents.split('\n') # list of log entries

MONTH_MAP = {
    # map month abbreviations to their two-digit representations
    "Jan":"01","Feb":"02","Mar":"03","Apr":"04","May":"05","Jun":"06",
    "Jul":"07","Aug":"08","Sep":"09","Oct":"10","Nov":"11","Dec":"12"
}


def _clean_error_message(dirty):
    """Return the lower-cased message made of the purely alphabetic tokens of `dirty`.

    `dirty` is split on single spaces. A token is dropped when it contains a
    digit or any non-alphanumeric character. Every kept token is followed by
    one space, so the result carries a trailing space and empty tokens
    (produced by consecutive spaces) show up as extra spaces. Returns "" when
    no token is kept.
    """
    kept_tokens = [
        token for token in dirty.split(" ")
        if not any(char.isdigit() for char in token)
        and all(char.isalnum() for char in token)
    ]
    return "".join(token + " " for token in kept_tokens).lower()


def _parse_log_date(date_time_str):
    """Return `(day_of_week, date)` parsed from the bracketed timestamp text.

    After removing '[' and ']', the text is split on spaces and read as:
    piece 0 = day of week, piece 1 = month name (first three letters are
    looked up in MONTH_MAP), piece 2 = day of month, piece 4 = year (only the
    last two digits are used). Raises KeyError for an unknown month and
    ValueError when the assembled string is not a valid date.
    """
    date_time_pieces = date_time_str.replace('[', '').replace(']', '').split(" ")
    formatted_date_str = (
        date_time_pieces[2] + "/"
        + MONTH_MAP[date_time_pieces[1][:3]] + "/"
        + date_time_pieces[4][2:]
    )
    # convert date string to actual date object
    return date_time_pieces[0], datetime.strptime(formatted_date_str, '%d/%m/%y').date()


# Build every column in a single pass so that all lists stay row-aligned.
# A log entry is either added to ALL lists or to none of them; otherwise the
# columns could end up with different lengths and the DataFrame constructor
# would raise a ValueError.
date_time_list = []
severity_level_list = []
dirty_list = []
error_msg_list = []
day_of_week_list = []
date_list = []
for line in lines:
    pieces = line.split(" ")
    if len(pieces) <= 11: # some lines are less than the minimum required length
        continue

    dirty = " ".join(pieces[6:])
    error_msg = _clean_error_message(dirty)
    if not error_msg:
        # nothing readable is left of the message: skip the whole entry
        continue

    date_time_str = " ".join(pieces[:5])
    day_of_week, log_date = _parse_log_date(date_time_str)

    date_time_list.append(date_time_str)
    severity_level_list.append(pieces[5])
    dirty_list.append(dirty)
    error_msg_list.append(error_msg)
    day_of_week_list.append(day_of_week)
    date_list.append(log_date)

# create the dictionary of all the wrangled log file data
data = {
    "date_time": date_list,
    "day_of_the_week": day_of_week_list,
    "severity_level": severity_level_list,
    "error_messages": error_msg_list
}
# Preview the wrangled data without relying on a `df` variable that is only
# assigned in a later cell (that would fail on a fresh kernel).
pd.DataFrame(data).head(10)

['user authorization failure for  ', 'user authentication failure for password mismatch ', 'worker shared already initialized ', 'worker local already initialized ', 'worker shared already initialized ', 'worker local already initialized ', 'worker shared already initialized ', 'php  undefined salt in on line ', 'php  undefined salt in on line ', 'script not found or unable to stat ', 'script not found or unable to stat ', 'script not found or unable to stat ', 'php  undefined title in on line ', 'php  undefined return in on line ', 'php  undefined key in on line ', 'php  undefined type in on line ', 'php  undefined title in on line ', 'php  undefined return in on line ', 'php  expects parameter to be string given in on line ', 'php  use of undefined constant console assumed in on line ', 'php  expects parameter to be string given in on line ', 'php  use of undefined constant console assumed in on line ', 'php  expects parameter to be string given in on line ', 'cannot serve directory 

Unnamed: 0,date_time,day_of_the_week,error_messages,severity_level
0,2019-03-03,Sun,user authorization failure for,[authz_core:error]
1,2019-03-03,Sun,user authentication failure for password misma...,[auth_basic:error]
2,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
3,2020-03-29,Sun,worker local already initialized,[proxy:debug]
4,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
5,2020-03-29,Sun,worker local already initialized,[proxy:debug]
6,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
7,2019-06-07,Fri,php undefined salt in on line,[:error]
8,2019-06-07,Fri,php undefined salt in on line,[:error]
9,2020-06-06,Sat,script not found or unable to stat,[:error]


### Extracting insight for Visualisations

In [68]:
# Assemble the wrangled columns into a DataFrame and preview its first ten rows.
df = pd.DataFrame(data=data)
df.head(n=10)

[nltk_data] Downloading package stopwords to /home/pako/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date_time,day_of_the_week,error_messages,severity_level
0,2019-03-03,Sun,user authorization failure for,[authz_core:error]
1,2019-03-03,Sun,user authentication failure for password misma...,[auth_basic:error]
2,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
3,2020-03-29,Sun,worker local already initialized,[proxy:debug]
4,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
5,2020-03-29,Sun,worker local already initialized,[proxy:debug]
6,2020-03-29,Sun,worker shared already initialized,[proxy:debug]
7,2019-06-07,Fri,php undefined salt in on line,[:error]
8,2019-06-07,Fri,php undefined salt in on line,[:error]
9,2020-06-06,Sat,script not found or unable to stat,[:error]


In [65]:
# Tally how many times each distinct cleaned error message appears
# (value_counts orders the result from most to least frequent).
unique_errors_counts = df['error_messages'].value_counts()

# Store the messages and their counts as two parallel lists.
unique_error_frequencies = {}
unique_error_frequencies["unique_error_msgs"] = unique_errors_counts.index.tolist()
unique_error_frequencies["frequencies"] = unique_errors_counts.tolist()
unique_error_frequencies

{'unique_error_msgs': ['php  expects parameter to be string given in on line ',
  'php  failed to open no such file or directory in on line ',
  'script not found or unable to stat ',
  'worker shared already initialized ',
  'worker local already initialized ',
  'php  undefined salt in on line ',
  'php  use of undefined constant console assumed in on line ',
  'php  undefined return in on line ',
  'response header name contains invalid aborting ',
  'user authorization failure for  ',
  'php  undefined title in on line ',
  'php  access denied for user in on line ',
  'php fatal  uncaught call to undefined function in  thrown in on line ',
  'user authentication failure for password mismatch ',
  'php parse  syntax unexpected in on line ',
  'php  use of undefined constant assumed in on line ',
  'php  failed opening for inclusion in on line ',
  'cannot serve directory no matching directoryindex and directory index forbidden by options directive ',
  'user authentication failure f