In [1]:
import numpy as np
import pandas as pd
import re
import datetime as dt
from utilities import extract_data

# Data Preprocessing
The access logs were read and parsed according to their log format. The following fields were extracted from each entry:

1. IP
2. Timestamp
3. Request URL 
4. Status Code
5. Return Size
6. Referer
7. User Agent

In [2]:
# Build the initial DataFrame from the parsed access log.
# `extract_data` comes from the project's `utilities` module; whatever it
# returns for "accesslogs.log" is handed to the DataFrame constructor as-is.
initial_data = pd.DataFrame(data=extract_data("accesslogs.log"))

In [3]:
# Persist the parsed web logs to disk.
# The row index is not written, so the CSV contains only the data columns.
initial_data.to_csv(path_or_buf='parsed.csv', index=False)

# Feature Engineering
The following features were extracted: 

1. Total number of requests made per IP
2. Time difference (in seconds) between successive requests from the same IP

In [4]:
# Number of requests per IP

# Series indexed by IP holding the number of non-null timestamps for that IP.
ip_req = initial_data.groupby('IP')['Timestamp'].count()

def req_count(row):
    """Return the total number of requests recorded for the IP of `row`.

    `row` is a single DataFrame row (or any mapping) with an 'IP' key.
    Raises KeyError if that IP is not present in `ip_req`.
    Kept for row-wise callers; the column below is filled without it.
    """
    return ip_req[row['IP']]

# Look up each row's IP in `ip_req` in one vectorised pass instead of calling
# a Python function once per row via `apply(axis=1)`. This gives the same
# counts, scales to large logs, and also works on an empty frame.
initial_data['Total Requests'] = initial_data['IP'].map(ip_req)


In [5]:
# Parse the raw 'Timestamp' strings into datetime64 values.
# Only the text before the first space is kept; anything after it is discarded
# (presumably the UTC offset -- confirm against the log format).
# NOTE: this cell expects 'Timestamp' to still hold strings, so it cannot be
# re-run once the column has been converted.
times = initial_data['Timestamp'].str.split(' ', expand=True)[0]
initial_data['Timestamp'] = pd.to_datetime(times, format="%d/%b/%Y:%H:%M:%S")


In [6]:
# Adding time related features

# Calendar date (without time of day) of each request.
initial_data['Date'] = initial_data['Timestamp'].dt.date

# Order rows by IP and then chronologically, so that within each IP group the
# preceding row is the request made immediately before.
initial_data.sort_values(['IP', 'Timestamp'], inplace=True)

# Timestamp of the same IP's previous request; NaT for each IP's first request.
initial_data['Previous Request'] = initial_data.groupby(['IP'])['Timestamp'].shift(1)

# Elapsed seconds since the same IP's previous request (NaN for the first one).
# `.dt.total_seconds()` is required here: `.dt.seconds` returns only the
# seconds *component* of the timedelta (0-86399) and silently drops whole days,
# so a gap of 1 day + 5 s would have been reported as 5 instead of 86405.
initial_data['Successive Req Diff'] = (
    initial_data['Timestamp'] - initial_data['Previous Request']
).dt.total_seconds()