In [1]:
import numpy as np
import pandas as pd
import re
import datetime as dt
from utilities import extract_data

# Data Preprocessing
Access logs were read and parsed based on the format. Extracted features:

1. IP
2. Timestamp
3. Request URL 
4. Status Code
5. Return Size
6. Referer
7. User Agent

In [2]:
# Build the initial dataframe from the records that `extract_data`
# returns for the raw access log.

initial_data = pd.DataFrame(data=extract_data("accesslogs.log"))

In [3]:
# Persist the parsed web logs to disk (the dataframe index is not written)

initial_data.to_csv(path_or_buf='parsed.csv', index=False)

# Feature Engineering
The following features were extracted: 

1. Total number of request made per IP
2. Time difference between successive request
3. Number of successful request
4. Number of GET requests
5. Return Size
6. Number of requests per day


In [4]:
# Number of requests per IP

# Series indexed by IP holding the number of non-null 'Timestamp' entries
# recorded for that IP.
ip_req = initial_data.groupby('IP')['Timestamp'].count()

def req_count(row):
    """Return the total number of requests logged for the IP of `row`.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'IP' (e.g. one row of `initial_data`).

    Returns
    -------
    The entry of `ip_req` for that IP.

    Raises
    ------
    KeyError
        If the IP is not present in `ip_req`.
    """
    return ip_req[row['IP']]

# Vectorised lookup: map every row's IP directly onto its count instead of
# invoking `req_count` once per row through a row-wise `apply(axis=1)`.
# `req_count` is kept above for any caller that still needs a per-row lookup.
# NOTE: an IP missing from `ip_req` (e.g. a null IP) now yields NaN here
# rather than raising KeyError.
initial_data['Total Requests'] = initial_data['IP'].map(ip_req)

In [5]:
# Convert Timestamp to DateTime type.
# Only the text before the first space is parsed; whatever follows it
# (presumably a UTC offset -- TODO confirm against the log format) is dropped.
times = initial_data['Timestamp'].str.split(' ', expand=True)[0]
initial_data['Timestamp'] = pd.to_datetime(
    times,
    format="%d/%b/%Y:%H:%M:%S",
)


In [6]:
# Adding time related features

# Calendar date of each request.
initial_data['Date'] = initial_data['Timestamp'].dt.date
# Order each IP's requests chronologically so that shift(1) below picks up
# the request immediately preceding the current one.
initial_data.sort_values(['IP', 'Timestamp'], inplace=True)
# Timestamp of the same IP's previous request; missing for an IP's first request.
initial_data['Previous Request'] = initial_data.groupby(['IP'])['Timestamp'].shift(1)
# Gap to the previous request, in seconds. `total_seconds()` is required here:
# `.dt.seconds` only returns the seconds *component* of the timedelta
# (0..86399) and silently discards whole days, so a gap of 1 day + 5 s would
# have been reported as 5 s.
initial_data['Successive Req Diff'] = (
    initial_data['Timestamp'] - initial_data['Previous Request']
).dt.total_seconds()

In [7]:
# Binary indicator columns for the request method and the response status

# 1 when "GET" occurs within the first four characters of the URL field, else 0
initial_data['GET'] = initial_data['URL'].apply(
    lambda url: 1 if "GET" in url[:4] else 0
)
# 1 when the status code lies in the 2xx range (200..299 inclusive), else 0
initial_data['Success'] = initial_data['Status Code'].apply(
    lambda code: 1 if 200 <= code <= 299 else 0
)


## Preparation of final dataset


In [85]:
# Prepare final dataset: one row per IP, one column per aggregated feature.
final_data = (
    initial_data.groupby('IP')['Timestamp']
    .count()
    .rename('Total Requests')
    .reset_index()
)

# Finding Requests per day
# Requests per (IP, Date) pair; the count lands in the 'Timestamp' column.
daily_count = initial_data.groupby(['IP','Date'])['Timestamp'].count().reset_index()
# Average the daily counts per IP. The count column is selected explicitly:
# averaging the whole frame would also try to average the non-numeric 'Date'
# column, which raises TypeError on pandas >= 2.0 (older versions silently
# dropped it).
daily_count_aggregate = daily_count.groupby('IP')[['Timestamp']].mean()
daily_count_aggregate.columns = ['Mean']
# Series.map aligns on the IP index directly, replacing a per-row .loc lookup.
final_data['Daily Mean'] = final_data['IP'].map(daily_count_aggregate['Mean'])

# Number of GET requests and success codes
# 'GET' and 'Success' are 0/1 flags, so their per-IP sums are counts.
get_request = initial_data.groupby(['IP'])['GET'].sum()
successes = initial_data.groupby('IP')['Success'].sum()
final_data['GET requests'] = final_data['IP'].map(get_request)
final_data['Successful requests'] = final_data['IP'].map(successes)

# Response size
# fillna(0): std is NaN for an IP with a single request, and mean is NaN
# when every return size of an IP is missing.
# NOTE(review): assumes 'Return Size' is numeric -- confirm against extract_data.
return_size_mean = initial_data.groupby('IP')['Return Size'].mean().fillna(0)
return_size_std = initial_data.groupby('IP')['Return Size'].std().fillna(0)
final_data['Mean Return Size'] = final_data['IP'].map(return_size_mean)
final_data['Return Size Std'] = final_data['IP'].map(return_size_std)

# Time difference between successive requests
# An IP with a single request has no gap at all (mean is NaN) -> reported as 0.
time_difference = initial_data.groupby('IP')['Successive Req Diff'].mean().fillna(0)
final_data['Mean Time Difference'] = final_data['IP'].map(time_difference)


In [88]:
# Write the per-IP feature table to disk (the dataframe index is not written)
final_data.to_csv(path_or_buf='finaldata.csv', index=False)