In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
import seaborn as sns

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
#from scipy.stats import skew
from subprocess import check_output



# Creating pipeline

In [6]:
# Euclidean distance
def measureDistance(data, model):
    """Return each row's Euclidean distance to its assigned cluster centre.

    Parameters
    ----------
    data : pandas.DataFrame
        Points to score, one row per observation.
    model : fitted clustering model
        Must expose ``cluster_centers_`` and ``predict`` (for example a
        fitted scikit-learn ``KMeans``).

    Returns
    -------
    pandas.Series
        One float distance per row, indexed like ``data``.
    """
    if len(data) == 0:
        return pd.Series(dtype=float)

    # Ask the model which centroid each row belongs to. Indexing
    # ``model.labels_`` by row position only lines up with the exact frame
    # used for fitting, and the former ``label - 1`` lookup selected a
    # different centroid than the assigned one (label 0 wrapped to the last).
    labels = np.asarray(model.predict(data))
    centers = np.asarray(model.cluster_centers_)[labels]
    points = np.asarray(data, dtype=float)

    # Vectorized norm replaces the per-row loop (and Series.set_value, which
    # no longer exists in current pandas).
    return pd.Series(np.linalg.norm(points - centers, axis=1), index=data.index)

In [7]:
def loadFile(logFileName):
    """Read a whitespace-delimited access log into a DataFrame.

    Parameters
    ----------
    logFileName : str or path-like
        Location of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per log line with columns ``ip, date, time, numeric1, rest,
        rc, numeric2, client, in, out, us``. The placeholder columns
        ``dp1``, ``dp2`` and ``dp3`` are dropped. Date parsing is requested
        for the ``date`` and ``time`` columns (positions 3 and 4).
    """
    colNames = ['ip', 'dp1', 'dp2', 'date', 'time', 'numeric1', 'rest', 'rc',
                'numeric2', 'dp3', 'client', 'in', 'out', 'us']
    # Read the file the caller asked for (the path used to be hard-coded).
    # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
    logData = pd.read_csv(logFileName, sep=r'\s+', header=None,
                          names=colNames, parse_dates=[3, 4])
    # Drop unused placeholder columns
    return logData.drop(['dp1', 'dp2', 'dp3'], axis=1)

In [8]:
def cleanInOutUs(logData):
    """Convert the 'in', 'out' and 'us' columns from ``label:value`` text to ints.

    Each value is split on ``":"`` and the second piece is converted with
    ``int``. The frame is modified in place and also returned.
    """
    def value_after_colon(field):
        # "in:123" -> 123
        return int(field.split(":")[1])

    for column in ('in', 'out', 'us'):
        logData[column] = logData[column].apply(value_after_colon)
    return logData

In [9]:
def extractResourceOp(restserv):
    """Split a request line into ``[operation, resource, protocol]``.

    The line is split on single spaces; the resource is the second piece
    with anything from the first ``"?"`` onwards removed.
    """
    pieces = restserv.split(" ")
    return [pieces[0], pieces[1].split("?")[0], pieces[2]]


def extractOp(restserv):
    """Return the first space-separated token of a request line as a one-element list."""
    return [restserv.split(" ", 1)[0]]


def parseRestURL(logData):
    """Add an 'operation' column holding the first token of each 'rest' value.

    The frame is modified in place and also returned.
    """
    # Single pass over the 'rest' column. The previous row-wise
    # DataFrame.apply built a throwaway Series for every row, which is very
    # slow on large logs.
    logData['operation'] = [extractOp(rest)[0] for rest in logData['rest']]
    return logData

In [10]:
def extractDateFeatures(row):
    """Return the ``dayofweek`` attribute of a timestamp-like value."""
    return row.dayofweek


def parseDate(logData):
    """Add a 'day' column with the day-of-week of each 'date' value.

    The frame is modified in place and also returned.
    """
    weekday = logData['date'].apply(extractDateFeatures)
    logData['day'] = weekday
    return logData

In [11]:
def extractTimeFeatures(row):
    """Return ``[hour, minute]`` of a timestamp-like value."""
    return [row.hour, row.minute]


def parseTime(logData):
    """Add 'hour' and 'minute' columns derived from the 'time' column.

    The frame is modified in place and also returned, matching the other
    pipeline steps (this function previously returned ``None``, so
    ``logData = parseTime(logData)`` would have discarded the frame).
    """
    # One pass over the column instead of a row-wise DataFrame.apply that
    # built a Series per row.
    features = [extractTimeFeatures(stamp) for stamp in logData['time']]
    logData['hour'] = [hour for hour, _ in features]
    logData['minute'] = [minute for _, minute in features]
    return logData

In [12]:
def extracFirstIpPart(x):
    """Return the text before the first ``"."`` (the whole string if there is none)."""
    head, _, _ = x.partition(".")
    return head


def parseIP(logData):
    """Add a 'first_ip_part' column with the leading dot-separated part of 'ip'.

    The frame is modified in place and also returned.
    """
    logData['first_ip_part'] = logData['ip'].apply(extracFirstIpPart)
    return logData

In [13]:
# First-octet -> 3-bit code. Matching is exact: with prefix matching the
# "10" rule also captured "100" (making its own rule unreachable) as well as
# 101-109.
_IP_OCTET_CODES = {
    "10": [0, 0, 0],
    "100": [0, 0, 1],
    "134": [0, 1, 0],
    "137": [0, 1, 1],
    "147": [1, 0, 0],
    "150": [1, 0, 1],
    "153": [1, 0, 1],
}


def mapIpToColumn(part):
    """Map the leading part of a client address to a 3-element binary code.

    Parameters
    ----------
    part : str
        First dot-separated part of the address (a full address is also
        accepted; only the text before the first ``"."`` is used).

    Returns
    -------
    list of int
        ``[ip_3, ip_2, ip_1]``. Known octets use the table above, values
        starting with ``"localhost"`` give ``[1, 1, 0]`` and everything else
        gives ``[1, 1, 1]``.
    """
    octet = part.split(".")[0]
    code = _IP_OCTET_CODES.get(octet)
    if code is not None:
        return list(code)  # copy so callers cannot mutate the table
    if part.startswith("localhost"):
        return [1, 1, 0]
    return [1, 1, 1]


def transformIP_to_DummyVar(logData):
    """Add 'ip_3', 'ip_2', 'ip_1' columns encoding 'first_ip_part'.

    The frame is modified in place and also returned.
    """
    # One pass over the column instead of a row-wise DataFrame.apply that
    # built a Series per row.
    codes = [mapIpToColumn(part) for part in logData['first_ip_part']]
    for position, column in enumerate(['ip_3', 'ip_2', 'ip_1']):
        logData[column] = [code[position] for code in codes]
    return logData

In [14]:
# HTTP-method prefix -> 3-bit code, checked in this order.
_OPERATION_CODES = (
    ("DELETE", [0, 0, 0]),
    ("GET", [0, 0, 1]),
    ("OPTIONS", [0, 1, 0]),
    ("POST", [0, 1, 1]),
    ("PUT", [1, 0, 0]),
)


def mapOperationToColumn(part):
    """Map an operation string to a 3-element binary code.

    Returns ``[op_3, op_2, op_1]`` for the first prefix in the table that
    ``part`` starts with, or ``[1, 0, 1]`` when none matches.
    """
    for prefix, code in _OPERATION_CODES:
        if part.startswith(prefix):
            return list(code)  # copy so callers cannot mutate the table
    return [1, 0, 1]


def transformOperation_to_DummyVar(logData):
    """Add 'op_3', 'op_2', 'op_1' columns encoding the 'operation' column.

    The frame is modified in place and also returned.
    """
    # One pass over the column instead of a row-wise DataFrame.apply that
    # built a Series per row.
    codes = [mapOperationToColumn(op) for op in logData['operation']]
    for position, column in enumerate(['op_3', 'op_2', 'op_1']):
        logData[column] = [code[position] for code in codes]
    return logData

In [15]:
def normalizeData_PCA(logData):
    """Reduce the numeric columns of ``logData`` to two standardized PCA components.

    Only int64/float64 columns are used and 'numeric1' is excluded. The
    features are standardized, projected onto two principal components, and
    the components are standardized again.

    Returns
    -------
    pandas.DataFrame
        Two columns labelled ``0`` and ``1`` with a fresh default index.
    """
    # Keep only the numeric features and leave 'numeric1' out.
    numeric = logData.select_dtypes(include=[np.int64, np.float64]).drop(["numeric1"], axis=1)

    # Standardize every remaining feature before PCA.
    feature_scaler = preprocessing.StandardScaler()
    standardized = pd.DataFrame(feature_scaler.fit_transform(numeric))

    # Reduce to 2 important features (for the sake of visualization).
    components = PCA(n_components=2).fit_transform(standardized)

    # Standardize the two new features as well.
    component_scaler = preprocessing.StandardScaler()
    return pd.DataFrame(component_scaler.fit_transform(components))

In [16]:
# Scoring the whole log can take a long time on a laptop, so the rows are
# processed in chunks and progress is printed along the way.
def tagRowsWithAnomalies(logData, pcaData, outliers_fraction, kmeans):
    """Add an 'anomaly' column (0: normal, 1: anomaly) to ``logData``.

    Rows are processed in consecutive chunks of roughly 1% of the data. In
    each chunk the ``outliers_fraction`` share of rows lying farthest from
    their cluster centre is tagged as anomalous, so the distance threshold
    is computed per chunk, not globally.

    Parameters
    ----------
    logData : pandas.DataFrame
        Frame to annotate; modified in place and returned.
    pcaData : pandas.DataFrame
        Feature rows positionally aligned with ``logData``.
    outliers_fraction : float
        Share of each chunk to tag as anomalous.
    kmeans : fitted clustering model
        Passed through to ``measureDistance``.

    Raises
    ------
    ValueError
        If ``pcaData`` and ``logData`` have different numbers of rows.
    """
    # len() works for any column naming; logData[[0, 1]] required columns
    # literally named 0 and 1 and raised KeyError otherwise.
    nrows = len(logData)
    if len(pcaData) != nrows:
        raise ValueError("pcaData and logData must have the same number of rows")

    anomaly = np.zeros(nrows, dtype=int)
    # At least one row per chunk so small inputs are still processed.
    piece = max(1, nrows // 100)

    # Stepping through range() covers every row, including the remainder
    # that a fixed 101-iteration loop could leave untagged.
    for x, lower in enumerate(range(0, nrows, piece)):
        upper = min(lower + piece, nrows)

        # measureDistance is handed a chunk with a fresh 0..n-1 index.
        data_range = pcaData.iloc[lower:upper].reset_index(drop=True)
        distance = measureDistance(data_range, kmeans)

        # The caller's outliers_fraction is honoured (it used to be
        # overwritten with 0.01 here).
        number_of_outliers = int(outliers_fraction * len(distance))
        if number_of_outliers > 0:
            threshold = distance.nlargest(number_of_outliers).min()
            # (0:normal, 1:anomaly) -- written by position so the chunk's
            # reset index cannot misalign with logData's labels.
            anomaly[lower:upper] = (distance >= threshold).to_numpy().astype(int)
        if (x % 10 == 0):
            print ("chunk ", x , " is done")

    logData["anomaly"] = anomaly
    print ("Finding anomalies is finished")
    return logData
    

In [17]:
# Main function to find anomalies
def findAnomalies(logfileName, outliers_fraction, numCluster, random_state=None):
    """Run the full pipeline: load, clean, encode, cluster and tag anomalies.

    Parameters
    ----------
    logfileName : str
        Path of the access log, passed to ``loadFile``.
    outliers_fraction : float
        Estimated share of anomalous rows, passed to ``tagRowsWithAnomalies``.
    numCluster : int
        Number of KMeans clusters.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``; set it to make runs reproducible.
        The default (None) keeps KMeans' own default behaviour.

    Returns
    -------
    pandas.DataFrame
        The log data with 'cluster', 'principal_feature1',
        'principal_feature2' and the columns added by the pipeline steps.
    """
    logData = loadFile(logfileName)
    logData = cleanInOutUs(logData)
    logData = parseRestURL(logData)
    logData = parseDate(logData)

    # Transform categorical features into binary dummy variables
    logData = parseIP(logData)
    logData = transformIP_to_DummyVar(logData)
    logData = transformOperation_to_DummyVar(logData)

    pcaData = normalizeData_PCA(logData)
    # Fit on the PCA features computed above; the previous code referenced
    # an undefined name `data`, which only worked with leftover kernel state.
    kmeans = KMeans(n_clusters=numCluster, random_state=random_state).fit(pcaData)

    logData['cluster'] = kmeans.predict(pcaData)
    logData['principal_feature1'] = pcaData[0]
    logData['principal_feature2'] = pcaData[1]
    logData = tagRowsWithAnomalies(logData, pcaData, outliers_fraction, kmeans)
    return logData

In [None]:
logfileName = 'access_logs_201612.txt'
numCluster = 4
# outliers_fraction: an estimate of the share of anomalous rows in the dataset
outliers_fraction = 0.01
# Keep the tagged frame so later cells can use it, and show only a preview
# instead of dumping the whole DataFrame as cell output.
anomalies = findAnomalies(logfileName, outliers_fraction, numCluster)
anomalies.head()