In [None]:
# ----------------------------------------------------------------------------------------------------
# Premier League High Traffic IPs Identification
# 
# This notebook imports BigFlow traffic data at the IP level, normalizes the traffic and selects
# IPs with high volumes of traffic. These IPs are then cross-checked against Premier League fixtures
# to determine whether the high traffic is related to Premier League football.
#
# Author: Mohsen Mohammadi - DEC 2020
# Version: 2
# ----------------------------------------------------------------------------------------------------

import os
import re
import json
import glob
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import train_test_split


# Notebook-wide matplotlib styling. `size` is the base font size used for the legend,
# axis labels and titles; tick labels are drawn at three quarters of it.
size = 25
params = dict([
    ('legend.fontsize', size),
    ('figure.figsize', (25, 15)),
    ('axes.labelsize', size),
    ('axes.titlesize', size),
    ('xtick.labelsize', size*0.75),
    ('ytick.labelsize', size*0.75),
    ('axes.titlepad', 25),
])
plt.rcParams.update(params)



## Import Traffic Data

In [None]:
def preprocessing_fn(filename, st, et):
    """
    Identify IPs whose normalized traffic rises sharply inside a time window.

    The traffic file is pivoted into a (time of day x ip) table, low-volume IPs are
    dropped, every remaining IP is min-max scaled, smoothed with a rolling mean and
    differenced. IPs are then ranked by the number of sharp rises found in the rows
    between the positions derived from ``st`` and ``et``.

    :param filename: traffic filename (CSV with 'bf_date', 'bf_time', 'ip', 'gbps' columns)
    :param st: start time of the interpolation period (only .hour and .minute are read)
    :param et: end time of the interpolation period (only .hour and .minute are read)
    :return: pandas Index of suspicious pirating IPs, most sharp rises first; empty
             when no IP passes the volume filter
    """
    # Read required traffic datafile to a data-frame and reduce timestamps to time of day
    ip_traffic = pd.read_csv(filename, usecols=['bf_date', 'bf_time', 'ip', 'gbps'])
    timestamps = pd.to_datetime(ip_traffic['bf_date'] + ' ' + ip_traffic['bf_time'])
    ip_traffic['time'] = timestamps.dt.time
    ip_traffic = ip_traffic.drop(columns=['bf_date', 'bf_time'])

    # Pivot to one row per time of day and one column per IP, then keep only the IPs
    # whose peak value exceeds 0.02 (20 Mbps according to the original author's note).
    ip_pivot = pd.pivot_table(ip_traffic, values='gbps', index=['time'], columns='ip', aggfunc='sum')
    ip_pivot = ip_pivot.loc[:, ip_pivot.max() > 0.02].fillna(0)

    # Nothing to rank: return the (empty) column index instead of failing inside the scaler.
    if ip_pivot.shape[1] == 0:
        return ip_pivot.columns

    # Row positions of the window: 12 rows per hour, one per 5 minutes, plus one.
    # NOTE(review): this is positional, so it presumes the pivot holds every 5-minute
    # slot of the day starting at midnight -- confirm against the traffic export.
    s_idx = int(st.hour*12 + st.minute/5 + 1)
    e_idx = int(et.hour*12 + et.minute/5 + 1)

    # Normalized IP traffic: minmax_scale works column-wise, i.e. each IP independently
    ips_normalized = pd.DataFrame(
        preprocessing.minmax_scale(ip_pivot.to_numpy()),
        index=ip_pivot.index,
        columns=ip_pivot.columns,
    )

    # Smoothed IP traffic: 8-row trailing rolling mean, shifted 4 rows earlier
    ips_smoothed = (
        ips_normalized
        .groupby(ips_normalized.index).mean()
        .rolling(window=8).mean()
        .shift(periods=-4)
    )

    # Differentiating IP traffic to identify sharp rises or declines (4 rows apart,
    # i.e. 20 minutes at 5-minute sampling)
    ips_d = ips_smoothed.diff(periods=4)

    # Potential pirate IPs, sorted by the number of sharp rises (> 0.2) inside the window
    rise_counts = (ips_d.iloc[s_idx:e_idx] > .2).sum()
    rise_counts = rise_counts[rise_counts > 0].sort_values(ascending=False)

    return rise_counts.index



## Helper functions

In [None]:
def match_date(filename, year=2020):
    """
    Obtain the match date from the traffic filename.

    The first four characters of the file's base name are read as two 2-character
    fields and emitted in swapped order: characters 2-3 first, characters 0-1 second.

    :param filename: traffic filename; a leading directory path is ignored
    :param year: year appended to the date (defaults to 2020)
    :return: date of the games as the string '<chars 2-3>/<chars 0-1>/<year>'
    """
    # Work on the base name so that 'some/dir/0912.csv' behaves like '0912.csv'.
    stem = os.path.basename(filename)
    return '{}/{}/{}'.format(stem[2:4], stem[0:2], year)


# ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
def interpol_periods(date, fixtures_path='/home/jupyter/preprocessing/data/game_fixtures.csv'):
    """
    Produce interpolation periods for the given match date using game fixtures.

    The period starts 90 minutes before the kick-off of the first fixture listed for
    ``date`` and ends 180 minutes after the kick-off of the last fixture listed.

    :param date: input date, compared verbatim with the 'date' column of the fixtures file
    :param fixtures_path: CSV of game fixtures; needs at least 'date' and 'ko_time' columns
    :return: start time and end time of the interpolation (pandas Timestamps)
    :raises IndexError: if the fixtures file has no row for ``date``
    """
    fixtures = pd.read_csv(fixtures_path)
    day_fixtures = fixtures[fixtures['date'] == date]
    if day_fixtures.empty:
        raise IndexError('no fixtures found for date {!r} in {}'.format(date, fixtures_path))

    # First and last are taken in file order.
    # NOTE(review): presumes the fixtures file is sorted by kick-off time -- confirm.
    first_ko = pd.to_datetime(day_fixtures['ko_time'].iloc[0])
    last_ko = pd.to_datetime(day_fixtures['ko_time'].iloc[-1])

    st = first_ko - dt.timedelta(minutes=90)
    et = last_ko + dt.timedelta(minutes=180)

    return st, et


# ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
def tagger_fn(filename, ip_pirate_list, ip_dbl, ip_bl):
    """
    Create a target column for the traffic file based on normal(0), suspicious(1) and blocked(2) classes.

    The traffic file is pivoted to one row per (ip, bf_date) and one column per
    bf_time, with missing readings filled with 0. A 'target' column is inserted as
    the first column and set from the three IP lists; when an IP appears in several
    lists the later assignment wins, so ``ip_bl`` takes precedence.

    :param filename: traffic filename (CSV with 'bf_date', 'bf_time', 'ip', 'gbps' columns)
    :param ip_pirate_list: list of suspicious pirating IPs (tagged 1)
    :param ip_dbl: don't black list IPs from FMTS (tagged 1)
    :param ip_bl: black list IPs from FMTS (tagged 2)
    :return: traffic file ready for the ML training
    """
    ip_traffic = pd.read_csv(filename, usecols=['bf_date', 'bf_time', 'ip', 'gbps'])
    ip_traffic = pd.pivot_table(
        ip_traffic, values='gbps', index=['ip', 'bf_date'], columns='bf_time', aggfunc='sum'
    ).fillna(0)

    ip_traffic.insert(loc=0, column='target', value=0)

    # Assign through .loc: the chained form frame['target'][mask] = x writes to a
    # temporary under pandas Copy-on-Write and would leave every row tagged 0.
    row_ips = ip_traffic.index.get_level_values('ip')
    ip_traffic.loc[row_ips.isin(ip_pirate_list), 'target'] = 1
    ip_traffic.loc[row_ips.isin(ip_dbl), 'target'] = 1
    ip_traffic.loc[row_ips.isin(ip_bl), 'target'] = 2

    return ip_traffic
    

# ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
def total_traffic_fn(ip_traffic, ip_list):
    """
    Sum the traffic of each requested IP.
    :param ip_traffic: traffic data per IP, with 'ip', 'time' and 'gbps' columns
    :param ip_list: list of IPs
    :return: list of [ip, total gbps] pairs, in the order of ip_list
    """
    def _ip_total(address):
        # Rows of this IP, indexed by their timestamp
        rows = ip_traffic[ip_traffic['ip'] == address].set_index('time')
        # Collapse to one value per time of day, then add those up
        per_slot = rows.gbps.groupby(rows.index.time).sum()
        return per_slot.sum()

    return [[address, _ip_total(address)] for address in ip_list]


# ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
def downsample(df, n_samples=2000000, random_state=123):
    """
    Down-sample the majority class (target == 0) and drop the first 96 columns.

    Rows with target 1 and 2 are kept in full. At most ``n_samples`` rows with
    target 0 are kept, drawn without replacement; if fewer are available, all of
    them are kept instead of raising.

    :param df: pivotted traffic data with a 'target' column
    :param n_samples: maximum number of majority-class rows to keep (default 2000000)
    :param random_state: seed for the sampling, for reproducible results (default 123)
    :return: the down-sampled data without its first 96 columns
    """
    # Separate majority and minority classes
    df_majority = df[df.target == 0]
    df_minority1 = df[df.target == 1]
    df_minority2 = df[df.target == 2]

    # Downsample majority class. Sampling without replacement cannot return more
    # rows than exist, so cap the request at the size of the majority class.
    n_keep = min(n_samples, len(df_majority))
    if n_keep > 0:
        df_majority_downsampled = resample(df_majority,
                                           replace=False,              # sample without replacement
                                           n_samples=n_keep,
                                           random_state=random_state)  # reproducible results
    else:
        df_majority_downsampled = df_majority.iloc[:0]

    # Combine minority classes with the downsampled majority class
    df_downsampled = pd.concat([df_majority_downsampled, df_minority1, df_minority2])

    # Drop the first 96 columns (96 five-minute slots = midnight till 8am).
    # NOTE(review): this is positional. If 'target' is the first column, as it is in
    # the output of tagger_fn, it is dropped here together with only 95 time slots --
    # confirm the layout of the frames passed in.
    df_downsampled = df_downsampled.drop(columns=df_downsampled.columns[0:96])

    return df_downsampled


# ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
def input_eval_fn(filename):
    """
    Prepare the input data to evaluate the Keras model.

    The traffic file is pivoted to one row per IP (sorted by IP) and one column per
    5-minute time of day. Slots missing from the file are added as zeros, the first
    96 slots (00:00-07:55) are dropped and every row is z-normalized.

    :param filename: traffic filename for model evaluation (CSV with 'time', 'ip', 'gbps' columns)
    :return: numpy array of shape (number of IPs, 192, 1) for the ML evaluation
    """
    ip_traffic = pd.read_csv(filename, usecols=['time', 'ip', 'gbps'])
    ip_traffic['time'] = pd.to_datetime(ip_traffic['time']).dt.time
    ip_traffic = pd.pivot_table(ip_traffic, values='gbps', index=['ip'], columns='time', aggfunc='sum').fillna(0)

    # Align the columns on the full 5-minute grid of the day; slots absent from the
    # file become all-zero columns. The reindexed frame is used directly: merging it
    # back with the pivot would discard the ip index, re-sort the rows by traffic
    # values and duplicate rows whenever two IPs share an identical profile.
    # Readings that do not fall on the 5-minute grid are not part of the output.
    col_list = pd.date_range('00:00', '23:55', freq='5min').time
    ip_traffic = ip_traffic.reindex(columns=col_list, fill_value=0)

    # Remove the traffic from midnight till 8am (96 five-minute slots)
    ip_traffic = ip_traffic.iloc[:, 96:]

    # z-norm per row; rows with zero spread keep a divisor of 1 and so become all zeros
    x_eval = ip_traffic.to_numpy(dtype=float)
    std_ = x_eval.std(axis=1, keepdims=True)
    std_[std_ == 0] = 1.0
    x_eval = (x_eval - x_eval.mean(axis=1, keepdims=True)) / std_

    # Univariate series: add a trailing dimension to make it multivariate with one dimension
    x_eval = x_eval.reshape((x_eval.shape[0], x_eval.shape[1], 1))

    return x_eval



## Import FMTS (black list / non black list) IPs

In [None]:
# FMTS lists, each reduced to an array of distinct IP addresses.
ip_bl = pd.read_csv('/home/jupyter/preprocessing/data/bl_gw1_9.csv', usecols=['ip'])['ip'].unique()
ip_dbl = pd.read_csv('/home/jupyter/preprocessing/data/dbl_gw1_9.csv', usecols=['ip'])['ip'].unique()




## Run multiple days of traffic data

In [None]:
%%time

pathname = '/home/jupyter/preprocessing/data/'

os.chdir(pathname)
filenames = [i for i in glob.glob('*.{}'.format('csv'))]
filenames = [i for i in filenames if re.match("[0-9]+.csv", i)]

count = 0
ips = pd.DataFrame()
for fname in filenames:
    date = match_date(fname)
    st, et = interpol_periods(date)    
    ip_pirate_list = preprocessing_fn(fname, st, et)
    ip_traffic = tagger_fn(fname, ip_pirate_list, ip_dbl, ip_bl)
    ips = pd.concat([ips, ip_traffic], ignore_index=True)
    count =+ 1
    print('traffic datafile no{} "{}" processing is completed.'.format(count, fname))


ips_train, ips_test = train_test_split(ips, test_size=0.3)
ips_train.to_pickle('/home/jupyter/preprocessing/data/ML/IP_TRAIN.pickle')
ips_test.to_pickle('/home/jupyter/preprocessing/data/ML/IP_TEST.pickle')

