In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from subprocess import check_output
import math

# Load the input CSV into the working DataFrame.
csv_path = './input/cs448b_ipasn.csv'
df = pd.read_csv(csv_path)

# data grouping

In [None]:
# Parse the date strings, then collapse the frame to one row per
# (date, l_ipn) pair by summing the remaining columns.
parsed_dates = pd.to_datetime(df['date'])
df = (
    df.assign(date=parsed_dates)
      .groupby(['date', 'l_ipn'], as_index=False)
      .sum()
)
df['date']

In [None]:
# Derive calendar features from the parsed date column:
# day of the year and day of the week.
date_parts = df['date'].dt
df = df.assign(yday=date_parts.dayofyear, wday=date_parts.dayofweek)

# visualization

In [None]:
# l_ipn 기준으로 그룹 묶기
# Group the rows by l_ipn: one sub-frame per local IP id 0..9, plus the
# largest 'f' value observed for each of them.
ip = [df[df['l_ipn'] == ip_id] for ip_id in range(0, 10)]
maxF = [np.max(frame['f']) for frame in ip]
ip[0].head()

In [None]:
# Histogram of the 'f' values of local IP 0, using 10 bins.
flows_ip0 = ip[0]['f']
count, division = np.histogram(flows_ip0, bins=10)
count

In [None]:
# Show the bin edges returned by np.histogram in the preceding cell.
division

In [None]:
# IP 별 f(날짜 별 connection 수) 분포도
# Distribution of f (connections per day) for each local IP.
# A 5x2 grid holds one histogram panel per IP.
f, axarray = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))
for i in range(10):
    # bins controls how many intervals the x axis is divided into;
    # the left edge of every bin is used as the label of its bar.
    count, division = np.histogram(ip[i]['f'], bins=10)
    row, col = divmod(i, 2)
    target_ax = axarray[row, col]
    g = sns.barplot(x=division[:-1], y=count, ax=target_ax)
    target_ax.set_title(f'Local IP {i} Flow')

In [None]:
# 1년간 connection 수 흐름
# Connection counts over the year for each local IP, with a green reference
# line drawn at mean + 3 standard deviations of that IP's flow.
f, axarray = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))
for i in range(10):
    row, col = divmod(i, 2)
    panel = axarray[row, col]
    days, flows = ip[i]['yday'], ip[i]['f']
    upper_bound = flows.mean() + 3 * flows.std()
    panel.plot(days, flows)
    panel.plot(days, [upper_bound] * len(days), color='g')

In [None]:
# Rebuild the per-IP frames for local IPs 0-4 straight from df, dropping a
# fixed number of trailing rows for some IPs, and refresh each IP's peak flow.
# The trim is expressed as .iloc[:-n] on the freshly filtered frame rather
# than through the length of the previous ip[i], so re-running this cell does
# not remove additional rows each time.
TAIL_ROWS_TO_DROP = {1: 5, 4: 7}  # local IP id -> number of trailing rows removed
for ip_id in range(5):
    frame = df[df['l_ipn'] == ip_id]
    n_drop = TAIL_ROWS_TO_DROP.get(ip_id, 0)
    if n_drop:
        frame = frame.iloc[:-n_drop]
    ip[ip_id] = frame
    # Recompute the peak after trimming so it always describes the rows that
    # are actually kept (previously maxF[4] was left at its pre-trim value).
    maxF[ip_id] = np.max(frame['f'])

# prediction

In [None]:
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [None]:
# 정규화
# Normalization: scale each IP's 'f' column by that IP's peak value maxF[i].
# The scaled column is attached with assign(), which builds a new frame
# instead of writing through .loc into a filtered slice of df (chained
# assignment), and stores a plain 1-D float column.
# NOTE(review): maxF is not changed here, so running this cell twice divides
# by the peak twice -- run it once per kernel session.
for i in range(10):
    peak = float(maxF[i])
    if peak == 0:
        # Same failure mode as dividing the values one by one in pure Python.
        raise ZeroDivisionError(f'peak flow for local IP {i} is zero')
    ip[i] = ip[i].assign(f=ip[i]['f'].astype('float64') / peak)

In [None]:
# feature 리스트와 우리 모델의 target을 만든다
# look_back: 지금으로부터 이전 몇 개의 데이터를 볼지 결정
def create_dataset(dataset, look_back=1):
    """Build sliding-window features and next-step targets for the model.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing an 'f' column; all columns are used as features.
    look_back : int, default 1
        Number of consecutive previous rows that make up one input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(dataX, dataY)``. ``dataX[j]`` holds rows ``j .. j+look_back-1`` of
        ``dataset`` (all columns) and ``dataY[j]`` is the 'f' value of row
        ``j+look_back``. Both are empty when ``len(dataset) <= look_back``.
    """
    dataX, dataY = [], []
    features = dataset.values
    targets = dataset['f'].values
    # There are exactly len(dataset) - look_back windows that still have a
    # following row to predict; stopping one earlier would discard the last
    # valid sample.
    for start in range(len(dataset) - look_back):
        stop = start + look_back
        dataX.append(features[start:stop])
        dataY.append(targets[stop])
    return np.array(dataX), np.array(dataY)

In [None]:
# RNN Train
def trainModel(data):
    """Fit a two-layer GRU regressor on the first ``look_back * 5`` rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'f' column (the prediction target) plus any further
        feature columns; every column is fed to the network as an input
        feature. The frame passed in is not modified.

    Returns
    -------
    keras.models.Sequential
        The fitted model: GRU(64) -> GRU(32) -> Dense(1), compiled with
        mean-squared-error loss and the 'sgd' optimizer.

    Notes
    -----
    ``look_back`` (the window length) is read from the notebook's global
    namespace and must be defined before this function is called.
    """
    # Cast on a new frame instead of assigning into the caller's DataFrame.
    data = data.assign(f=data['f'].astype('float32'))
    n_features = data.shape[1]

    # Only the leading look_back*5 rows are used for training.
    train = data[0:look_back*5]
    trainX, trainY = create_dataset(train, look_back)
    # Force the (samples, timesteps, features) layout; the feature count is
    # taken from the data rather than being fixed at 2.
    trainX = np.reshape(trainX, (trainX.shape[0], look_back, n_features))

    model = Sequential()
    model.add(GRU(64, input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True))
    model.add(GRU(32))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='sgd')
    model.fit(trainX, trainY, epochs=100, batch_size=16, verbose=0)
    return model

In [None]:
def predictFlow(_model, data):
    """Predict the value that follows every ``look_back``-row window of ``data``.

    Parameters
    ----------
    _model : object exposing ``predict(x)``
        Called once with an array of shape (windows, look_back, features);
        it must return one row per window, whose first element is the
        prediction.
    data : pandas.DataFrame
        Feature frame; every column is used as a model input.

    Returns
    -------
    list
        ``look_back`` leading zeros (placeholders for rows without a full
        history) followed by one prediction per remaining row of ``data``.
        Predictions are still in the scale of ``data``; rescaling is left to
        the caller.

    Notes
    -----
    ``look_back`` is read from the notebook's global namespace.
    """
    ypredFlow = [0] * look_back
    values = data.values
    n_windows = len(values) - look_back
    if n_windows <= 0:
        # Not enough rows for a single complete window.
        return ypredFlow

    # Stack all windows and predict them in a single call instead of invoking
    # the model once per row.
    windows = np.stack([values[k:k + look_back] for k in range(n_windows)])
    predictions = np.asarray(_model.predict(windows)).reshape(n_windows, -1)
    ypredFlow.extend(predictions[:, 0])
    # Return the list built here (the previous code returned an unrelated name).
    return ypredFlow

In [None]:
# Train one model per local IP on its ('f', 'wday') columns.
# The list is built directly; assigning m[i] on an empty list raises IndexError.
m = [trainModel(ip[i][['f', 'wday']].copy()) for i in range(10)]

f, axarray = plt.subplots(5, 2, figsize=(15, 20))

ypred, ipF = [], []
for i in range(10):
    # Scale predictions and observed flows back by the per-IP peak maxF[i]
    # (the builtin `max` is not subscriptable and was never the intended name).
    ypred.append(np.multiply(predictFlow(m[i], ip[i][['f', 'wday']].copy()), maxF[i]))
    ipF.append(np.multiply(ip[i]['f'], maxF[i]))

    row, col = math.floor(i/2), i%2
    # The x axis comes from the per-IP frame: ipF[i] is only the rescaled
    # flow Series and carries no 'yday' column.
    axarray[row, col].plot(ip[i]['yday'], ipF[i])
    axarray[row, col].plot(ip[i]['yday'], ypred[i], color='r')
    axarray[row, col].set_title(f"Local IP {i} Flow and prediction")
plt.show()

In [None]:
corr = []
for i in range(10):