In [7]:
import os
import pandas as pd
import numpy as np
import glob
import pyarrow.feather as feather
from pyarrow import csv
import random

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
scale = preprocessing.MinMaxScaler()
pd.set_option("display.max_rows", None)


def train_test_split(df, frac=0.2):
    selected = df['flow_id'].drop_duplicates().sample(frac=frac)
    test = df[df['flow_id'].isin(selected)]
    train = df[~df['flow_id'].isin(selected)]
    return train, test


def label_encoding(df, columns_list):
    for col in columns_list:
        df[col] = le.fit_transform(df[col])
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        with open('Label_mapping_GQUIC.txt', 'a') as data:
            data.write(col+": "+str(le_name_mapping) + "\n")
    return df


def index_reset(df):
    return pd.RangeIndex(len(df.index))


def data_scale(df):
    df.loc[:, '0': '1459'] = df.loc[:, '0': '1459'].div(255)
    # df[['length']] = scale.fit_transform(
    #     df[['length']])
    return df


def remove_protocol(df):
    df = df.loc[(df['protocol'] == 'GQUIC')]
    return df


def remove_columns(df):
    df = df.drop(['ip_proto', 'data', 'length'], axis=1)
    return df


def final_remove(df):
    df = df.drop(['ip_src', 'ip_dst', 'protocol', 'info'], axis=1)
    return df

def get_static(df):
    for col in ['A->B', 'B->A','total']:
        print("Cột {}: ".format(col))
        for num in [1000,550,500,450,100]:
            total_flow = df.shape[0]
            count = df[col][df[col] < num].count()
            remain = total_flow - count
            percent = round((count/float(df.shape[0])) * 100,2)
            print("<{} chiếm {}% ({}/{}) con lai {})".format(num,percent,count,df.shape[0],remain))

In [8]:

# get your working directory and target folder that contains all your files
for nhan in ['VoIP','FileTransfer','Music','Youtube']:
    path = os.path.join(os.getcwd(), '/home/onos/FL/Data_Processing/Summary/Final/' + nhan)

    files = [os.path.join(path, i) for i in os.listdir(
        path) if os.path.isfile(os.path.join(path, i))]

    df = pd.DataFrame()

    # for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
    for file in files:
        _df = pd.read_feather(file)
        _df['Label'] = nhan
        file_name = file.split('/')
        _df['File_name'] = file_name[-1]
        df = pd.concat([df, _df], ignore_index=True)
    del _df
    df.index = index_reset(df)
    df.to_feather("GQUIC_sumary/sumary_" + nhan +".feather")
    del df


# Final

## Gop du lieu

In [9]:
path = os.path.join(os.getcwd(),  '/home/onos/FL/Data_Processing/GQUIC_sumary')
all_files = glob.glob(os.path.join(path, "*.feather"))
df_GQUIC = pd.concat((pd.read_feather(f) for f in all_files), ignore_index=True)


In [11]:
df_GQUIC.shape

(138388, 15)

In [13]:
df_GQUIC.head(10)

Unnamed: 0,A_ip,A_port,B_ip,B_port,A->B_pkt,A->B_byt,B->A_pkt,B->A_byt,total_pkt,total_byt,relative_start,duration,stream_id,Label,File_name
0,192.168.0.107,53870.0,42.114.77.81,443.0,14659,20365083,7428,622110,22087,20987193,0.0,135.1031,0,Youtube,youtube_00014_20180326181737_summary.feather
1,192.168.0.107,44260.0,216.58.203.14,443.0,18,1483,21,5506,39,6989,2.413825,78.1412,1,Youtube,youtube_00014_20180326181737_summary.feather
2,192.168.0.107,51050.0,172.217.161.142,443.0,746,347276,675,193211,1421,540487,110.423556,1396.3382,2,Youtube,youtube_00014_20180326181737_summary.feather
3,192.168.0.107,41701.0,172.217.161.142,443.0,44,48778,31,4543,75,53321,259.634866,18.4731,3,Youtube,youtube_00014_20180326181737_summary.feather
4,192.168.0.107,53621.0,42.112.8.205,443.0,22190,30820480,11308,951428,33498,31771908,259.772847,190.7131,4,Youtube,youtube_00014_20180326181737_summary.feather
5,192.168.0.107,60541.0,216.58.220.194,443.0,9,2474,10,2771,19,5245,260.056942,15.337,5,Youtube,youtube_00014_20180326181737_summary.feather
6,192.168.0.107,50796.0,172.217.161.162,443.0,14,10426,16,6864,30,17290,260.065318,0.4553,6,Youtube,youtube_00014_20180326181737_summary.feather
7,192.168.0.107,34980.0,216.58.220.194,443.0,24,6959,28,15467,52,22426,260.403153,4.9089,7,Youtube,youtube_00014_20180326181737_summary.feather
8,192.168.0.107,38032.0,42.112.11.141,443.0,4,1606,3,1553,7,3159,261.173487,0.1563,8,Youtube,youtube_00014_20180326181737_summary.feather
9,192.168.0.107,54735.0,42.114.77.13,443.0,4,1609,4,1627,8,3236,261.184305,0.1471,9,Youtube,youtube_00014_20180326181737_summary.feather


In [15]:
df_GQUIC['File_name'] = df_GQUIC['File_name'].str.replace(r'_summary.feather','',regex=True)

  df_GQUIC['File_name'] = df_GQUIC['File_name'].str.replace(r'_summary.feather','')


In [16]:
df_GQUIC.head(10)

Unnamed: 0,A_ip,A_port,B_ip,B_port,A->B_pkt,A->B_byt,B->A_pkt,B->A_byt,total_pkt,total_byt,relative_start,duration,stream_id,Label,File_name
0,192.168.0.107,53870.0,42.114.77.81,443.0,14659,20365083,7428,622110,22087,20987193,0.0,135.1031,0,Youtube,youtube_00014_20180326181737
1,192.168.0.107,44260.0,216.58.203.14,443.0,18,1483,21,5506,39,6989,2.413825,78.1412,1,Youtube,youtube_00014_20180326181737
2,192.168.0.107,51050.0,172.217.161.142,443.0,746,347276,675,193211,1421,540487,110.423556,1396.3382,2,Youtube,youtube_00014_20180326181737
3,192.168.0.107,41701.0,172.217.161.142,443.0,44,48778,31,4543,75,53321,259.634866,18.4731,3,Youtube,youtube_00014_20180326181737
4,192.168.0.107,53621.0,42.112.8.205,443.0,22190,30820480,11308,951428,33498,31771908,259.772847,190.7131,4,Youtube,youtube_00014_20180326181737
5,192.168.0.107,60541.0,216.58.220.194,443.0,9,2474,10,2771,19,5245,260.056942,15.337,5,Youtube,youtube_00014_20180326181737
6,192.168.0.107,50796.0,172.217.161.162,443.0,14,10426,16,6864,30,17290,260.065318,0.4553,6,Youtube,youtube_00014_20180326181737
7,192.168.0.107,34980.0,216.58.220.194,443.0,24,6959,28,15467,52,22426,260.403153,4.9089,7,Youtube,youtube_00014_20180326181737
8,192.168.0.107,38032.0,42.112.11.141,443.0,4,1606,3,1553,7,3159,261.173487,0.1563,8,Youtube,youtube_00014_20180326181737
9,192.168.0.107,54735.0,42.114.77.13,443.0,4,1609,4,1627,8,3236,261.184305,0.1471,9,Youtube,youtube_00014_20180326181737


In [26]:
df_sorted = df_GQUIC.sort_values(['stream_id', 'File_name'])
del df_GQUIC
df_sorted['flow_id'] = (df_sorted.groupby(['stream_id', 'File_name']).cumcount()==0).astype(int)
df_sorted['flow_id'] = df_sorted['flow_id'].cumsum()

In [28]:
df_sorted.index = pd.RangeIndex(len(df_sorted.index))
df_sorted.to_feather('sumary.feather')
del df_sorted