In [1]:
import os
import pandas as pd
import numpy as np
import glob
import pyarrow.feather as feather
from pyarrow import csv
import random

from sklearn import preprocessing
# Shared label encoder; label_encoding() below refits it once per column.
le = preprocessing.LabelEncoder()
# Shared min-max scaler; in the visible code it is only referenced from a
# commented-out line inside data_scale().
scale = preprocessing.MinMaxScaler()
# Lift pandas' row-display cap so displayed frames are not truncated.
pd.set_option("display.max_rows", None)


def train_test_split(df, frac=0.2, random_state=None):
    """Split a table into train/test parts without splitting any flow.

    Whole flows are sampled rather than individual rows, so all rows that
    share a ``flow_id`` end up on the same side of the split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``flow_id`` column.
    frac : float, default 0.2
        Fraction of the distinct ``flow_id`` values assigned to the test set.
    random_state : int or other seed accepted by ``Series.sample``, optional
        Pass a fixed value to make the split reproducible. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Row selections of ``df``; original index labels are preserved.
    """
    test_flows = df['flow_id'].drop_duplicates().sample(
        frac=frac, random_state=random_state)
    # Build the membership mask once and reuse it for both halves.
    in_test = df['flow_id'].isin(test_flows)
    return df[~in_test], df[in_test]


def label_encoding(df, columns_list):
    """Integer-encode the listed columns of ``df`` and log each mapping.

    Every column named in ``columns_list`` is overwritten with the codes
    produced by the module-level ``le`` encoder, and one
    ``"<column>: {class: code, ...}"`` line per column is appended to
    ``Label_mapping_GQUIC.txt``.

    Returns the same ``df`` object that was passed in (it is mutated).
    """
    for column in columns_list:
        df[column] = le.fit_transform(df[column])
        # Rebuild the class -> code table from the encoder that was just fitted.
        class_codes = le.transform(le.classes_)
        mapping = dict(zip(le.classes_, class_codes))
        # Append mode: lines written earlier (including previous runs) are kept.
        with open('Label_mapping_GQUIC.txt', 'a') as mapping_file:
            mapping_file.write(column + ": " + str(mapping) + "\n")
    return df


def index_reset(df):
    """Return a fresh ``RangeIndex`` running from 0 to ``len(df) - 1``.

    ``df`` is not modified; the caller assigns the result to ``df.index``
    if the existing index should be replaced.
    """
    row_count = len(df.index)
    return pd.RangeIndex(start=0, stop=row_count)


def data_scale(df):
    """Divide the columns labelled ``'0'`` through ``'1459'`` by 255.

    The columns are selected by label slice, so every column positioned
    between ``'0'`` and ``'1459'`` (both inclusive) is scaled; presumably
    these hold raw byte values in 0..255 (confirm against the data source).
    Other columns, e.g. ``length``, are left untouched.

    ``df`` is modified in place and also returned.
    """
    columns = df.columns
    byte_cols = columns[columns.slice_indexer('0', '1459')]
    # Replace the columns outright rather than writing through ``.loc``:
    # an in-place ``.loc`` write of float results into integer-typed columns
    # relies on silent upcasting (deprecated in recent pandas) and can leave
    # columns whose results happen to be integral with an integer dtype.
    df[byte_cols] = df[byte_cols].div(255)
    return df


def remove_protocol(df):
    """Keep only the rows whose ``protocol`` column equals ``'GQUIC'``.

    ``df`` itself is left untouched; the filtered selection is returned
    with its original index labels.
    """
    is_gquic = df['protocol'].eq('GQUIC')
    return df[is_gquic]


def remove_columns(df):
    """Return ``df`` without the ``ip_proto``, ``data`` and ``length`` columns.

    The input frame is not modified. A ``KeyError`` is raised by pandas if
    any of the three columns is missing.
    """
    unwanted = ['ip_proto', 'data', 'length']
    return df.drop(columns=unwanted)


def final_remove(df):
    """Return ``df`` without the ``ip_src``, ``ip_dst``, ``protocol`` and ``info`` columns.

    The input frame is not modified. A ``KeyError`` is raised by pandas if
    any of the four columns is missing.
    """
    unwanted = ['ip_src', 'ip_dst', 'protocol', 'info']
    return df.drop(columns=unwanted)

def get_static(df):
    """Print how many rows fall below fixed thresholds, per direction column.

    For each of the columns ``'A->B'``, ``'B->A'`` and ``'total'`` and each
    threshold in (1000, 550, 500, 450, 100), one line is printed with the
    percentage and count of rows whose value is strictly below the
    threshold, followed by the number of remaining rows.

    Nothing is returned; the report goes to standard output.
    """
    thresholds = (1000, 550, 500, 450, 100)
    row_total = df.shape[0]
    for column in ('A->B', 'B->A', 'total'):
        print("Cột {}: ".format(column))
        values = df[column]
        for limit in thresholds:
            # Rows strictly below the limit; missing values are not counted.
            below = values[values < limit].count()
            remaining = row_total - below
            share = round((below / float(row_total)) * 100, 2)
            print("<{} chiếm {}% ({}/{}) con lai {})".format(
                limit, share, below, row_total, remaining))

# VoIP

In [2]:
# Folder (relative to the working directory) holding the summary CSVs of this class.
path = os.path.join(os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/GoogleHangout_VoIP')

# Regular files only; sorted because os.listdir returns entries in arbitrary order.
files = [os.path.join(path, i) for i in sorted(os.listdir(
    path)) if os.path.isfile(os.path.join(path, i))]

# Read each file once, tag its rows with the class label and the source file
# name, then concatenate in a single call. Concatenating inside the loop
# re-copies all accumulated rows on every iteration.
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="GoogleHangout_VoIP",
        File_name=os.path.basename(file),
    )
    for file in files
]
# An empty folder yields an empty frame instead of an error.
df_GoogleHangout_VoIP = pd.concat(
    frames, ignore_index=True) if frames else pd.DataFrame()
del frames


In [3]:
# Sort so that all rows of one (stream_id, File_name) pair are adjacent.
# df_GoogleHangout_VoIP is deliberately NOT deleted here: a later cell still
# writes it to CSV (and deletes it afterwards), so removing it at this point
# breaks a top-to-bottom run with a NameError.
df_sorted = df_GoogleHangout_VoIP.sort_values(['stream_id', 'File_name'])
# The first row of every pair is flagged with 1, all others with 0; the
# running sum of the flags gives each pair a 1-based flow id.
df_sorted['flow_id'] = (
    df_sorted.groupby(['stream_id', 'File_name'])
    .cumcount()
    .eq(0)
    .astype(int)
    .cumsum()
)

In [4]:
# Count the distinct flow ids assigned to df_sorted in the previous cell.
df_sorted['flow_id'].nunique()

73303

In [13]:
# Persist the combined VoIP table, then release it to free memory.
# The output folder is created first because to_csv does not create
# missing directories.
os.makedirs("GQUIC_sumary", exist_ok=True)
df_GoogleHangout_VoIP.to_csv(
    "GQUIC_sumary/sumary_GoogleHangout_VoIP.csv", index=False)
del df_GoogleHangout_VoIP


# Music

In [14]:
# Folder (relative to the working directory) holding the summary CSVs of this class.
path = os.path.join(os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/Google_PlayMusic')

# Regular files only; sorted because os.listdir returns entries in arbitrary order.
files = [os.path.join(path, i) for i in sorted(os.listdir(
    path)) if os.path.isfile(os.path.join(path, i))]

# Read each file once, tag its rows with the class label and the source file
# name, then concatenate in a single call. Concatenating inside the loop
# re-copies all accumulated rows on every iteration.
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="Google_PlayMusic",
        File_name=os.path.basename(file),
    )
    for file in files
]
# An empty folder yields an empty frame instead of an error.
df_Google_PlayMusic = pd.concat(frames) if frames else pd.DataFrame()
del frames


In [15]:
# Persist the combined music table, then release it to free memory.
# The output folder is created first because to_csv does not create
# missing directories.
os.makedirs("GQUIC_sumary", exist_ok=True)
df_Google_PlayMusic.to_csv("GQUIC_sumary/sumary_Google_PlayMusic.csv", index=False)
del df_Google_PlayMusic


# File Transfer

In [16]:
# Folder (relative to the working directory) holding the summary CSVs of this class.
path = os.path.join(os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/FileTransfer')

# Regular files only; sorted because os.listdir returns entries in arbitrary order.
files = [os.path.join(path, i) for i in sorted(os.listdir(
    path)) if os.path.isfile(os.path.join(path, i))]

# Read each file once, tag its rows with the class label and the source file
# name, then concatenate in a single call. Concatenating inside the loop
# re-copies all accumulated rows on every iteration.
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="FileTransfer",
        File_name=os.path.basename(file),
    )
    for file in files
]
# An empty folder yields an empty frame instead of an error.
df_FileTransfer = pd.concat(frames) if frames else pd.DataFrame()
del frames


In [17]:
# Persist the combined file-transfer table, then release it to free memory.
# The output folder is created first because to_csv does not create
# missing directories.
os.makedirs("GQUIC_sumary", exist_ok=True)
df_FileTransfer.to_csv("GQUIC_sumary/sumary_FileTranfer.csv", index=False)
del df_FileTransfer


# Youtube

In [18]:
# Folder (relative to the working directory) holding the summary CSVs of this class.
path = os.path.join(
    os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/Youtube')

# Regular files only; sorted because os.listdir returns entries in arbitrary order.
files = [os.path.join(path, i) for i in sorted(os.listdir(
    path)) if os.path.isfile(os.path.join(path, i))]

# Read each file once, tag its rows with the source file name and the class
# label, then concatenate in a single call. Concatenating inside the loop
# re-copies all accumulated rows on every iteration.
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        File_name=os.path.basename(file),
        Label="Youtube",
    )
    for file in files
]
# An empty folder yields an empty frame instead of an error.
df_Youtube = pd.concat(frames) if frames else pd.DataFrame()
del frames


In [19]:
# Persist the combined Youtube table, then release it to free memory.
# The output folder is created first because to_csv does not create
# missing directories.
os.makedirs("GQUIC_sumary", exist_ok=True)
df_Youtube.to_csv(
    "GQUIC_sumary/sumary_Youtube.csv", index=False)
del df_Youtube

# Final

## Gộp dữ liệu (merge the per-class summaries)

In [25]:
# Read the per-class CSVs back from the same working-directory-relative
# folder the earlier cells wrote them to. The previous absolute second
# component made os.path.join discard os.getcwd(), tying the notebook to one
# machine's directory layout.
path = os.path.join(os.getcwd(), 'GQUIC_sumary')
# glob returns matches in arbitrary order; sort for a reproducible row order.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
df_GQUIC = pd.concat((pd.read_csv(f,engine='pyarrow') for f in all_files), ignore_index=True)


In [26]:
# Sort so that all rows of one (stream_id, File_name) pair are adjacent,
# then drop the unsorted frame to free memory.
df_sorted = df_GQUIC.sort_values(['stream_id', 'File_name'])
del df_GQUIC
# The first row of every pair is flagged with 1, all others with 0; the
# running sum of the flags gives each pair a 1-based flow id.
df_sorted['flow_id'] = (
    df_sorted.groupby(['stream_id', 'File_name'])
    .cumcount()
    .eq(0)
    .astype(int)
    .cumsum()
)

In [28]:
# Replace the post-sort index with a plain 0..n-1 range (via the helper
# defined in the first cell), write the table to Feather, then release it.
df_sorted.index = index_reset(df_sorted)
df_sorted.to_feather('sumary.feather')
del df_sorted