In [71]:
import os
import pandas as pd
import numpy as np
import glob
import pyarrow.feather as feather
from pyarrow import csv
import random

from sklearn import preprocessing
# Shared label encoder used by label_encoding(), which refits it for every column.
le = preprocessing.LabelEncoder()
# Min-max scaler; in the visible cells it is only referenced from commented-out code.
scale = preprocessing.MinMaxScaler()
# Lift pandas' row cap so displayed frames/series are never truncated.
pd.set_option("display.max_rows", None)


def train_test_split(df, frac=0.2, random_state=None):
    """Split ``df`` into train and test parts without splitting any flow.

    A random fraction ``frac`` of the distinct ``flow_id`` values is drawn.
    Every row whose ``flow_id`` was drawn goes to the test part; all other
    rows go to the train part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``flow_id`` column.
    frac : float, default 0.2
        Fraction of distinct flows to place in the test part.
    random_state : int or None, default None
        Seed forwarded to ``Series.sample``. ``None`` keeps the previous
        unseeded behaviour; pass an int to make the split reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, each keeping the original index labels.
    """
    selected = df['flow_id'].drop_duplicates().sample(
        frac=frac, random_state=random_state)
    # Compute membership once and reuse it for both halves.
    in_test = df['flow_id'].isin(selected)
    return df[~in_test], df[in_test]


def label_encoding(df, columns_list, mapping_path='Label_mapping_GQUIC.txt'):
    """Integer-encode the given columns in place and log each mapping.

    Each column in ``columns_list`` is replaced by the codes produced by the
    module-level ``LabelEncoder`` ``le`` (refit per column), and one line
    ``"<col>: {label: code, ...}"`` is appended to ``mapping_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    columns_list : iterable of str
        Names of the columns to encode.
    mapping_path : str, default 'Label_mapping_GQUIC.txt'
        Text file the label->code mappings are appended to.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with ``.pipe``.
    """
    for col in columns_list:
        df[col] = le.fit_transform(df[col])
        # The encoder gives classes_[i] the code i. Building the mapping from
        # plain Python values (tolist / enumerate) keeps the logged text free
        # of NumPy scalar reprs, which differ between NumPy versions.
        le_name_mapping = {
            label: code for code, label in enumerate(le.classes_.tolist())}
        # Append mode: earlier runs' lines are kept, so clear the file
        # manually if a fresh log is wanted.
        with open(mapping_path, 'a') as data:
            data.write(col + ": " + str(le_name_mapping) + "\n")
    return df


def index_reset(df):
    """Build a fresh ``RangeIndex`` (0 .. n-1) sized to ``df``'s index.

    ``df`` itself is not modified; callers assign the result to ``df.index``.
    """
    n_rows = len(df.index)
    return pd.RangeIndex(n_rows)


def data_scale(df):
    """Divide the columns labelled '0' through '1459' by 255, in place.

    Returns the same ``df`` object so the function can be chained with
    ``.pipe``. Columns outside that label range are left untouched.
    """
    byte_columns = slice('0', '1459')
    scaled = df.loc[:, byte_columns] / 255
    df.loc[:, byte_columns] = scaled
    # Min-max scaling of the 'length' column with `scale` used to live here
    # and is currently disabled.
    return df


def remove_protocol(df):
    """Return only the rows whose ``protocol`` column equals ``'GQUIC'``."""
    is_gquic = df['protocol'].eq('GQUIC')
    return df.loc[is_gquic]


def remove_columns(df):
    """Return a copy of ``df`` without ``ip_proto``, ``data`` and ``length``.

    Columns that are not present are skipped instead of raising ``KeyError``,
    so the helper can also be piped over frames that do not carry these
    columns (they are then returned with their columns unchanged).
    """
    return df.drop(columns=['ip_proto', 'data', 'length'], errors='ignore')


def final_remove(df):
    """Return a copy of ``df`` without ``ip_src``, ``ip_dst``, ``protocol``
    and ``info``.

    Columns that are not present are skipped instead of raising ``KeyError``,
    so the helper can also be piped over frames that do not carry these
    columns (they are then returned with their columns unchanged).
    """
    return df.drop(
        columns=['ip_src', 'ip_dst', 'protocol', 'info'], errors='ignore')

def get_static(df, columns=('A->B', 'B->A', 'total'),
               thresholds=(1000, 550, 500, 450, 100)):
    """Print how many rows of each column fall below each threshold.

    For every column a header line is printed, followed by one line per
    threshold giving the percentage of rows strictly below it, the matching
    row count out of the total, and the number of remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to summarise.
    columns : iterable of str, default ('A->B', 'B->A', 'total')
        Columns to report on.
    thresholds : iterable of numbers, default (1000, 550, 500, 450, 100)
        Upper bounds (exclusive) to count against.

    Returns
    -------
    None
        The report is written to stdout.
    """
    total_flow = df.shape[0]  # loop-invariant, so computed once
    for col in columns:
        print("Cột {}: ".format(col))
        values = df[col]
        for num in thresholds:
            count = values[values < num].count()
            remain = total_flow - count
            # An empty frame would divide by zero; report 0.0% instead.
            if total_flow:
                percent = round((count / float(total_flow)) * 100, 2)
            else:
                percent = 0.0
            print("<{} chiếm {}% ({}/{}) con lai {})".format(num,percent,count,total_flow,remain))

# VoIP

In [2]:
# Folder (under the current working directory) holding the summary CSVs
# for the GoogleHangout_VoIP class.
path = os.path.join(os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/GoogleHangout_VoIP')

# Regular files only; sorted so the row order does not depend on the
# filesystem's listing order.
files = sorted(os.path.join(path, i) for i in os.listdir(path)
               if os.path.isfile(os.path.join(path, i)))

# Read every file once, tag it with its class label and source file name,
# then concatenate in a single call (growing a frame with concat inside the
# loop re-copies all earlier rows on every iteration).
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="GoogleHangout_VoIP",
        File_name=os.path.basename(file))
    for file in files
]
# An empty folder yields an empty frame instead of an error.
df_GoogleHangout_VoIP = (
    pd.concat(frames, ignore_index=True) if frames else pd.DataFrame())
del frames


In [10]:
# Show the column names of the combined VoIP frame.
df_GoogleHangout_VoIP.columns

Index(['l4_proto', 'stream_id', 'A->B', 'B->A', 'total', 'Label', 'File_name'], dtype='object')

In [6]:
# Show (rows, columns) of the combined VoIP frame.
df_GoogleHangout_VoIP.shape

(73303, 7)

In [51]:
# Handle on the 'total' column; not referenced again in the visible cells.
test_column = df_GoogleHangout_VoIP['total']

In [9]:
# Row count of the VoIP frame; not referenced again in the visible cells.
total_rows = df_GoogleHangout_VoIP.shape[0]

In [72]:
# Print, for the A->B, B->A and total columns, the share of rows below each threshold.
get_static(df_GoogleHangout_VoIP)

Cột A->B: 
<1000 chiếm 97.27% (71301/73303) con lai 2002)
<550 chiếm 97.21% (71258/73303) con lai 2045)
<500 chiếm 97.18% (71238/73303) con lai 2065)
<450 chiếm 97.17% (71227/73303) con lai 2076)
<100 chiếm 92.52% (67820/73303) con lai 5483)
Cột B->A: 
<1000 chiếm 99.81% (73162/73303) con lai 141)
<550 chiếm 98.32% (72069/73303) con lai 1234)
<500 chiếm 98.26% (72026/73303) con lai 1277)
<450 chiếm 98.18% (71971/73303) con lai 1332)
<100 chiếm 93.79% (68754/73303) con lai 4549)
Cột total: 
<1000 chiếm 97.17% (71225/73303) con lai 2078)
<550 chiếm 96.25% (70554/73303) con lai 2749)
<500 chiếm 96.15% (70480/73303) con lai 2823)
<450 chiếm 96.07% (70421/73303) con lai 2882)
<100 chiếm 86.4% (63334/73303) con lai 9969)


In [7]:
# Number of rows whose 'total' is strictly below 500.
df_GoogleHangout_VoIP.loc[df_GoogleHangout_VoIP['total'].lt(500), 'total'].count()

70480

In [58]:
# Number of rows whose 'total' is strictly above 1000.
df_GoogleHangout_VoIP.loc[df_GoogleHangout_VoIP['total'].gt(1000), 'total'].count()

2078

In [None]:
# Drop the columns targeted by remove_columns() and final_remove().
# NOTE(review): the column listing printed for this frame earlier in the
# notebook contains none of the columns those helpers target — confirm this
# step still applies to the summary data.
df_GoogleHangout_VoIP = (df_GoogleHangout_VoIP.pipe(remove_columns).pipe(final_remove))

In [None]:
# Write the class frame (without the index) into the relative GQUIC_csv/
# folder, then release it from the kernel's memory.
df_GoogleHangout_VoIP.to_csv(
    "GQUIC_csv/df_GoogleHangout_VoIP.csv", index=False)
del df_GoogleHangout_VoIP


# Music

In [42]:
# Folder (under the current working directory) holding the summary CSVs
# for the Google_PlayMusic class.
path = os.path.join(os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/Google_PlayMusic')

# Regular files only; sorted so the row order does not depend on the
# filesystem's listing order.
files = sorted(os.path.join(path, i) for i in os.listdir(path)
               if os.path.isfile(os.path.join(path, i)))

# Read every file once, tag it with its class label and source file name,
# then concatenate in a single call (growing a frame with concat inside the
# loop re-copies all earlier rows on every iteration).
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="Google_PlayMusic",
        File_name=os.path.basename(file))
    for file in files
]
# ignore_index gives a unique 0..n-1 index instead of repeating each file's
# own row numbers; an empty folder yields an empty frame instead of an error.
df_Google_PlayMusic = (
    pd.concat(frames, ignore_index=True) if frames else pd.DataFrame())
del frames


In [70]:
# Print, for the A->B, B->A and total columns, the share of rows below each threshold.
get_static(df_Google_PlayMusic)

Cột A->B: 
<1000 chiếm 90.24% (33306/36909) con lai 3603)
<550 chiếm 89.17% (32911/36909) con lai 3998)
<500 chiếm 89.11% (32888/36909) con lai 4021)
<100 chiếm 87.34% (32238/36909) con lai 4671)
Cột B->A: 
<1000 chiếm 98.91% (36506/36909) con lai 403)
<550 chiếm 95.21% (35142/36909) con lai 1767)
<500 chiếm 94.3% (34807/36909) con lai 2102)
<100 chiếm 88.38% (32622/36909) con lai 4287)
Cột total: 
<1000 chiếm 90.05% (33236/36909) con lai 3673)
<550 chiếm 88.69% (32736/36909) con lai 4173)
<500 chiếm 88.19% (32551/36909) con lai 4358)
<100 chiếm 86.54% (31940/36909) con lai 4969)


In [None]:
# Drop the columns targeted by remove_columns() and final_remove().
# NOTE(review): confirm the summary frame actually carries those columns;
# the VoIP frame loaded the same way earlier in the notebook lists none of them.
df_Google_PlayMusic = (df_Google_PlayMusic.pipe(remove_columns).pipe(final_remove))

In [None]:
# Write the class frame (without the index) into the relative GQUIC_csv/
# folder, then release it from the kernel's memory.
df_Google_PlayMusic.to_csv("GQUIC_csv/df_Google_PlayMusic.csv", index=False)
del df_Google_PlayMusic


# File Transfer

In [24]:
# Folder (under the current working directory) holding the summary CSVs
# for the FileTransfer class.
path = os.path.join(os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/FileTransfer')

# Regular files only; sorted so the row order does not depend on the
# filesystem's listing order.
files = sorted(os.path.join(path, i) for i in os.listdir(path)
               if os.path.isfile(os.path.join(path, i)))

# Read every file once, tag it with its class label and source file name,
# then concatenate in a single call (growing a frame with concat inside the
# loop re-copies all earlier rows on every iteration).
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="FileTransfer",
        File_name=os.path.basename(file))
    for file in files
]
# ignore_index gives a unique 0..n-1 index instead of repeating each file's
# own row numbers; an empty folder yields an empty frame instead of an error.
df_FileTransfer = (
    pd.concat(frames, ignore_index=True) if frames else pd.DataFrame())
del frames


In [64]:
# Print, for the A->B, B->A and total columns, the share of rows below each threshold.
get_static(df_FileTransfer)

Cột A->B: 
<1000 chiếm 96.59% (4873/5045) con lai 172)
<500 chiếm 88.68% (4474/5045) con lai 571)
<100 chiếm 3.9% (197/5045) con lai 4848)
Cột B->A: 
<1000 chiếm 99.31% (5010/5045) con lai 35)
<500 chiếm 96.61% (4874/5045) con lai 171)
<100 chiếm 4.62% (233/5045) con lai 4812)
Cột total: 
<1000 chiếm 92.59% (4671/5045) con lai 374)
<500 chiếm 11.77% (594/5045) con lai 4451)
<100 chiếm 3.39% (171/5045) con lai 4874)


In [None]:
# Drop the columns targeted by remove_columns() and final_remove().
# NOTE(review): confirm the summary frame actually carries those columns;
# the VoIP frame loaded the same way earlier in the notebook lists none of them.
df_FileTransfer = df_FileTransfer.pipe(remove_columns).pipe(final_remove)

In [None]:
# Write the class frame (without the index) into the relative GQUIC_csv/
# folder, then release it from the kernel's memory.
# NOTE(review): the output name is spelled "FileTranfer" (missing "s"). The
# merge step globs *.csv, so the file is still picked up; if it is renamed,
# delete the old file first or both copies would be merged.
df_FileTransfer.to_csv("GQUIC_csv/df_FileTranfer.csv", index=False)
del df_FileTransfer


# Chat

In [None]:
# # get your working directory and target folder that contains all your files
# path = os.path.join(os.getcwd(), 'csv/gquic/GoogleHangout_Chat')

# files = [os.path.join(path, i) for i in os.listdir(
#     path) if os.path.isfile(os.path.join(path, i))]

# df_GoogleHangout_Chat = pd.DataFrame()

# # for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
# for file in files:
#     _df_GoogleHangout_Chat = pd.read_csv(file, engine='pyarrow')
#     # print(file)
#     _df_GoogleHangout_Chat['Label'] = "GoogleHangout_Chat"
#     file_name = file.split('/')
#     _df_GoogleHangout_Chat['File_name'] = file_name[-1]
#     df_GoogleHangout_Chat = pd.concat([df_GoogleHangout_Chat,_df_GoogleHangout_Chat])
# del _df_GoogleHangout_Chat


In [None]:
# df_GoogleHangout_Chat = df_GoogleHangout_Chat.pipe(remove_columns)

In [None]:
# df_GoogleHangout_Chat = df_GoogleHangout_Chat.pipe(remove_ip,"192.168|172.217|216.58").pipe(final_remove)

In [None]:
# df_GoogleHangout_Chat.to_csv(
#     "GQUIC_csv/df_GoogleHangout_Chat.csv", index=False)
# del df_GoogleHangout_Chat


# Youtube

In [49]:
# Folder (under the current working directory) holding the summary CSVs
# for the Youtube class.
path = os.path.join(
    os.getcwd(), 'rawds-summary/rawds/NetFlow-QUIC1/Summary/Youtube')

# Regular files only; sorted so the row order does not depend on the
# filesystem's listing order.
files = sorted(os.path.join(path, i) for i in os.listdir(path)
               if os.path.isfile(os.path.join(path, i)))

# Read every file once, tag it with its class label and source file name,
# then concatenate in a single call (growing a frame with concat inside the
# loop re-copies all earlier rows on every iteration). The two tag columns
# are added as Label, File_name — the same order the other class loaders use.
frames = [
    pd.read_csv(file, engine='pyarrow').assign(
        Label="Youtube",
        File_name=os.path.basename(file))
    for file in files
]
# ignore_index gives a unique 0..n-1 index instead of repeating each file's
# own row numbers; an empty folder yields an empty frame instead of an error.
df_Youtube = (
    pd.concat(frames, ignore_index=True) if frames else pd.DataFrame())
del frames


In [65]:
# Print, for the A->B, B->A and total columns, the share of rows below each threshold.
get_static(df_Youtube)

Cột A->B: 
<1000 chiếm 88.91% (43615/49056) con lai 5441)
<500 chiếm 88.45% (43391/49056) con lai 5665)
<100 chiếm 86.62% (42494/49056) con lai 6562)
Cột B->A: 
<1000 chiếm 89.43% (43872/49056) con lai 5184)
<500 chiếm 88.75% (43536/49056) con lai 5520)
<100 chiếm 87.56% (42953/49056) con lai 6103)
Cột total: 
<1000 chiếm 88.57% (43449/49056) con lai 5607)
<500 chiếm 88.08% (43210/49056) con lai 5846)
<100 chiếm 84.87% (41632/49056) con lai 7424)


In [None]:
# Drop the columns targeted by remove_columns() and final_remove().
# NOTE(review): confirm the summary frame actually carries those columns;
# the VoIP frame loaded the same way earlier in the notebook lists none of them.
df_Youtube = df_Youtube.pipe(remove_columns).pipe(final_remove)

In [None]:
# Write the class frame (without the index) into the relative GQUIC_csv/
# folder, then release it from the kernel's memory.
df_Youtube.to_csv(
    "GQUIC_csv/df_Youtube.csv", index=False)
del df_Youtube

# Final

## Gop du lieu

In [1]:
# Merge the per-class CSVs into one frame. The class sections save them to the
# relative "GQUIC_csv/" folder, so read from that folder under the current
# working directory. (The previous absolute second argument made
# os.path.join discard os.getcwd() and tied the cell to one machine.)
path = os.path.join(os.getcwd(), 'GQUIC_csv')
# Sorted so the row order of the merged frame is the same on every run.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
df_GQUIC = pd.concat((pd.read_csv(f,engine='pyarrow') for f in all_files), ignore_index=True)


NameError: name 'glob' is not defined

In [None]:
# Order rows by stream id, then source file, so rows sharing both keys are
# adjacent; the unsorted frame is released afterwards.
df_sorted = df_GQUIC.sort_values(['stream_id', 'File_name'])
del df_GQUIC

In [4]:
# Step 1: mark the first row of every (stream_id, File_name) group with 1
# and all other rows with 0.
df_sorted['flow_id'] = (df_sorted.groupby(['stream_id', 'File_name']).cumcount()==0).astype(int)
# Step 2: a running sum of those marks gives an id that increases by one at
# each group's first row. This only yields one id per group if the rows of a
# group are adjacent, i.e. the frame is sorted by the same two keys.
# NOTE(review): groups are keyed on stream_id and File_name only — confirm
# file names cannot repeat across different Label classes.
df_sorted['flow_id'] = df_sorted['flow_id'].cumsum()

In [5]:
# Replace the index with 0..n-1 before writing (presumably because
# to_feather needs a default index — confirm), persist, then free the frame.
df_sorted.index = pd.RangeIndex(len(df_sorted.index))
df_sorted.to_feather('sorted.feather')
del df_sorted

## Loc flow <20 packet

In [7]:
# Reload the sorted frame and summarise how many rows each flow_id has.
df_sorted = pd.read_feather('sorted.feather')
value_c = df_sorted['flow_id'].value_counts()
print(value_c.describe())
# Count the flows that have fewer than 20 rows.
print('So flow co packet < 20: ', value_c[value_c < 20].count())

In [16]:
# Flows need at least this many rows (packets) to be kept.
MIN_PACKETS_PER_FLOW = 20

# Percentage of flows that will be dropped, computed from the data instead
# of literal counts typed in from an earlier run.
packets_per_flow = df_sorted['flow_id'].value_counts()
n_short_flows = int((packets_per_flow < MIN_PACKETS_PER_FLOW).sum())
if len(packets_per_flow):
    print(round(n_short_flows / len(packets_per_flow) * 100), '%')

# Keep only rows whose flow has enough packets; .copy() detaches the result
# from df_sorted so the large frame can be freed.
sub_df = df_sorted[
    df_sorted.groupby('flow_id').flow_id.transform('count')
    >= MIN_PACKETS_PER_FLOW].copy()
del df_sorted, packets_per_flow
sub_df.index = pd.RangeIndex(len(sub_df.index))
sub_df.to_feather('sub_df.feather')
del sub_df

47.335115220689154

## Chuan hoa

In [7]:
# Reload the filtered frame written by the flow-filtering step.
df_sorted = pd.read_feather('sub_df.feather')

In [8]:
# Draw the rows of a random 1% of the flows as a small sample; the remaining
# part returned by train_test_split is discarded.
_,small_sample = train_test_split(df_sorted,frac=0.01)

In [9]:
# Save the sample (without the index) for inspection outside the notebook.
small_sample.to_csv("data_sample.csv",index=False)

In [8]:
# Replace the 'Label' column with integer codes; label_encoding also appends
# the label->code mapping to a text file.
df_sorted = (df_sorted.pipe(label_encoding, ['Label']))

In [9]:
# Persist the label-encoded frame so later cells can start from this file.
df_sorted.to_feather('df_with_label_encoding.feather')

# Free the frame; the next section reloads it from disk.
del df_sorted

NameError: name 'sorted' is not defined

## Chia du lieu cho test case

In [3]:
# Reload the label-encoded frame written in the previous section.
df_sorted = pd.read_feather('df_with_label_encoding.feather')

In [4]:
# Hold out the rows of a random 20% of the flows as `test_case`; the rest
# becomes `training_data`.
training_data, test_case = train_test_split(df_sorted, 0.2)
# Number of distinct flows in each part (held-out first).
print(test_case['flow_id'].nunique())
print(training_data['flow_id'].nunique())

14581
58323


In [5]:
# The full frame is no longer needed once it has been split.
del df_sorted

In [6]:
# Drop the bookkeeping columns from the training part; flow_id is kept for
# now because the next split groups rows by it.
training_data = training_data.drop(['time_epoch', 'frame_number', 'File_name','stream_id'],axis=1)
# Second flow-level split: 20% of the remaining flows go to `test`.
train, test = train_test_split(training_data, frac=0.2)
del training_data
# The train file is written without flow_id and with a fresh 0..n-1 index.
train = train.drop(['flow_id'],axis=1)
train.index = index_reset(train)
train.to_feather('GQUIC_data.feather')
del train
# The test file keeps flow_id and is ordered by it before writing.
test = test.sort_values(['flow_id'])
test.index = index_reset(test)
test.to_feather("GQUIC_test.feather")
del test
# NOTE(review): `test_case` from the earlier split is not written anywhere in
# the active cells — only commented-out code further down references it.

In [None]:
# data_bytes = ['9','31','63','127','255','511','1023']
# for each_byte in data_bytes:
#     data_train = pd.concat([df_sorted.loc[:, : each_byte],df_sorted.iloc[:,-1:]],axis=1)
#     # data_test = pd.concat([df_sorted.loc[:, : each_byte ],df_sorted.iloc[:,-1:]],axis=1)
#     each_byte = str(int(each_byte)+1)
#     data_train_name = "GQUIC_data_" + each_byte +".csv"
#     # data_test_name = "GQUIC_test_" + each_byte +".csv"
#     data_train.to_csv(data_train_name,index=False)
#     # data_test.to_csv(data_test_name,index=False)

In [None]:
# first_column = test.pop('flow_id')
# test.insert(0, 'flow_id', first_column)

In [None]:
# del train,test,df_sorted,data_train,data_test

In [None]:
# test_case = test_case[['time_epoch', 'frame_number', 'stream_id', 'length', 'Label', 'File_name', 'flow_id']]
# split = 5
# total_data_count = test_case.shape[0]
# data_per_set = int(np.floor(total_data_count/split))
# DataFrameDict = {}
# for i in range(1, split+1):
#     client_name = "client_" + str(i)
#     start = data_per_set * (i-1)
#     end = data_per_set * i

#     print(f"Adding data from {start} to {end} for client : {client_name}")
#     DataFrameDict[client_name] = test_case[start:end]
# for df in DataFrameDict.keys():
#     csv_path = "gquic_test_case/"+df + ".csv"
#     DataFrameDict[df].to_csv(csv_path,index=False)

In [None]:
# train.to_csv("GQUIC_data.csv", index=False)
# test.to_csv("GQUIC_data_test.csv",index=False)


In [None]:
# train.to_csv('GQUIC_data_2label.csv', index=False)
# test.to_csv('GQUIC_test_2label.csv', index=False)


In [None]:
# index = 0
# with pd.read_csv("GQUIC_data.csv", chunksize=1000000) as reader:
#     for data in reader:
#         path = '/home/onos/FL/Data Processing/GQUIC_arrow/GQUIC_data({}).arrow'.format(index)
#         feather.write_feather(data, path)
#         index += 1


In [None]:
# df_GQUIC.info()
# df_GQUIC.head(2)
# df_GQUIC = df_GQUIC.astype('float16')
# df_GQUIC.head(2)


In [None]:
# df_GQUIC.info()


In [None]:
# df_GQUIC["Label"] = df_GQUIC["Label"].astype("int8")


In [None]:
# df_GQUIC.info()


In [None]:
# df = pd.read_csv("iris.csv")


In [None]:
# df.info()


In [None]:
# df = (df.pipe(label_encoding, ['variety']))


In [None]:
# df.info()


In [None]:
# df.to_csv("iris.csv")
