In [1]:
import os
import pandas as pd
import numpy as np
import glob
import pyarrow.feather as feather

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
scale = preprocessing.MinMaxScaler()
pd.set_option("display.max_rows", None)


def label_encoding(df, columns_list):
    for col in columns_list:
        df[col] = le.fit_transform(df[col])
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        with open('Label_mapping_GQUIC.txt', 'a') as data:
            data.write(col+": "+str(le_name_mapping) + "\n")
    return df


def data_scale(df):
    df.loc[:, '0': '1459'] = df.loc[:, '0': '1459'].div(255)
    df[['length']] = scale.fit_transform(
        df[['length']])
    return df


def remove_protocol(df):
    df = df.loc[(df['protocol'] == 'GQUIC')]
    return df


def remove_columns(df):
    df = df.drop(['ip_proto', 'data'], axis=1)
    return df


def final_remove(df):
    df = df.drop(['ip_src', 'ip_dst', 'protocol', 'info'], axis=1)
    return df


def remove_ip(df, ip):
    df = df[(df['ip_src'].str[:7].str.contains(ip))]
    df = df[(df['ip_dst'].str[:7].str.contains(ip))]
    return df


# VoIP

In [2]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/GoogleHangout_VoIP')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_GoogleHangout_VoIP = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_GoogleHangout_VoIP = pd.read_csv(
        file, engine='pyarrow')
    _df_GoogleHangout_VoIP['Label'] = "GoogleHangout_VoIP"
    file_name = file.split('/')
    _df_GoogleHangout_VoIP['File_name'] = file_name[-1]
    df_GoogleHangout_VoIP = pd.concat(
        [df_GoogleHangout_VoIP, _df_GoogleHangout_VoIP], ignore_index=True)
del _df_GoogleHangout_VoIP


In [3]:
df_GoogleHangout_VoIP.columns

Index(['time_epoch', 'frame_number', 'stream_id', 'ip_src', 'ip_dst',
       'ip_proto', 'protocol', 'length', 'info', 'data',
       ...
       '1452', '1453', '1454', '1455', '1456', '1457', '1458', '1459', 'Label',
       'File_name'],
      dtype='object', length=1472)

In [6]:
df_GoogleHangout_VoIP['ip_src'].value_counts().nlargest(10)

192.168.0.108      120396
192.168.0.119       87257
192.168.43.5        78567
192.168.0.104       39753
192.168.0.103       31017
172.217.161.174     25487
172.217.24.206      18464
172.217.25.14       14578
172.217.31.238      13935
172.217.161.142     13652
Name: ip_src, dtype: int64

In [7]:
df_GoogleHangout_VoIP['ip_dst'].value_counts().nlargest(10)

192.168.0.108      128796
192.168.0.119       94162
192.168.43.5        84809
192.168.0.104       42511
192.168.0.103       36263
172.217.161.174     19921
74.125.23.127       17237
172.217.24.206      15088
172.217.25.14       12351
172.217.31.238      11835
Name: ip_dst, dtype: int64

In [3]:
# df_GoogleHangout_VoIP = (df_GoogleHangout_VoIP.pipe(remove_protocol).pipe(remove_columns))


In [4]:
# df_GoogleHangout_VoIP = (df_GoogleHangout_VoIP.pipe(remove_ip,"192.168|172.217").pipe(final_remove))

In [5]:
# df_GoogleHangout_VoIP.to_csv(
#     "GQUIC_csv/df_GoogleHangout_VoIP.csv", index=False)


In [8]:
del df_GoogleHangout_VoIP


# Music

In [9]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/Google_PlayMusic')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_Google_PlayMusic = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_Google_PlayMusic = pd.read_csv(
        file, engine='pyarrow')
    # print(file)
    _df_Google_PlayMusic['Label'] = "Google_PlayMusic"
    file_name = file.split('/')
    _df_Google_PlayMusic['File_name'] = file_name[-1]
    df_Google_PlayMusic = pd.concat(
        [df_Google_PlayMusic, _df_Google_PlayMusic])
del _df_Google_PlayMusic


In [10]:
df_Google_PlayMusic['ip_src'].value_counts().nlargest(10)

172.31.1.227      272391
172.31.46.140      16004
172.217.9.194       9863
172.217.13.66       9280
172.217.12.226      8983
172.217.15.98       8628
172.217.7.142       8260
172.217.15.66       7274
172.217.13.78       7099
216.58.217.110      7077
Name: ip_src, dtype: int64

In [11]:
df_Google_PlayMusic['ip_dst'].value_counts().nlargest(10)

172.31.1.227      347886
172.31.46.140      19217
172.217.9.194       8872
172.217.13.66       8312
172.217.12.226      8132
172.217.15.98       8035
172.217.7.142       7899
216.58.217.110      6891
172.217.13.78       6671
172.217.15.66       6567
Name: ip_dst, dtype: int64

In [7]:
# df_Google_PlayMusic = (df_Google_PlayMusic.pipe(remove_protocol).pipe(remove_columns))

# df_Google_PlayMusic = (df_Google_PlayMusic.pipe(remove_ip,"172.31|172.217").pipe(final_remove))

In [9]:
# df_Google_PlayMusic.to_csv("GQUIC_csv/df_Google_PlayMusic.csv", index=False)

In [12]:

del df_Google_PlayMusic


# File Transfer

In [13]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/FileTransfer')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_FileTransfer = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_FileTransfer = pd.read_csv(file, engine='pyarrow')
    # print(file)
    _df_FileTransfer['Label'] = "FileTransfer"
    file_name = file.split('/')
    _df_FileTransfer['File_name'] = file_name[-1]
    df_FileTransfer = pd.concat([df_FileTransfer, _df_FileTransfer])
del _df_FileTransfer


In [14]:
df_FileTransfer['ip_src'].value_counts().nlargest(10)

192.168.10.30    63416
192.168.10.35    23448
192.168.10.32    13773
Name: ip_src, dtype: int64

In [15]:
df_FileTransfer['ip_dst'].value_counts().nlargest(10)

192.168.10.35    38072
192.168.10.30    37221
192.168.10.32    25344
Name: ip_dst, dtype: int64

In [11]:
# df_FileTransfer = df_FileTransfer.loc[(df_FileTransfer['protocol'] == 'GQUIC')]
# df_FileTransfer = df_FileTransfer.pipe(remove_columns).pipe(final_remove)

In [13]:
# df_FileTransfer.to_csv("GQUIC_csv/df_FileTranfer.csv", index=False)


In [16]:
del df_FileTransfer


# Chat

In [14]:
# # get your working directory and target folder that contains all your files
# path = os.path.join(os.getcwd(), 'csv/gquic/GoogleHangout_Chat')

# files = [os.path.join(path, i) for i in os.listdir(
#     path) if os.path.isfile(os.path.join(path, i))]

# df_GoogleHangout_Chat = pd.DataFrame()

# # for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
# for file in files:
#     _df_GoogleHangout_Chat = pd.read_csv(file, engine='pyarrow')
#     # print(file)
#     _df_GoogleHangout_Chat['Label'] = "GoogleHangout_Chat"
#     file_name = file.split('/')
#     _df_GoogleHangout_Chat['File_name'] = file_name[-1]
#     df_GoogleHangout_Chat = pd.concat([df_GoogleHangout_Chat,_df_GoogleHangout_Chat])
# del _df_GoogleHangout_Chat


In [15]:
# df_GoogleHangout_Chat = df_GoogleHangout_Chat.pipe(remove_protocol).pipe(remove_columns)

In [16]:
# df_GoogleHangout_Chat = df_GoogleHangout_Chat.pipe(remove_ip,"192.168|172.217|216.58").pipe(final_remove)

In [17]:
# df_GoogleHangout_Chat.to_csv(
#     "GQUIC_csv/df_GoogleHangout_Chat.csv", index=False)
# del df_GoogleHangout_Chat


# Youtube

In [17]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/Youtube')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_Youtube = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_Youtube = pd.read_csv(file, engine='pyarrow')
    # print(file)
    file_name = file.split('/')
    _df_Youtube['File_name'] = file_name[-1]
    _df_Youtube['Label'] = "Youtube"
    df_Youtube = pd.concat([df_Youtube,_df_Youtube])
del _df_Youtube

In [18]:
df_Youtube['ip_src'].value_counts().nlargest(10)

192.168.0.107      144515
192.168.10.31      105741
192.168.10.35       72660
172.217.161.162     17160
216.58.220.194      14629
216.58.197.98       14368
172.217.31.226      14243
172.217.161.161      9510
216.58.197.100       9104
216.58.197.99        8552
Name: ip_src, dtype: int64

In [19]:
df_Youtube['ip_dst'].value_counts().nlargest(10)

192.168.0.107      182087
192.168.10.31      132942
192.168.10.35       98995
172.217.161.162     16108
172.217.31.226      13551
216.58.197.98       13444
216.58.220.194      12560
172.217.24.194       7723
216.58.200.2         7648
216.58.197.99        7254
Name: ip_dst, dtype: int64

In [19]:
# df_Youtube = df_Youtube.pipe(remove_protocol).pipe(remove_columns)
# df_Youtube = df_Youtube.pipe(remove_ip,"192.168|172.217|216.58").pipe(final_remove)

In [None]:
# df_Youtube.to_csv(
#     "GQUIC_csv/df_Youtube.csv", index=False)


In [20]:
del df_Youtube

# Final

In [4]:
path = os.path.join(os.getcwd(),  '/home/onos/FL/Data Processing/GQUIC_csv')
all_files = glob.glob(os.path.join(path, "*.csv"))
df_GQUIC = pd.concat((pd.read_csv(f,engine='pyarrow') for f in all_files), ignore_index=True)


In [None]:
# remove_rows = df_GQUIC[(df_GQUIC['Label'] == 'GoogleHangout_Chat') | (
#     df_GQUIC['Label'] == 'GoogleHangout_VoIP')].index
# df_GQUIC.drop(remove_rows, inplace=True)


In [None]:
# import matplotlib.pyplot as plt


In [None]:
# fig = plt.figure(figsize=(10, 6))
# fig.suptitle("Thống kê nhãn trên tập GQUIC")
# df_GQUIC.Label.value_counts().plot(kind="barh")
# plt.savefig('image/GQUIC_label.png')


In [None]:
# df_GQUIC.Label.value_counts()


In [None]:
# (df_GQUIC.Label.value_counts()/20).round(0)

In [None]:
# fig = plt.figure(figsize=(10, 6))
# fig.suptitle("Thống kê nhãn trên tập GQUIC theo flow")
# (df_GQUIC.Label.value_counts()/20).round(0).plot(kind="barh")
# plt.savefig('image/GQUIC_label_flow.png')

In [None]:
# df_GQUIC_small = df_GQUIC.sample(n=100)
# df_GQUIC_small.to_csv("GQUIC_data_small.csv", index=False)


In [None]:
# df_GQUIC.head()

In [None]:
# df_GQUIC[df_GQUIC['stream_id']==3]['File_name'].value_counts()

In [5]:
df_sorted = df_GQUIC.sort_values(['stream_id', 'File_name'])
df_sorted['flow_id'] = (df_sorted.groupby(['stream_id', 'File_name']).cumcount()==0).astype(int)

df_sorted['flow_id'] = df_sorted['flow_id'].cumsum()

In [None]:
# def most_frequent(List):
#     return max(set(List), key = List.count)

In [None]:
# result=df_sorted.groupby('flow_id')['Label'].apply(list).to_dict()
# flow_label = []
# for flow in result:
#     flow_label.append(most_frequent(result[flow]))

In [6]:
df_sorted['flow_id'].nunique()

83439

In [None]:
df_sorted = (df_sorted.pipe(label_encoding, ['Label']).pipe(data_scale))

In [7]:
from sklearn.model_selection import train_test_split

train, test_case = train_test_split(df_sorted, test_size=0.2, stratify=df_sorted['flow_id'])

train = train.drop(['time_epoch', 'frame_number', 'File_name','stream_id'],axis=1)
train, test = train_test_split(train, test_size=0.2)
train = train.drop(['flow_id'],axis=1)

train.to_csv("GQUIC_data.csv", index=False)

test = test.sort_values(['flow_id'])
test.to_csv("GQUIC_test.csv", index=False)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
first_column = test.pop('flow_id')
test.insert(0, 'flow_id', first_column)

In [None]:
data_bytes = ['9','31','63','127','255','511','1023']

In [None]:
for each_byte in data_bytes:
    data_train = pd.concat([train.loc[:, : each_byte],train.iloc[:,-1:]],axis=1)
    data_test = pd.concat([test.loc[:, : each_byte ],test.iloc[:,-1:]],axis=1)
    each_byte = str(int(each_byte)+1)
    data_train_name = "GQUIC_data_" + each_byte +".csv"
    data_test_name = "GQUIC_test_" + each_byte +".csv"
    data_train.to_csv(data_train_name,index=False)
    data_test.to_csv(data_test_name,index=False)

In [None]:
test_case = test_case[['time_epoch', 'frame_number', 'stream_id', 'length', 'Label', 'File_name', 'flow_id']]
split = 5
total_data_count = test_case.shape[0]
data_per_set = int(np.floor(total_data_count/split))
DataFrameDict = {}
for i in range(1, split+1):
    client_name = "client_" + str(i)
    start = data_per_set * (i-1)
    end = data_per_set * i

    print(f"Adding data from {start} to {end} for client : {client_name}")
    DataFrameDict[client_name] = test_case[start:end]
for df in DataFrameDict.keys():
    csv_path = "gquic_test_case/"+df + ".csv"
    DataFrameDict[df].to_csv(csv_path,index=False)

Adding data from 0 to 53306 for client : client_1
Adding data from 53306 to 106612 for client : client_2
Adding data from 106612 to 159918 for client : client_3
Adding data from 159918 to 213224 for client : client_4
Adding data from 213224 to 266530 for client : client_5


In [None]:
# train.to_csv("GQUIC_data.csv", index=False)
# test.to_csv("GQUIC_data_test.csv",index=False)


In [None]:
# train.to_csv('GQUIC_data_2label.csv', index=False)
# test.to_csv('GQUIC_test_2label.csv', index=False)


In [None]:
# index = 0
# with pd.read_csv("GQUIC_data.csv", chunksize=1000000) as reader:
#     for data in reader:
#         path = '/home/onos/FL/Data Processing/GQUIC_arrow/GQUIC_data({}).arrow'.format(index)
#         feather.write_feather(data, path)
#         index += 1


In [None]:
# df_GQUIC.info()
# df_GQUIC.head(2)
# df_GQUIC = df_GQUIC.astype('float16')
# df_GQUIC.head(2)


In [None]:
# df_GQUIC.info()


In [None]:
# df_GQUIC["Label"] = df_GQUIC["Label"].astype("int8")


In [None]:
# df_GQUIC.info()


In [None]:
# df = pd.read_csv("iris.csv")


In [None]:
# df.info()


In [None]:
# df = (df.pipe(label_encoding, ['variety']))


In [None]:
# df.info()


In [None]:
# df.to_csv("iris.csv")
