In [1]:
import os
import pandas as pd
import numpy as np
import glob
import pyarrow.feather as feather
from pyarrow import csv
import random

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
scale = preprocessing.MinMaxScaler()
pd.set_option("display.max_rows", None)


def train_test_split(df, frac=0.2):
    selected = df['flow_id'].drop_duplicates().sample(frac=frac)
    test = df[df['flow_id'].isin(selected)]
    train = df[~df['flow_id'].isin(selected)]
    return train, test


def label_encoding(df, columns_list):
    for col in columns_list:
        df[col] = le.fit_transform(df[col])
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        with open('Label_mapping_GQUIC.txt', 'a') as data:
            data.write(col+": "+str(le_name_mapping) + "\n")
    return df


def index_reset(df):
    return pd.RangeIndex(len(df.index))


def data_scale(df):
    df.loc[:, '0': '1459'] = df.loc[:, '0': '1459'].div(255)
    # df[['length']] = scale.fit_transform(
    #     df[['length']])
    return df


def remove_protocol(df):
    df = df.loc[(df['protocol'] == 'GQUIC')]
    return df


def remove_columns(df):
    df = df.drop(['ip_proto', 'data', 'length'], axis=1)
    return df


def final_remove(df):
    df = df.drop(['ip_src', 'ip_dst', 'protocol', 'info'], axis=1)
    return df

def get_static(df):
    for col in ['A->B', 'B->A','total']:
        print("Cột {}: ".format(col))
        for num in [1000,550,500,450,100]:
            total_flow = df.shape[0]
            count = df[col][df[col] < num].count()
            remain = total_flow - count
            percent = round((count/float(df.shape[0])) * 100,2)
            print("<{} chiếm {}% ({}/{}) con lai {})".format(num,percent,count,df.shape[0],remain))

# VoIP

In [2]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/GoogleHangout_VoIP')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_GoogleHangout_VoIP = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_GoogleHangout_VoIP = pd.read_csv(
        file, engine='pyarrow')
    _df_GoogleHangout_VoIP['Label'] = "GoogleHangout_VoIP"
    file_name = file.split('/')
    _df_GoogleHangout_VoIP['File_name'] = file_name[-1]
    df_GoogleHangout_VoIP = pd.concat(
        [df_GoogleHangout_VoIP, _df_GoogleHangout_VoIP], ignore_index=True)
del _df_GoogleHangout_VoIP


In [79]:
df_GoogleHangout_VoIP = (df_GoogleHangout_VoIP.pipe(remove_columns).pipe(final_remove))

In [80]:
df_GoogleHangout_VoIP.index = index_reset(df_GoogleHangout_VoIP)
df_GoogleHangout_VoIP.to_feather(
    "GQUIC_csv/df_GoogleHangout_VoIP.feather", )
del df_GoogleHangout_VoIP


# Music

In [81]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/Google_PlayMusic')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_Google_PlayMusic = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_Google_PlayMusic = pd.read_csv(
        file, engine='pyarrow')
    # print(file)
    _df_Google_PlayMusic['Label'] = "Google_PlayMusic"
    file_name = file.split('/')
    _df_Google_PlayMusic['File_name'] = file_name[-1]
    df_Google_PlayMusic = pd.concat(
        [df_Google_PlayMusic, _df_Google_PlayMusic])
del _df_Google_PlayMusic


In [82]:
df_Google_PlayMusic = (df_Google_PlayMusic.pipe(remove_columns).pipe(final_remove))

In [83]:
df_Google_PlayMusic.index = index_reset(df_Google_PlayMusic)
df_Google_PlayMusic.to_feather("GQUIC_csv/df_Google_PlayMusic.feather", )
del df_Google_PlayMusic


# File Transfer

In [84]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/FileTransfer')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_FileTransfer = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_FileTransfer = pd.read_csv(file, engine='pyarrow')
    # print(file)
    _df_FileTransfer['Label'] = "FileTransfer"
    file_name = file.split('/')
    _df_FileTransfer['File_name'] = file_name[-1]
    df_FileTransfer = pd.concat([df_FileTransfer, _df_FileTransfer])
del _df_FileTransfer


In [85]:
df_FileTransfer = df_FileTransfer.pipe(remove_columns).pipe(final_remove)

In [86]:
df_FileTransfer.index = index_reset(df_FileTransfer)
df_FileTransfer.to_feather("GQUIC_csv/df_FileTranfer.feather", )
del df_FileTransfer


# Youtube

In [87]:
# get your working directory and target folder that contains all your files
path = os.path.join(os.getcwd(), 'csv/gquic/Youtube')

files = [os.path.join(path, i) for i in os.listdir(
    path) if os.path.isfile(os.path.join(path, i))]

df_Youtube = pd.DataFrame()

# for every file in folder, read it and append to a empty dataframe with column filename as 'Label'
for file in files:
    _df_Youtube = pd.read_csv(file, engine='pyarrow')
    # print(file)
    file_name = file.split('/')
    _df_Youtube['File_name'] = file_name[-1]
    _df_Youtube['Label'] = "Youtube"
    df_Youtube = pd.concat([df_Youtube,_df_Youtube])
del _df_Youtube

In [88]:
df_Youtube = df_Youtube.pipe(remove_columns).pipe(final_remove)

In [89]:
df_Youtube.index = index_reset(df_Youtube)
df_Youtube.to_feather(
    "GQUIC_csv/df_Youtube.feather", )
del df_Youtube

# Final

## Gop du lieu

In [92]:
path = os.path.join(os.getcwd(),  '/home/onos/FL/Data_Processing/GQUIC_csv')
all_files = glob.glob(os.path.join(path, "*.feather"))
df_GQUIC = pd.concat((pd.read_feather(f) for f in all_files), ignore_index=True)


In [93]:
df_sorted = df_GQUIC.sort_values(['stream_id', 'File_name'])
del df_GQUIC
df_sorted['flow_id'] = (df_sorted.groupby(['stream_id', 'File_name']).cumcount()==0).astype(int)
df_sorted['flow_id'] = df_sorted['flow_id'].cumsum()

In [95]:
df_sorted.index = pd.RangeIndex(len(df_sorted.index))
df_sorted.to_feather('GQUIC_full.feather')
del df_sorted

## Loc flow <500 packet

In [2]:
df_sorted = pd.read_feather('GQUIC_full.feather')


In [3]:
df_summary = pd.read_feather('sumary.feather')

In [7]:
# df_summary['File_name'] = df_summary['File_name'].str.replace(r'_summary','')

In [9]:
# df_summary.to_feather('sumary.feather')
# del df_summary

In [4]:
df_summary.drop(['flow_id'],axis=1,inplace=True)

In [5]:
df_sorted['temp'] = df_sorted['stream_id'].astype(str)+df_sorted['File_name'].astype(str)
df_summary['temp'] = df_summary['stream_id'].astype(str)+df_summary['File_name'].astype(str)


In [6]:
df_summary = df_summary[df_summary['temp'].isin(df_sorted['temp'])].drop('temp',1)

  df_summary = df_summary[df_summary['temp'].isin(df_sorted['temp'])].drop('temp',1)


In [7]:
df_summary = df_summary.sort_values(['stream_id', 'File_name'])
df_summary['flow_id'] = (df_summary.groupby(['stream_id', 'File_name']).cumcount()==0).astype(int)
df_summary['flow_id'] = df_summary['flow_id'].cumsum()

In [8]:
df_summary_500 = df_summary[df_summary['total'] > 500]

In [9]:
flow_id_list = df_summary_500['flow_id'].to_list()

In [19]:
len(flow_id_list)

16344

In [12]:
value_c = df_sorted['flow_id'].value_counts()

In [14]:
df_sorted['flow_id'].nunique()

138430

In [15]:
df_summary['flow_id'].nunique()

138430

In [20]:
count_list = value_c[value_c<20].index.to_list()

In [22]:
choice_list = list(set(count_list).intersection(flow_id_list))

In [23]:
len(choice_list)

5468

In [29]:
choice_list[2]

23

In [30]:
df_summary[df_summary['flow_id'] == 23]

Unnamed: 0,l4_proto,stream_id,A->B,B->A,total,Label,File_name,flow_id
29,udp,0,5918,813,6731,Google_PlayMusic,Google_Play_Music_00001_20180329072020.csv,23


In [27]:
df_sorted.drop(['temp'],axis=1,inplace=True)

In [31]:
df_sorted[df_sorted['flow_id'] == 23]

Unnamed: 0,time_epoch,frame_number,stream_id,0,1,2,3,4,5,6,...,1453,1454,1455,1456,1457,1458,1459,Label,File_name,flow_id
410,1522308000.0,1,0,12,86,237,219,167,209,219,...,0,0,0,0,0,0,0,Google_PlayMusic,Google_Play_Music_00001_20180329072020.csv,23
411,1522308000.0,2,0,0,4,127,223,97,167,113,...,0,0,0,0,0,0,0,Google_PlayMusic,Google_Play_Music_00001_20180329072020.csv,23
412,1522308000.0,3,0,12,86,237,219,167,209,219,...,0,0,0,0,0,0,0,Google_PlayMusic,Google_Play_Music_00001_20180329072020.csv,23


In [18]:
df_sorted_500 = df_sorted[df_sorted['flow_id'].isin(flow_id_list)]

In [19]:
df_sorted['flow_id'].value_counts().describe()

count    138430.000000
mean         16.162371
std           4.883958
min           1.000000
25%          12.000000
50%          20.000000
75%          20.000000
max          20.000000
Name: flow_id, dtype: float64

In [115]:
df_sorted_500[df_sorted_500['flow_id'].value_counts() < 20]

  df_sorted_500[df_sorted_500['flow_id'].value_counts() < 20]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [16]:
print(round((65526.0/138430.0)*100),'%')
sub_df = df_sorted[df_sorted.groupby('flow_id').flow_id.transform('count')>19].copy()
del df_sorted
sub_df.index = pd.RangeIndex(len(sub_df.index))
sub_df.to_feather('sub_df.feather')
del sub_df

47.335115220689154

## Chuan hoa

In [7]:
df_sorted = pd.read_feather('sub_df.feather')

In [8]:
_,small_sample = train_test_split(df_sorted,frac=0.01)

In [9]:
small_sample.to_feather("data_sample.feather",)

In [8]:
df_sorted = (df_sorted.pipe(label_encoding, ['Label']))

In [9]:
df_sorted.to_feather('df_with_label_encoding.feather')

del df_sorted

NameError: name 'sorted' is not defined

## Chia du lieu cho test case

In [3]:
df_sorted = pd.read_feather('df_with_label_encoding.feather')

In [4]:
training_data, test_case = train_test_split(df_sorted, 0.2)
print(test_case['flow_id'].nunique())
print(training_data['flow_id'].nunique())

14581
58323


In [5]:
del df_sorted

In [6]:
training_data = training_data.drop(['time_epoch', 'frame_number', 'File_name','stream_id'],axis=1)
train, test = train_test_split(training_data, frac=0.2)
del training_data
train = train.drop(['flow_id'],axis=1)
train.index = index_reset(train)
train.to_feather('GQUIC_data.feather')
del train
test = test.sort_values(['flow_id'])
test.index = index_reset(test)
test.to_feather("GQUIC_test.feather")
del test

In [None]:
# data_bytes = ['9','31','63','127','255','511','1023']
# for each_byte in data_bytes:
#     data_train = pd.concat([df_sorted.loc[:, : each_byte],df_sorted.iloc[:,-1:]],axis=1)
#     # data_test = pd.concat([df_sorted.loc[:, : each_byte ],df_sorted.iloc[:,-1:]],axis=1)
#     each_byte = str(int(each_byte)+1)
#     data_train_name = "GQUIC_data_" + each_byte +".feather"
#     # data_test_name = "GQUIC_test_" + each_byte +".feather"
#     data_train.to_feather(data_train_name,)
#     # data_test.to_feather(data_test_name,)

In [None]:
# first_column = test.pop('flow_id')
# test.insert(0, 'flow_id', first_column)

In [None]:
# del train,test,df_sorted,data_train,data_test

In [None]:
# test_case = test_case[['time_epoch', 'frame_number', 'stream_id', 'length', 'Label', 'File_name', 'flow_id']]
# split = 5
# total_data_count = test_case.shape[0]
# data_per_set = int(np.floor(total_data_count/split))
# DataFrameDict = {}
# for i in range(1, split+1):
#     client_name = "client_" + str(i)
#     start = data_per_set * (i-1)
#     end = data_per_set * i

#     print(f"Adding data from {start} to {end} for client : {client_name}")
#     DataFrameDict[client_name] = test_case[start:end]
# for df in DataFrameDict.keys():
#     csv_path = "gquic_test_case/"+df + ".feather"
#     DataFrameDict[df].to_feather(csv_path,)

In [None]:
# train.to_feather("GQUIC_data.feather", )
# test.to_feather("GQUIC_data_test.feather",)


In [None]:
# train.to_feather('GQUIC_data_2label.feather', )
# test.to_feather('GQUIC_test_2label.feather', )


In [None]:
# index = 0
# with pd.read_csv("GQUIC_data.feather", chunksize=1000000) as reader:
#     for data in reader:
#         path = '/home/onos/FL/Data Processing/GQUIC_arrow/GQUIC_data({}).arrow'.format(index)
#         feather.write_feather(data, path)
#         index += 1


In [None]:
# df_GQUIC.info()
# df_GQUIC.head(2)
# df_GQUIC = df_GQUIC.astype('float16')
# df_GQUIC.head(2)


In [None]:
# df_GQUIC.info()


In [None]:
# df_GQUIC["Label"] = df_GQUIC["Label"].astype("int8")


In [None]:
# df_GQUIC.info()


In [None]:
# df = pd.read_csv("iris.feather")


In [None]:
# df.info()


In [None]:
# df = (df.pipe(label_encoding, ['variety']))


In [None]:
# df.info()


In [None]:
# df.to_feather("iris.feather")
