In [1]:
import random
import time
import warnings
import argparse
import shutil
import os.path as osp
from torchsummary import summary
import os
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
import torch.nn.functional as F
import cv2
import numpy as np
import custom_utils
from matplotlib import pyplot as plt
from custom_utils import plot_graph
from torch.utils.data import Dataset, DataLoader
from tllib.alignment.dan import MultipleKernelMaximumMeanDiscrepancy, ImageClassifier
from tllib.modules.kernels import GaussianKernel
from tllib.utils.data import ForeverDataIterator
from tllib.utils.metric import accuracy
from tllib.utils.meter import AverageMeter, ProgressMeter
from tllib.utils.logger import CompleteLogger
from tllib.utils.analysis import collect_feature, tsne, a_distance
import pandas as pd
import numpy
import gc
# Switch torch's tensor printing to its "full" profile for the rest of the session.
torch.set_printoptions(profile="full")
# Trigger a garbage-collection pass; its return value is what the cell displays.
gc.collect()

0

In [2]:
def remapping(df, map):
    """Return a new DataFrame whose 'Label' column has been re-coded.

    Args:
        df: DataFrame containing a 'Label' column. It is not modified.
        map: Dict of ``{old_label: new_label}`` handed to ``Series.replace``.
            (The name shadows the built-in ``map``; it is kept so existing
            keyword callers continue to work.)

    Returns:
        A copy of ``df`` with the replaced 'Label' column in its original position.
    """
    return df.assign(Label=df['Label'].replace(map))

In [3]:
# Absolute locations of the concatenated train / test splits (feather format).
Train_path = '/home/bkcs/HDD/FL/Data_Processing/Data/Concatenated/Full_data/Concatenated_train.feather'
Test_path = '/home/bkcs/HDD/FL/Data_Processing/Data/Concatenated/Full_data/Concatenated_test.feather'

# Read both splits into memory, train first and then test.
Train_data, Test_data = (
    pd.read_feather(split_path) for split_path in (Train_path, Test_path)
)



In [4]:
# Show the column labels of the training frame.
Train_data.columns

Index(['flow_id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '1015', '1016', '1017', '1018', '1019', '1020', '1021', '1022', '1023',
       'Label'],
      dtype='object', length=1026)

In [5]:
# Rows per label code scaled down by 20: every flow_id occurs exactly 20 times
# in Train_data (see the flow_id value_counts summary), so this is flows per class.
Train_data['Label'].value_counts().div(20)

11    4728.0
4     3715.0
0     2850.0
2     2784.0
1     2042.0
12    2036.0
3     1776.0
17    1512.0
9     1475.0
7     1338.0
5     1132.0
10    1119.0
14     954.0
16     726.0
6      668.0
13     402.0
15     236.0
8      157.0
Name: Label, dtype: float64

In [6]:
# Application name -> integer code, as stored in the 'Label' column of the feather files.
mapping_dict = {'FileTransfer': 0, 'GoogleHangout_Chat': 1, 'Music': 2, 'VoIP': 3, 'Youtube': 4, 'alibaba': 5, 'amazon': 6, 'docs': 7, 'drive': 8, 'ebay': 9, 'facebook': 10, 'photo': 11, 'shopee': 12, 'thegioididong': 13, 'tiki': 14, 'tiktok': 15, 'youtube': 16, 'youtubelive': 17}
# Inverse lookup: integer code -> application name.
reverse_mapping_dict = {v: k for k, v in mapping_dict.items()}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave the frames unchanged.
Train_data['Label'] = Train_data['Label'].replace(reverse_mapping_dict)
Test_data['Label'] = Test_data['Label'].replace(reverse_mapping_dict)


In [7]:
# Same per-class flow count (rows / 20 rows per flow), with whatever values the
# 'Label' column holds at this point in the notebook.
Train_data['Label'].value_counts().div(20)

photo                 4728.0
Youtube               3715.0
FileTransfer          2850.0
Music                 2784.0
GoogleHangout_Chat    2042.0
shopee                2036.0
VoIP                  1776.0
youtubelive           1512.0
ebay                  1475.0
docs                  1338.0
alibaba               1132.0
facebook              1119.0
tiki                   954.0
youtube                726.0
amazon                 668.0
thegioididong          402.0
tiktok                 236.0
drive                  157.0
Name: Label, dtype: float64

In [8]:
# Number of rows recorded for each flow_id, then summary statistics of those counts.
rows_per_flow = Train_data['flow_id'].value_counts()
rows_per_flow.describe()

count    29650.0
mean        20.0
std          0.0
min         20.0
25%         20.0
50%         20.0
75%         20.0
max         20.0
Name: flow_id, dtype: float64

In [9]:
# Coarse classes, by index: ['Ecommerce', 'Video', 'Google_service'].
# An earlier four-class variant ['Ecommerce', 'Video', 'FileTransfer', 'Google_service']
# additionally used group 2: ['drive', 'FileTransfer'] and placed the Google services in group 3.
original_dict = {
    0: ['thegioididong', 'amazon', 'tiki', 'alibaba', 'ebay', 'shopee'],
    1: ['tiktok', 'youtube', 'facebook', 'youtubelive', 'Youtube'],
    2: ['docs', 'VoIP', 'Music', 'photo', 'GoogleHangout_Chat'],
}

# Invert the grouping into a flat lookup: application name -> coarse class id.
label_mapping = {
    app_name: group_id
    for group_id, app_names in original_dict.items()
    for app_name in app_names
}

print(label_mapping)


{'thegioididong': 0, 'amazon': 0, 'tiki': 0, 'alibaba': 0, 'ebay': 0, 'shopee': 0, 'tiktok': 1, 'youtube': 1, 'facebook': 1, 'youtubelive': 1, 'Youtube': 1, 'docs': 2, 'VoIP': 2, 'Music': 2, 'photo': 2, 'GoogleHangout_Chat': 2}


In [10]:
# Applications assigned to the source side and to the target side of the split.
source_labels = ['Youtube', 'GoogleHangout_Chat', 'shopee', 'tiki', 'thegioididong']
target_labels = ['ebay', 'alibaba', 'amazon', 'VoIP', 'facebook']


def _rows_labelled(frame, labels):
    """Return the rows of ``frame`` whose 'Label' value is one of ``labels``."""
    return frame.loc[frame['Label'].isin(labels)]


# Training rows are split by application set; the test file supplies the
# validation rows (source applications) and the test rows (target applications).
train_source = _rows_labelled(Train_data, source_labels)
train_target = _rows_labelled(Train_data, target_labels)
val_raw = _rows_labelled(Test_data, source_labels)
test_raw = _rows_labelled(Test_data, target_labels)

# Drop the names of the full frames; only the four subsets are used from here on.
del Train_data, Test_data


In [11]:
# Re-code the application names of every split into the coarse class ids of label_mapping.
train_source, train_target, val_raw, test_raw = (
    remapping(split_frame, label_mapping)
    for split_frame in (train_source, train_target, val_raw, test_raw)
)

In [12]:
# Flows per coarse class in the source training split (rows / 20 rows per flow).
train_source['Label'].value_counts().div(20)

1    3715.0
0    3392.0
2    2042.0
Name: Label, dtype: float64

In [13]:
# Flows per coarse class in the target training split (rows / 20 rows per flow).
train_target['Label'].value_counts().div(20)

0    3275.0
2    1776.0
1    1119.0
Name: Label, dtype: float64

In [14]:
# Flows per coarse class in the test split.
# NOTE(review): assumes the test file also has 20 rows per flow -- only checked for Train_data.
test_raw['Label'].value_counts().div(20)

0    796.0
2    422.0
1    260.0
Name: Label, dtype: float64

In [15]:
# Flows per coarse class in the validation split.
# NOTE(review): assumes the test file also has 20 rows per flow -- only checked for Train_data.
val_raw['Label'].value_counts().div(20)

1    930.0
0    903.0
2    514.0
Name: Label, dtype: float64

In [16]:
# Renumber the rows of each split from 0 and discard the index inherited from the full frames.
train_source = train_source.reset_index(drop=True)
train_target = train_target.reset_index(drop=True)
val_raw = val_raw.reset_index(drop=True)
test_raw = test_raw.reset_index(drop=True)


In [17]:
# Number of flows in the source training split (rows / 20 rows per flow).
len(train_source) / 20


9149.0

In [18]:
# Number of flows in the target training split (rows / 20 rows per flow).
len(train_target) / 20


6170.0

In [19]:
# Number of flows in the validation split (rows / 20, assuming 20 rows per flow).
len(val_raw) / 20


2347.0

In [20]:
# Number of flows in the test split (rows / 20, assuming 20 rows per flow).
len(test_raw) / 20

1478.0

In [24]:
# Draw each of the first ten rows of test_raw as a one-row heat map, stacked
# vertically in a single figure, then save the figure to disk.
# NOTE(review): the whole row is passed to imshow, i.e. every column of
# test_raw and not only the byte columns -- confirm that this is intended.
fig, axes = plt.subplots(nrows=10, ncols=1, figsize=(10, 15))

for row_number, ax in enumerate(axes, start=1):
    single_row = test_raw.iloc[row_number - 1:row_number]
    image = ax.imshow(single_row, cmap='viridis', aspect='auto')
    fig.colorbar(image, ax=ax)
    ax.set_title(f'Row {row_number} Visualization')
    ax.set_xlabel('Columns')
    ax.set_ylabel('Row')

fig.tight_layout()
fig.savefig('first_ten_rows.png')  # written to the current working directory
plt.show()


  


In [21]:
def get_nbyte(df, num_byte):
    """Keep the leading columns up to ``num_byte`` plus the final column.

    Args:
        df: DataFrame whose last column must be preserved (the notebook passes
            frames that end with 'Label').
        num_byte: Label of the last leading column to keep. The slice is
            label-based and inclusive, so the column named ``num_byte`` itself
            is part of the result.

    Returns:
        A new DataFrame with every column from the first one through
        ``num_byte``, followed by the last column of ``df``. If the slice
        already reaches the last column, it is not appended a second time
        (duplicate column names cannot be written with ``to_feather``).

    Raises:
        KeyError: if ``num_byte`` is not a column label of ``df``.
    """
    leading = df.loc[:, :num_byte]
    # The slice starts at the first column, so equal width means it already
    # spans the whole frame, including the last column.
    if leading.shape[1] == df.shape[1]:
        return leading.copy()
    return pd.concat([leading, df.iloc[:, -1:]], axis=1)

In [57]:
# Truncation points, given as the label of the last byte column to keep.
data_bytes = ['9', '31', '63', '127', '255', '383', '511', '1023']

# Destination folder for the exported splits. It is created on demand so that
# to_feather does not fail when the folder is missing.
path = "/home/bkcs/HDD/Transfer-Learning-Library/examples/domain_adaptation/image_classification/data/concat/"
os.makedirs(path, exist_ok=True)

# File-name prefix -> frame to export.
splits = {
    "train_source": train_source,
    "train_target": train_target,
    "val_raw": val_raw,
    "test_raw": test_raw,
}

for last_byte_col in data_bytes:
    # Byte columns are labelled from '0', so keeping columns up to '9' keeps
    # 10 bytes; the file name carries that count rather than the column label.
    n_bytes = int(last_byte_col) + 1
    for split_name, split_frame in splits.items():
        truncated = get_nbyte(split_frame, last_byte_col)
        truncated.to_feather(f"{path}{split_name}_{n_bytes}.feather")


In [58]:
import datetime
import time

def format_elapsed(elapsed_seconds):
    """Render a duration given in seconds as ``HH:MM:SS.mmm``.

    The duration is rounded to whole milliseconds before it is split into
    fields, so a value such as 59.9996 s carries into the minutes field
    instead of printing ``60.000`` seconds. Negative durations are shown as zero.

    Args:
        elapsed_seconds: Duration in seconds (int or float).

    Returns:
        The formatted string; hours, minutes and seconds are zero-padded to at
        least two digits and milliseconds to three.
    """
    total_ms = int(round(max(elapsed_seconds, 0) * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, minutes, seconds, millis)


# Start the timer
start_time = time.time()

# Your code for model training, validation, or testing goes here
# ...

# Calculate the elapsed time
elapsed_time = time.time() - start_time

# Print the elapsed time
print("Elapsed time: " + format_elapsed(elapsed_time))


Elapsed time: 00:00:0.000
