In [25]:
import argparse
import csv
import datetime
import gc
import os
import os.path as osp
import random
import shutil
import time
import warnings

import custom_utils
import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
import utils
from custom_utils import plot_graph
from matplotlib import pyplot as plt
from numpy import newaxis
from scipy.stats import chi2
from sklearn.manifold import TSNE
from sklearn.metrics import ConfusionMatrixDisplay
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, Dataset

from tllib.alignment.dan import (ImageClassifier,
                                 MultipleKernelMaximumMeanDiscrepancy)
from tllib.modules.kernels import GaussianKernel
from tllib.utils.analysis import a_distance, collect_feature
from tllib.utils.data import ForeverDataIterator
from tllib.utils.logger import CompleteLogger
from tllib.utils.meter import AverageMeter, ProgressMeter
from tllib.utils.metric import accuracy

In [26]:
# --- Notebook-wide runtime setup ---
torch.set_printoptions(profile="full")

# Drop unreferenced Python objects first, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Silence UserWarning-category warnings for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Shared matplotlib font settings for every figure drawn later.
plt.rcParams.update({'font.family': 'Arial', 'font.size': 14})

def create_set_with_target_percentage(target_df, source_df, percent=None):
    """Move a percentage of each class's target flows into the source set.

    For every label, ``percent`` percent of that label's 20-row groups are
    drawn at random from ``target_df``, appended to ``source_df`` and removed
    from the target rows that are returned.

    Args:
        target_df: Target-domain frame with a 'Label' column. The code assumes
            labels are the integers ``0..k-1``, that 'Label' is the last
            column, and that the frame has a default 0..n-1 index laid out in
            consecutive groups of 20 rows (TODO confirm against the data files).
        source_df: Source-domain frame that receives the sampled groups.
        percent: Percentage (0-100) of each label's groups to move. When
            omitted, the notebook-level ``percent`` variable is used if it
            exists, otherwise 0.

    Returns:
        ``(source_df, remaining_target_df)``. ``remaining_target_df`` is
        rebuilt label by label, so its rows are ordered by label.
    """
    if percent is None:
        # The original body hard-coded 0 here, which made the function unusable;
        # fall back to the notebook setting so existing two-argument calls work.
        percent = globals().get('percent', 0)

    num_labels = target_df['Label'].value_counts().shape[0]

    # Per-label pool of rows still available in the target set. The last column
    # is dropped and 'Label' re-attached, exactly as the original code did.
    data_by_label = {
        label: group.iloc[:, :-1].assign(Label=label)
        for label, group in target_df.groupby('Label')
    }

    sampled_rows = []
    for i in range(num_labels):
        # Number of 20-row groups to move for this label.
        count = int(len(data_by_label[i]) / 20 * percent / 100)
        for _ in range(count):
            # Snap a randomly drawn row down to the first row of its 20-row group.
            start_idx = int(data_by_label[i].sample(1).index[0] / 20) * 20
            sampled_group = target_df.iloc[start_idx:start_idx + 20]

            sampled_rows.append(sampled_group)
            data_by_label[i] = data_by_label[i].drop(sampled_group.index)

    # pd.concat raises on an empty list, so only merge when something was drawn.
    if sampled_rows:
        sampled_target_df = pd.concat(sampled_rows, ignore_index=True)
        source_df = pd.concat([source_df, sampled_target_df], ignore_index=True)

    remaining_target_df = pd.concat(
        [data_by_label[k] for k in range(num_labels)], ignore_index=True)
    return source_df, remaining_target_df

In [27]:
class MyDataset(Dataset):
    """Map-style dataset that pairs ``data[i]`` with ``labels[i]``.

    Note: a class with the same name and body is defined again in a later
    cell of this notebook.
    """

    def __init__(self, data, labels):
        # Both containers are stored as given; nothing is copied or validated.
        self.labels = labels
        self.data = data

    def __len__(self):
        # The reported size comes from `data` alone.
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

In [28]:
def source_target_split(df, choice, frac=0.5):
    """Split `df` by flow into a source part and a target part.

    A random `frac` of the unique flow ids is drawn. Rows of flows that were
    not drawn form the target. Rows of drawn flows form the source, except
    flows that also fall in a random 99% sample of the flows carrying label
    `choice`, which are left out of the source.

    Note: an identical definition appears again further down in this cell;
    the later one is the binding in effect once the cell has run.
    """
    choice_flow_ids = df.loc[df.Label == choice, 'flow_id']
    print("Selected label " + str(choice))

    # Draw order matters for reproducibility under a fixed seed:
    # first the label-specific sample, then the overall one.
    withheld_ids = choice_flow_ids.drop_duplicates().sample(frac=0.99)
    drawn_ids = df['flow_id'].drop_duplicates().sample(frac=frac)

    source_ids = drawn_ids[~drawn_ids.isin(withheld_ids)]

    in_source = df['flow_id'].isin(source_ids)
    in_drawn = df['flow_id'].isin(drawn_ids)
    return df[in_source], df[~in_drawn]


def resize_image(image, byte_size, target_size=(224, 224)):
    """Resize `image` to a square with bilinear interpolation.

    The output is 224x224 when `byte_size` is 256, otherwise
    `byte_size` x `byte_size`. The `target_size` argument is accepted but
    always overwritten, so it has no effect on the result.

    Note: an identical definition appears again further down in this cell.
    """
    side = 224 if byte_size == 256 else byte_size
    target_size = (side, side)
    return cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)


def most_frequent(List):
    """Return the element that occurs most often in `List`."""
    distinct_values = set(List)
    # Rank each distinct value by how many times it appears in the list.
    return max(distinct_values, key=lambda value: List.count(value))
def split_data(df, frac=0.2):
    """Split `df` by flow into (train, val).

    A random `frac` of the unique flow ids is held out; every row of a
    held-out flow goes to `val`, all other rows go to `train`.
    """
    held_out_ids = df['flow_id'].drop_duplicates().sample(frac=frac)
    is_held_out = df['flow_id'].isin(held_out_ids)
    return df[~is_held_out], df[is_held_out]


def source_target_split(df, choice, frac=0.5):
    """Divide the rows of `df` into source and target sets, flow by flow.

    Target: rows whose flow id is NOT in a random `frac` sample of all
    unique flow ids. Source: rows whose flow id IS in that sample, minus the
    flows that land in a random 99% sample of the flows labelled `choice`.

    Note: an identical definition appears earlier in this cell; this later
    one is the binding in effect once the cell has run.
    """
    rows_with_choice = df.loc[df.Label == choice]
    print("Selected label " + str(choice))

    # Keep the two random draws in this order (label-specific, then global).
    choice_sample = rows_with_choice['flow_id'].drop_duplicates().sample(frac=0.99)
    global_sample = df['flow_id'].drop_duplicates().sample(frac=frac)

    kept_for_source = global_sample[~global_sample.isin(choice_sample)]

    source_mask = df['flow_id'].isin(kept_for_source)
    sampled_mask = df['flow_id'].isin(global_sample)
    return df[source_mask], df[~sampled_mask]


def resize_image(image, byte_size, target_size=(224, 224)):
    """Bilinearly resize `image` to a square whose side depends on `byte_size`.

    `byte_size == 256` maps to 224x224; any other value maps to
    `byte_size` x `byte_size`. Whatever is passed as `target_size` is
    replaced before use, so that argument does not influence the output.

    Note: an identical definition appears earlier in this cell.
    """
    if byte_size != 256:
        target_size = (byte_size, byte_size)
    else:
        target_size = (224, 224)
    return cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)

def data_processing(raw_data, backbone):
    """Turn a raw packet frame into a `MyDataset` of images and flow labels.

    Args:
        raw_data: DataFrame with 'flow_id' and 'Label' columns; the remaining
            columns are reshaped as 256 byte values per row, 20 rows per
            image. Rows of one flow are assumed to be contiguous and exactly
            20 per flow (TODO confirm against the data files).
        backbone: Architecture name. Names containing 'lenet' get a single
            channel; every other name gets the image repeated over 3 channels.

    Returns:
        MyDataset whose data array has shape (num_images, channels, H, W) as
        float32 and whose labels are the most frequent 'Label' of each flow.
    """
    # One label per flow: the most frequent 'Label' among the flow's rows.
    # sort=False keeps flows in order of first appearance, which is the order
    # the reshape below produces images in. The default (sorted keys) would
    # misalign labels and images whenever the frame is not sorted by flow_id.
    labels_per_flow = raw_data.groupby('flow_id', sort=False)['Label'].apply(list)
    flow_label = np.array([most_frequent(labels) for labels in labels_per_flow])

    # Scale byte values to [0, 1] and stack every 20 rows into one 20x256 image.
    payloads = raw_data.drop(['flow_id', 'Label'], axis=1).to_numpy() / 255
    datas = payloads.reshape(-1, 20, 256).astype('float32')

    # Resize each image in the dataset.
    datas = np.array([resize_image(img, 256) for img in datas])

    # Insert a channel axis at position 2, replicate it as the backbone needs,
    # then move it to position 1 to get a channels-first layout.
    channels = 1 if 'lenet' in backbone else 3
    datas = np.repeat(datas[:, :, np.newaxis], channels, axis=2)
    datas = np.moveaxis(datas, 2, 1)

    return MyDataset(datas, flow_label)


def remapping(df, map):
    """Return a copy of `df` with its 'Label' values translated through `map`.

    `map` is handed to `Series.replace`; the input frame is left unmodified.
    """
    relabeled = df['Label'].replace(map)
    return df.assign(Label=relabeled)

In [29]:
# Adaptation direction. "S2T" loads train_source_* as the source split and
# train_target_* as the target split; any other value swaps the two files.
scenario = "S2T"
# "none" selects the un-suffixed target/test files; any other value is
# inserted into the target and test file names.
subset = "none"
# Inserted into the feather file names read by the loading cell.
byte_size = 256
# Passed to data_processing as `backbone`: names containing 'lenet' produce
# 1-channel images, every other name produces 3-channel images.
arch = "resnet50"
# In the visible cells, batch_size and workers are referenced only by the
# commented-out DataLoader setup.
batch_size = 32
workers = 4
# When non-zero, the loading cell calls create_set_with_target_percentage on
# the two training frames before they are processed.
percent = 0

In [30]:
print('Concate data')
class_names = ['E-commerce', 'Video on-demand', 'Interactive data']
num_classes = len(class_names)

# Single place to change where the pre-built feather files live.
concat_data_dir = '/home/bkcs/HDD/Transfer-Learning-Library/examples/domain_adaptation/image_classification/data/concat'


def _read_concat_feather(stem):
    """Load `<concat_data_dir>/<stem>.feather` into a DataFrame."""
    return pd.read_feather('{}/{}.feather'.format(concat_data_dir, stem))


if scenario == "S2T":
    train_source = _read_concat_feather('train_source_{}'.format(byte_size))
    if subset == "none":
        train_target = _read_concat_feather('train_target_{}'.format(byte_size))
        test_raw = val_raw = _read_concat_feather('test_raw_{}'.format(byte_size))
    else:
        train_target = _read_concat_feather('train_target_{}_{}'.format(subset, byte_size))
        test_raw = val_raw = _read_concat_feather('test_target_{}_{}'.format(subset, byte_size))
else:
    # Reverse direction: the two training files swap roles.
    train_target = _read_concat_feather('train_source_{}'.format(byte_size))
    train_source = _read_concat_feather('train_target_{}'.format(byte_size))
    test_raw = val_raw = _read_concat_feather('val_raw_{}'.format(byte_size))

# Optionally move part of the target training flows into the source set.
if percent != 0:
    train_source, train_target = create_set_with_target_percentage(train_target, train_source)
print(train_source.shape, train_target.shape)

# val_raw and test_raw refer to the same frame in every branch above, so the
# validation and test datasets are built from identical rows.
train_source_dataset = data_processing(train_source, arch)
train_target_dataset = data_processing(train_target, arch)
val_dataset = data_processing(val_raw, arch)
test_dataset = data_processing(test_raw, arch)
# del train_source, train_target, val_raw, test_raw

# train_source_loader = DataLoader(train_source_dataset, batch_size=batch_size,
#                                 shuffle=True, num_workers=workers, drop_last=True)
# train_target_loader = DataLoader(train_target_dataset, batch_size=batch_size,
#                                 shuffle=True, num_workers=workers, drop_last=True)
# val_loader = DataLoader(
#     val_dataset, batch_size=batch_size, shuffle=False, num_workers=workers)
# test_loader = DataLoader(
#     test_dataset, batch_size=batch_size, shuffle=False, num_workers=workers)

# train_source_iter = ForeverDataIterator(train_source_loader)
# train_target_iter = ForeverDataIterator(train_target_loader)

Concate data
(182980, 258) (123400, 258)


In [32]:
# Sanity check: the processed source split should be a MyDataset instance.
print(type(train_source_dataset))

<class '__main__.MyDataset'>


In [34]:
# Sanity check: indexing the dataset should yield a (sample, label) tuple.
print(type(train_source_dataset[0]))

<class 'tuple'>


In [35]:
# Sanity check: the sample half of an item should be a NumPy array.
print(type(train_source_dataset[0][0]))

<class 'numpy.ndarray'>


In [36]:
# Sanity check: per-sample array shape, channels first (see data_processing).
print((train_source_dataset[0][0].shape))

(3, 224, 224)


In [45]:
import os
import numpy as np
from PIL import Image
from torch.utils.data import Dataset


class MyDataset(Dataset):
    """Index-aligned (sample, label) dataset.

    Note: this repeats, with the same body, the MyDataset class defined in an
    earlier cell of this notebook.
    """

    def __init__(self, data, labels):
        # Keep references to the caller's containers; no copy is made.
        self.data, self.labels = data, labels

    def __len__(self):
        # Length is defined by `data`; `labels` is not consulted.
        return len(self.data)

    def __getitem__(self, idx):
        item = (self.data[idx], self.labels[idx])
        return item

def save_dataset(dataset, root_dir, dataset_name):
    """Write a dataset to disk as JPEG files plus an image-list text file.

    Layout produced under `root_dir`:
        <dataset_name>/images/<class folder>/frame_<idx>.jpg
        image_list/<dataset_name>.txt   (one "<relative path> <label>" per line)

    Args:
        dataset: Iterable of (data, label) pairs. `data` is a channels-first
            array scaled so that multiplying by 255 gives pixel values
            (assumed to lie in [0, 1] — TODO confirm); `label` is 0, 1 or 2.
        root_dir: Directory that receives both the images and the list file.
        dataset_name: Sub-directory name and stem of the list file. The list
            file is opened in write mode, so reusing a name replaces it.

    Items whose label is not in the label map are skipped.
    """
    label_map = {0: 'ecommerce', 1: 'video', 2: 'interactive'}
    images_dir = os.path.join(root_dir, dataset_name, 'images')

    # Create one directory per class up front.
    for folder in label_map.values():
        os.makedirs(os.path.join(images_dir, folder), exist_ok=True)

    image_list_path = os.path.join(root_dir, 'image_list', f'{dataset_name}.txt')
    os.makedirs(os.path.dirname(image_list_path), exist_ok=True)

    # Save images and create the image list file.
    with open(image_list_path, 'w') as f:
        for idx, (data, label) in enumerate(dataset):
            if label not in label_map:
                continue
            folder = label_map[label]
            file_name = f'frame_{idx}.jpg'

            # Round before the uint8 cast (a bare cast truncates, turning
            # e.g. 254.7 into 254) and clip so out-of-range values cannot
            # wrap around.
            pixels = np.clip(np.rint(data * 255), 0, 255).astype(np.uint8)
            pixels = pixels.transpose(1, 2, 0)  # channels-first -> channels-last
            if pixels.shape[2] == 1:
                # Hand a single-channel image to Image.fromarray as 2-D (H, W).
                pixels = pixels[:, :, 0]

            Image.fromarray(pixels).save(os.path.join(images_dir, folder, file_name))
            dataset_path = os.path.join(dataset_name, "images", folder, file_name)
            f.write(f'{dataset_path} {label}\n')

# Example usage:
root_dir = '/home/bkcs/HDD/Transfer-Learning-Library/examples/domain_adaptation/image_classification/data/concat_dataset'

save_dataset(train_source_dataset, root_dir, 'dataset_1')
save_dataset(train_target_dataset, root_dir, 'dataset_2')
# The test split gets its own name. Writing it as 'dataset_2' as well would
# reopen image_list/dataset_2.txt in write mode (discarding the target-train
# list) and overwrite every frame_<idx>.jpg that shares an index and class
# folder with a target-train image.
# NOTE(review): update any downstream code that expected the test split under 'dataset_2'.
save_dataset(test_dataset, root_dir, 'dataset_2_test')
