# Load Libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil

# Function Definition

In [4]:
def sampling(dataset, percent, choice=None):
    """
    Randomly keep `percent` % of the rows of a processed dataset.

    train.csv is always sampled. What happens to test.csv depends on
    `choice`:

    * choice is None  -> test.csv is read unsampled and copied, together
                         with list.txt, into the new directory.
    * anything else   -> test.csv is sampled at the same percentage and
                         only list.txt is copied.

    Parameters
    ----------
    dataset : name of the folder under ./processed holding the originals
    percent : percentage of rows to keep (e.g. 10 for 10 %)
    choice  : None to sample train.csv only; any other value (the notebook
              uses 'all') to sample both train.csv and test.csv

    Returns
    -------
    (df_train, df_test) : the sampled train DataFrame and the (sampled or
    original) test DataFrame
    """

    # train.csv is sampled in both modes; this call also creates the
    # output directory and tells us its name
    df_train, dir_name = read_sample(dataset, percent, 'train.csv', choice)

    # Choose only train.csv to sample
    if choice is None:
        # Read test.csv
        path = Path.cwd().joinpath('processed', dataset, 'test.csv')
        df_test = pd.read_csv(path, header=0)

        # Copy test.csv (i.e. unsampled) and list.txt to new directories
        for file in ['test.csv', 'list.txt']:
            copy_file(dataset, dir_name, file)

    # Choose both train.csv and test.csv to sample
    else:
        df_test, _ = read_sample(dataset, percent, 'test.csv', choice)

        # Copy list.txt to new directories
        copy_file(dataset, dir_name, 'list.txt')

    return df_train, df_test



def read_sample(dataset, percent, file, choice):
    """
    Read and sample dataset based on allocated percent
    Save sampled dataset in correct directory

    Parameters
    ----------
    dataset : name of the folder under ./processed holding the original file
    percent : percentage of rows to keep (e.g. 10 for 10 %)
    file    : CSV file name inside the dataset folder (e.g. 'train.csv')
    choice  : None -> output folder is "<dataset>_<int(percent)>";
              otherwise -> output folder is "<dataset>_<choice>_<percent>"

    Returns
    -------
    (df_sample, dir_name) : the sampled rows (original row order kept,
    index reset) and the name of the folder the sample was written to

    Notes
    -----
    Rows are drawn with np.random.choice, i.e. from NumPy's global random
    state; seed it beforehand if a reproducible sample is required.
    """

    # Read original file
    file_path = Path.cwd().joinpath('processed', dataset, file)
    df = pd.read_csv(file_path, header=0)
    size = len(df)

    # Create directories to save randomly sampled train data
    if choice is None:
        dir_name = f"{dataset}_{int(percent)}"
    else:
        dir_name = f"{dataset}_{choice}_{percent}"

    path = Path.cwd().joinpath('processed', dir_name)

    # exist_ok avoids the check-then-create race and makes re-runs harmless
    path.mkdir(parents=True, exist_ok=True)

    # Randomly sample data (without replacement), keeping original row order
    num_data = int(round(percent/100 * size, 0))
    positions = np.sort(np.random.choice(size, num_data, replace=False))

    # The drawn numbers are row *positions*, so select with iloc; label-based
    # loc would break whenever read_csv produced a non-default index
    df_sample = df.iloc[positions, :].reset_index(drop=True)

    # Print sampling percentage and size of data sample
    print(f"Sampling percentage and sample size ({dataset}, {file+')':<10} : {str(percent)+'%,':<5} {len(df_sample)}")

    # Save sampled data under the same file name in the new directory
    df_sample.to_csv(path.joinpath(file), index=False)

    return df_sample, dir_name
    


def copy_file(old, new, file):
    """
    Duplicate `file` from processed/<old> into processed/<new>
    (both folders are resolved relative to the current working directory)
    """

    processed_root = Path.cwd() / 'processed'
    source = processed_root / old / file
    destination = processed_root / new / file

    shutil.copy(source, destination)
    

# Generate Sampled SWaT
* swat = Only the train data is sampled; the original test data is left intact.
* swat_all = Both the train and test data are sampled at the assigned percentage.

In [5]:
# Collect every sampled (df_train, df_test) pair, keyed by the name of the
# directory it was written to under ./processed
sample = {}
dataset = 'swat'

for choice in [None, 'all']:
    for sample_percent in [10, 30, 50, 70, 90]:
        df_temp = sampling(dataset, sample_percent, choice)

        # The key must include `choice`, otherwise the 'all' runs overwrite
        # the train-only runs; sample_percent is already a percentage, so it
        # is used as-is (e.g. 'swat_10', 'swat_all_10')
        if choice is None:
            key = f"{dataset}_{sample_percent}"
        else:
            key = f"{dataset}_{choice}_{sample_percent}"
        sample[key] = df_temp
    

Sampling percentage and sample size (swat, train.csv) : 10%,  4752
Sampling percentage and sample size (swat, train.csv) : 30%,  14256
Sampling percentage and sample size (swat, train.csv) : 50%,  23760
Sampling percentage and sample size (swat, train.csv) : 70%,  33264
Sampling percentage and sample size (swat, train.csv) : 90%,  42768
Sampling percentage and sample size (swat, train.csv) : 10%,  4752
Sampling percentage and sample size (swat, test.csv)  : 10%,  4499
Sampling percentage and sample size (swat, train.csv) : 30%,  14256
Sampling percentage and sample size (swat, test.csv)  : 30%,  13497
Sampling percentage and sample size (swat, train.csv) : 50%,  23760
Sampling percentage and sample size (swat, test.csv)  : 50%,  22496
Sampling percentage and sample size (swat, train.csv) : 70%,  33264
Sampling percentage and sample size (swat, test.csv)  : 70%,  31494
Sampling percentage and sample size (swat, train.csv) : 90%,  42768
Sampling percentage and sample size (swat, test.csv