In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil

In [2]:
def sampling(dataset, percent):
    """
    Build the "<dataset>_all_<percent>" variant of a dataset.

    Steps, in order:
      1. Load the existing train split from data/<dataset>_<percent>/train.csv.
      2. Sample test.csv of the original dataset at `percent` percent via
         read_sample, which also writes the result to its output folder.
      3. Copy train.csv and list.txt from data/<dataset>_<percent> into
         that output folder.

    Returns a tuple (df_train, df_test).
    """
    # Folder that holds the already-sampled train split
    source_dir = f"{dataset}_{percent}"
    train_csv = Path.cwd() / 'data' / source_dir / 'train.csv'
    df_train = pd.read_csv(train_csv, header=0)

    # Draw the test split at the same rate; read_sample reports where it saved it
    df_test, target_dir = read_sample(dataset, percent, 'test.csv')

    # Bring the companion files across unchanged
    copy_file(source_dir, target_dir, 'train.csv')
    copy_file(source_dir, target_dir, 'list.txt')

    return df_train, df_test



def read_sample(dataset, percent, file, seed=None):
    """
    Read a CSV file, draw a random subset of its rows and save the subset.

    The input is read from data/<dataset>/<file> (relative to the current
    working directory). The sampled rows are written, without the index, to
    data/<dataset>_all_<percent>/<file>; the directory is created if needed.

    Parameters
    ----------
    dataset : str
        Name of the folder under data/ holding the original file.
    percent : int or float
        Percentage of rows to keep. The row count is
        round(percent / 100 * number_of_rows).
    file : str
        File name to read and to write (e.g. 'test.csv').
    seed : int, optional
        Seed for a private random generator so the draw can be reproduced.
        When None (default) NumPy's global random state is used, exactly as
        before this parameter existed.

    Returns
    -------
    (pandas.DataFrame, str)
        The sampled rows (original row order kept, index reset) and the name
        of the output directory, "<dataset>_all_<percent>".

    Raises
    ------
    ValueError
        Raised by NumPy when more rows are requested than the file contains
        (percent > 100).
    """

    # Read original file
    file_path = Path.cwd().joinpath('data', dataset, file)
    df = pd.read_csv(file_path, header=0)
    size = len(df)

    # Create the directory that receives the sampled file
    dir_name = f"{dataset}_all_{percent}"
    path = Path.cwd().joinpath('data', dir_name)
    path.mkdir(parents=True, exist_ok=True)

    # Pick row positions without replacement; sorting keeps the original order
    num_data = int(round(percent/100 * size, 0))
    rng = np.random if seed is None else np.random.default_rng(seed)
    index = sorted(rng.choice(size, num_data, replace=False))

    # The drawn values are positions, so select by position rather than by label
    df_sample = df.iloc[index, :].reset_index(drop=True)

    # Print sampling percentage and size of data sample
    print(f"Sampling percentage and sample size ({dataset}, {file+')':<10} : {str(percent)+'%,':<5} {len(df_sample)}")

    # Save the sample under the same file name in the output directory
    df_sample.to_csv(path.joinpath(file), index=False)

    return df_sample, dir_name



def copy_file(old, new, file):
    """
    Copy data/<old>/<file> to data/<new>/<file>.

    Both paths are resolved against the current working directory.
    The destination folder is not created here.
    """
    data_root = Path.cwd() / 'data'
    source = data_root / old / file
    destination = data_root / new / file

    shutil.copy(source, destination)
    

In [4]:
# Maps "<dataset>_all_<percent>" -> (df_train, df_test) as returned by sampling()
sample = {}

# Build the 30%, 70% and 90% variants of every dataset.
# sampling() randomly samples test.csv at the given rate and reuses the
# train.csv already stored under data/<dataset>_<percent>.

datasets = [
    'msl_m', 'msl_tel', 'msl_tel_m',
    'smap_m', 'smap_tel', 'smap_tel_m',
    'swat', 'swat_ori'
]

for dataset in datasets:
    for sample_percent in [30, 70, 90]:

        # Generate sample
        df_temp = sampling(dataset, sample_percent)
        key = f"{dataset}_all_{sample_percent}"
        sample[key] = df_temp
    

Sampling percentage and sample size (msl_m, test.csv)  : 30%,  22119
Sampling percentage and sample size (msl_m, test.csv)  : 70%,  51610
Sampling percentage and sample size (msl_m, test.csv)  : 90%,  66356
Sampling percentage and sample size (msl_tel, test.csv)  : 30%,  1830
Sampling percentage and sample size (msl_tel, test.csv)  : 70%,  4270
Sampling percentage and sample size (msl_tel, test.csv)  : 90%,  5490
Sampling percentage and sample size (msl_tel_m, test.csv)  : 30%,  1830
Sampling percentage and sample size (msl_tel_m, test.csv)  : 70%,  4270
Sampling percentage and sample size (msl_tel_m, test.csv)  : 90%,  5490
Sampling percentage and sample size (smap_m, test.csv)  : 30%,  128285
Sampling percentage and sample size (smap_m, test.csv)  : 70%,  299332
Sampling percentage and sample size (smap_m, test.csv)  : 90%,  384855
Sampling percentage and sample size (smap_tel, test.csv)  : 30%,  2592
Sampling percentage and sample size (smap_tel, test.csv)  : 70%,  6048
Sampling per

In [3]:
# One-off run for a single dataset: tr is the train frame, ts the sampled test frame
tr, ts = sampling(dataset='swat_ori', percent=70)

Sampling percentage and sample size (swat_ori, test.csv)  : 70%,  314943


In [7]:
# Check that train.csv in data/<dataset>_all_<percent> has the same content
# as train.csv in data/<dataset>_<percent>, for every dataset.
datasets = [
    'msl_m', 'msl_tel', 'msl_tel_m',
    'smap_m', 'smap_tel', 'smap_tel_m',
    'swat', 'swat_ori'
]

percent = 90

data_root = Path.cwd() / 'data'

for dataset in datasets:
    copied_csv = data_root / f"{dataset}_all_{percent}" / 'train.csv'
    source_csv = data_root / f"{dataset}_{percent}" / 'train.csv'

    copied_train = pd.read_csv(copied_csv, header=0)
    source_train = pd.read_csv(source_csv, header=0)

    # True when both frames have identical shape, dtypes and values
    is_same = copied_train.equals(source_train)
    print(f"{dataset:<15} : {str(is_same):>10}")

msl_m           :       True
msl_tel         :       True
msl_tel_m       :       True
smap_m          :       True
smap_tel        :       True
smap_tel_m      :       True
swat            :       True
swat_ori        :       True
