In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil

In [31]:
def sampling(dataset, percent, choice=None):
    """
    Build a randomly sampled copy of a dataset stored under ./data.

    train.csv is always sampled down to `percent` percent of its rows.
    test.csv is either copied unchanged (`choice` is None) or sampled with
    the same percentage (any other `choice`). list.txt is always copied
    as-is into the new dataset directory.

    Parameters
    ----------
    dataset : str
        Name of the source directory under ./data.
    percent : int or float
        Share of rows to keep, expressed in percent (e.g. 10 for 10%).
    choice : str, optional
        None samples train.csv only. Any other value samples test.csv as
        well and becomes part of the output directory name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The sampled training frame and the test frame (sampled when
        `choice` is given, otherwise the full original test set).
    """

    # train.csv is sampled in both modes; read_sample also creates the
    # output directory and reports its name.
    df_train, dir_name = read_sample(dataset, percent, 'train.csv', choice)

    if choice is None:
        # Only train.csv is sampled: load the original test set for the caller
        path = Path.cwd().joinpath('data', dataset, 'test.csv')
        df_test = pd.read_csv(path, header=0)

        # Copy test.csv (i.e. unsampled) and list.txt to the new directory
        for file in ['test.csv', 'list.txt']:
            copy_file(dataset, dir_name, file)
    else:
        # Both train.csv and test.csv are sampled
        df_test, _ = read_sample(dataset, percent, 'test.csv', choice)

        # Copy list.txt to the new directory
        copy_file(dataset, dir_name, 'list.txt')

    return df_train, df_test



def read_sample(dataset, percent, file, choice):
    """
    Read one CSV of a dataset, keep a random `percent` percent of its rows
    and save the result in the sampled dataset's directory.

    Parameters
    ----------
    dataset : str
        Name of the source directory under ./data.
    percent : int or float
        Share of rows to keep, expressed in percent (e.g. 10 for 10%).
    file : str
        CSV file name inside the dataset directory (e.g. 'train.csv').
    choice : str or None
        None names the output directory "<dataset>_<int(percent)>";
        any other value names it "<dataset>_<choice>_<percent>".

    Returns
    -------
    tuple of (pandas.DataFrame, str)
        The sampled rows (original order, index reset) and the name of the
        output directory under ./data.

    Raises
    ------
    ValueError
        If `percent` maps to a negative row count or to more rows than the
        file contains.

    Notes
    -----
    Rows are drawn with NumPy's global random state; call
    `np.random.seed(...)` beforehand to make the sample reproducible.
    """

    # Read original file
    file_path = Path.cwd().joinpath('data', dataset, file)
    df = pd.read_csv(file_path, header=0)
    size = len(df)

    # Validate the request before touching the filesystem, so that a bad
    # percentage does not leave an empty output directory behind.
    num_data = int(round(percent/100 * size, 0))
    if not 0 <= num_data <= size:
        raise ValueError(
            f"percent={percent} selects {num_data} rows, "
            f"but {file} in '{dataset}' has {size} rows"
        )

    # Create directory to save the randomly sampled data
    if choice is None:
        dir_name = f"{dataset}_{int(percent)}"
    else:
        dir_name = f"{dataset}_{choice}_{percent}"

    path = Path.cwd().joinpath('data', dir_name)
    path.mkdir(parents=True, exist_ok=True)

    # Draw distinct row positions; sorting keeps the original row order.
    # Positions (not labels) are drawn, hence .iloc rather than .loc.
    index = sorted(np.random.choice(size, num_data, replace=False))
    df_sample = df.iloc[index].reset_index(drop=True)

    # Print sampling percentage and size of data sample
    print(f"Sampling percentage and sample size ({dataset}, {file+')':<10} : {str(percent)+'%,':<5} {len(df_sample)}")

    # Save the sample under the same file name in the new directory
    df_sample.to_csv(path.joinpath(file), index=False)

    return df_sample, dir_name
    


def copy_file(old, new, file):
    """
    Duplicate `file` from data/<old> into data/<new>.

    Both directories are resolved relative to the current working directory.
    """

    data_root = Path.cwd() / 'data'
    source = data_root / old / file
    target = data_root / new / file

    shutil.copy(source, target)
    

In [None]:
# Generate 10% and 50% samples of every dataset,
# where both train and test data are randomly sampled

datasets = [
    'msl_m', 'msl_tel', 'msl_tel_m',
    'smap_m', 'smap_tel', 'smap_tel_m',
    'swat_ori'
]

# Maps "<dataset>_<percent>" to the (df_train, df_test) pair returned by sampling()
sample = {}

for dataset in datasets:
    for sample_percent in [10, 50]:
        # Percentages are whole numbers (10 means 10%), so the key uses
        # them unscaled instead of multiplying by 100.
        key = f"{dataset}_{int(sample_percent)}"
        sample[key] = sampling(dataset, sample_percent, 'all')
    

In [14]:
# Spot check: sample 10% of both train.csv and test.csv of msl_m
# (choice='all' writes the result to data/msl_m_all_10).
# NOTE(review): the recorded output below prints "0.1", which suggests it
# predates whole-number percentages; re-run this cell to refresh it.
tr, ts = sampling('msl_m', 10, 'all')

Sampling percentage and sample size (msl_m, train.csv): 0.1, 5832
Sampling percentage and sample size (msl_m, test.csv): 0.1, 7373


In [30]:
# Spot check: sample 1% of both train.csv and test.csv of smap_m
# (choice='all' writes the result to data/smap_m_all_1).
tr, ts = sampling('smap_m', 1, 'all')

Sampling percentage and sample size (smap_m, train.csv) : 1%,   1352
Sampling percentage and sample size (smap_m, test.csv)  : 1%,   4276
