# Data Splitting
The data splitting process uses the scikit-learn package due to its user-friendly interface and support for stratified splitting, which helps maintain the label distribution across the training and test sets.

### Import the necessary packages

In [23]:
# handling file and directory
import os
import shutil
# handle the data info
import pandas as pd
# handle the data splitting
from sklearn.model_selection import train_test_split

### Get the necessary variables

In [24]:
# Root folder holding one sub-folder of files per class label.
dataset_path = "./dataset/prepared_dataset"
# Root folder that receives the train/test copies.
destination_path = "./dataset/splited_dataset"
# os.listdir returns entries in arbitrary order, so sort them: the label order
# determines the row order of the dataframe that is fed to the seeded split,
# and it has to be stable for the split to be reproducible.
# Only directories are kept so a stray file in dataset_path is not mistaken
# for a label (listing it as a folder later would raise).
labels = sorted(
    entry
    for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)
labels

['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']

### Create a dataframe
The dataframe stores each image path together with its label. It makes the splitting process easy because this format is suitable for scikit-learn.

In [7]:
# Collect one (path, label) record per file and build the dataframe in a single
# call; growing a frame row by row with .loc gets slower with every insert.
records = []
for label in labels:
    label_dir = os.path.join(dataset_path, label)
    # Sort the listing so the row order (and therefore the seeded split)
    # does not depend on the order the filesystem happens to return.
    for file in sorted(os.listdir(label_dir)):
        records.append((os.path.join(label_dir, file), label))
# Passing the columns explicitly keeps the schema even when no file was found.
dataset = pd.DataFrame(records, columns=["path", "label"])
# check the dataframe
dataset.head(3)

Unnamed: 0,path,label
0,./dataset/prepared_dataset\cardboard\cardboard...,cardboard
1,./dataset/prepared_dataset\cardboard\cardboard...,cardboard
2,./dataset/prepared_dataset\cardboard\cardboard...,cardboard


### Split the dataset
Split the dataset using the stratified method to preserve its label distribution. The test size is 20%; this ratio was chosen because the dataset is small.

In [8]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions approximately equal in both parts, and the fixed seed
# makes the partition repeatable for the same input order.
img_train, img_test, label_train, label_test = train_test_split(
    dataset["path"],
    dataset["label"],
    test_size=0.2,
    random_state=191502,
    stratify=dataset["label"],
)

### Transform the split result
Transform the split result into a dataframe to make the handling easier

In [12]:
# Re-pair every path with its label (aligned on the shared index), then
# renumber the rows from 0 so the shuffled original index is discarded.
trainset = img_train.to_frame(name="path").assign(label=label_train).reset_index(drop=True)
testset = img_test.to_frame(name="path").assign(label=label_test).reset_index(drop=True)

### Validate the split ratio

In [22]:
# Report the size of each part and its share of the full dataset,
# rounded to a whole percent.
total_count = len(dataset)
train_count = len(trainset)
test_count = len(testset)
print(f"trainset count: {train_count} ({round(train_count/total_count*100)}%)")
print(f"testset count: {test_count} ({round(test_count/total_count*100)}%)")

trainset count: 2021 (80%)
testset count: 506 (20%)


### Prepare the split directory
The directory will be used for storing the split dataset.

In [27]:
# Build the full list of <destination>/<split>/<label> folders first, then
# create them; exist_ok=True makes re-running the cell harmless.
split_dirs = [
    f"{destination_path}/{data_type}/{label}"
    for data_type in ['train', 'test']
    for label in labels
]
for split_dir in split_dirs:
    os.makedirs(split_dir, exist_ok=True)

### Populate the split directory
Copy the images into their place in the new split directory.

In [28]:
# Copy every file into <destination_path>/<split>/<label>/.
# Both splits go through the same loop so the copy logic cannot drift apart,
# and the two columns are zipped directly instead of calling iterrows(), which
# builds a Series for every row and yielded an index that was never used.
# NOTE(review): files left in the destination by an earlier run with a
# different split are not removed here - confirm the folder is clean before
# re-running, otherwise train and test may overlap.
for data_type, subset in (("train", trainset), ("test", testset)):
    for path, label in zip(subset["path"], subset["label"]):
        shutil.copy(path, f"{destination_path}/{data_type}/{label}")