<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv('dataset/train.csv')
df_train.drop(columns=['elapsed_timedelta'], inplace=True)
print(f'Training Set Shape: {df_train.shape} - {df_train["id"].nunique()} Images - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')

Training Set Shape: (73585, 8) - 606 Images - Memory Usage: 4.49 MB


In [3]:
df_train.head()

Unnamed: 0,id,annotation,width,height,cell_type,plate_time,sample_date,sample_id
0,0030fd0e6378,118145 6 118849 7 119553 8 120257 8 120961 9 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3
1,0030fd0e6378,189036 1 189739 3 190441 6 191144 7 191848 8 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3
2,0030fd0e6378,173567 3 174270 5 174974 5 175678 6 176382 7 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3
3,0030fd0e6378,196723 4 197427 6 198130 7 198834 8 199538 8 2...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3
4,0030fd0e6378,167818 3 168522 5 169225 7 169928 8 170632 9 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3


In [4]:
from sklearn.model_selection import train_test_split

df_train_supervised_cell_types = df_train[~df_train['annotation'].isnull()].groupby('id')['cell_type'].first().reset_index()

train_df, validation_df = train_test_split(
    df_train_supervised_cell_types['id'],
    train_size=0.9,
    test_size=0.1,
    random_state=21,
    shuffle=True,
    stratify=df_train_supervised_cell_types['cell_type']
)

len(train_df), len(validation_df)

(545, 61)

In [7]:
import os
import shutil
import glob

splits = {
    "train": train_df,
    "validation": validation_df
}

for split_name, split in splits.items():
    os.makedirs(f'dataset/{split_name}_split')
    for image_id in split:
        current_files = f'dataset/train/{image_id}'
        for file in glob.glob(rf'{current_files}*'):
            print(file)
            shutil.copy(
                file,
                f'dataset/{split_name}_split/{os.path.basename(file)}'
            )

dataset/train/0ba181d412da.png
dataset/train/029e5b3b89c7.png
dataset/train/a55cec7ee7a1.png
dataset/train/e4632744011e.png
dataset/train/d8fc6dd956f8.png
dataset/train/563435d64260.png
dataset/train/a162768bcf04.png
dataset/train/b9056ac30b4b.png
dataset/train/8541146e15d9.png
dataset/train/42d8ecbc95a1.png
dataset/train/625c65b50aa1.png
dataset/train/699757ca44a7.png
dataset/train/18d5d665a6af.png
dataset/train/f0e54d645fe5.png
dataset/train/d5c06777eee7.png
dataset/train/aa2e2c09a57b.png
dataset/train/1d618b80769f.png
dataset/train/4551bb9de3fa.png
dataset/train/5d57448ab949.png
dataset/train/053d61766edb.png
dataset/train/e33d521aa9de.png
dataset/train/393c8540c6fa.png
dataset/train/798c6a9ca18c.png
dataset/train/3b56cced208e.png
dataset/train/b03de5cbebb2.png
dataset/train/36855e37531a.png
dataset/train/d28e67c1ad17.png
dataset/train/606831bd2dd2.png
dataset/train/4d52c84bfe79.png
dataset/train/e1140f76536a.png
dataset/train/93ce48e5227c.png
dataset/train/6955f473e6f5.png
dataset/

dataset/train/411a7b067dcc.png
dataset/train/14dbd973a7cd.png
dataset/train/90a3e24b62b0.png
dataset/train/741edb5bb8f5.png
dataset/train/5e04f48d34e3.png
dataset/train/04928f0866b0.png
dataset/train/8050704a02eb.png
dataset/train/22e0c43da285.png
dataset/train/56e473c7c793.png
dataset/train/eec79772cb99.png
dataset/train/624d50b5bfd1.png
dataset/train/adbaf2416db2.png
dataset/train/f717a5f6d473.png
dataset/train/4b701c599d33.png
dataset/train/9b6b19ec0736.png
dataset/train/44a154410273.png
dataset/train/26efe388938c.png
dataset/train/49d4a04f398c.png
dataset/train/d97922f1d446.png
dataset/train/c5be3066e673.png
dataset/train/f982188d7249.png
dataset/train/3b70c0fef171.png
dataset/train/3f29e529f210.png
dataset/train/caa06f9a4057.png
dataset/train/1e60fc475228.png
dataset/train/4bdf75f87261.png
dataset/train/bb3520da4cce.png
dataset/train/c17eac09ff70.png
dataset/train/ebffbf1cfe00.png
dataset/train/51c920fcd542.png
dataset/train/680c18cb5fea.png
dataset/train/a1aab9d6b6b2.png
dataset/

dataset/train/235f6d2095c9.png
dataset/train/f6e7e998a190.png
dataset/train/1b539d8a8897.png
dataset/train/bfb878cd992e.png
dataset/train/bebd2d51a0d0.png
dataset/train/d132b291f6cc.png
dataset/train/b53a3bfd3bd0.png
dataset/train/f6abc09eb12f.png
dataset/train/8bd09ff70b13.png
dataset/train/5863bf795692.png
dataset/train/0cfdeeb0dded.png
dataset/train/9b362a5983ae.png
dataset/train/517648559341.png
dataset/train/57bc63239549.png
dataset/train/198593a55b7a.png
dataset/train/d14e4fe1fef1.png
dataset/train/1d2ca29fef3e.png
dataset/train/0728b8f39241.png
dataset/train/c98b451417f7.png
dataset/train/e201f565a7d8.png
dataset/train/e6aff490e966.png
dataset/train/5df720a4fad6.png
dataset/train/0c5938ac5e3c.png
dataset/train/8650b0110eb6.png
dataset/train/ec1d0a62b17c.png
dataset/train/dd8bcbe5094b.png
dataset/train/1ef6aaa62132.png
dataset/train/630bac646b5a.png
dataset/train/76f6eb941d56.png
dataset/train/b0c3bd326249.png
dataset/train/01ae5a43a2ab.png
dataset/train/026b3c2c4b32.png
dataset/