In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras_preprocessing.image import ImageDataGenerator
from glob import glob
from tqdm import tqdm
from pathlib import Path
from keras.models import Sequential
from keras.layers import Activation, Dense
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, BatchNormalization, Flatten
from sklearn.model_selection import train_test_split
from PIL import Image

# Parameters

In [35]:
# input
# Root folder of the cats-and-dogs image dataset on the local machine.
INPUT = Path(r'D:\ML Datasets\kagglecatsanddogs_5340')
# Destination root for the train/test/val copies. Derived from INPUT so the two
# locations cannot drift apart when INPUT is edited.
SPLIT = INPUT / 'Split'
# Name of the image sub-folder (under INPUT and under every split) that is processed.
DIM = 'Edited'

In [36]:
def _list_images(class_dir):
    """Return the sorted paths of every ``*.jpg`` file in ``INPUT/DIM/<class_dir>``.

    ``glob`` yields files in arbitrary, filesystem-dependent order. Sorting keeps
    the row order of the dataset stable, so the seeded train/test/val split
    selects the same files on every run.
    """
    return sorted(glob(str(Path(INPUT, DIM, class_dir, '*.jpg'))))


# list of Dog imgs
dogs = _list_images('Dog')

# list of Cat imgs
cats = _list_images('Cat')

print(f'Dogs {len(dogs)}')
print(f'Cats {len(cats)}')

Dogs 12499
Cats 12499


# Train, test and validation split

In [37]:
# Build one frame per class (Cat -> 1, Dog -> 0) and stack them, cats first,
# under a fresh 0..n-1 index.
dataset = pd.concat(
    [
        pd.DataFrame({'Path': paths, 'Class': [label] * len(paths)})
        for paths, label in ((cats, 1), (dogs, 0))
    ],
    ignore_index=True,
)

In [38]:
# First split: keep 70% of the rows for training and hold out 30% for
# test + validation, preserving the class balance via `stratify`.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    dataset[['Path']],
    dataset[['Class']],
    test_size=0.3,
    random_state=123,
    stratify=dataset[['Class']],
)

In [39]:
# Second split of the held-out rows: `test_size=0.7` sends 70% of them to the
# second output pair (X_val / y_val) and leaves 30% for X_test / y_test,
# again stratified by class.
X_test, X_val, y_test, y_val = train_test_split(
    X_test_val[['Path']],
    y_test_val[['Class']],
    test_size=0.7,
    random_state=123,
    stratify=y_test_val[['Class']],
)

In [40]:
# Sanity check: (rows, columns) of the training split.
X_train.shape

(17498, 1)

In [41]:
# Sanity check: (rows, columns) of the test split.
X_test.shape

(2250, 1)

In [42]:
# Sanity check: (rows, columns) of the validation split.
X_val.shape

(5250, 1)

# Train

In [43]:
# Dogs and cats
# Write every training image into SPLIT/Train/DIM/<Dog|Cat>/<original file name>.

# Numeric label -> class sub-folder (0 = Dog, 1 = Cat, matching `dataset`).
class_dirs = {0: 'Dog', 1: 'Cat'}

# Create the destination folders up front so image.save() cannot fail on a
# missing directory during a fresh run.
for class_dir in class_dirs.values():
    Path(SPLIT, 'Train', DIM, class_dir).mkdir(parents=True, exist_ok=True)

for idx, row in tqdm(X_train.iterrows()):
    class_dir = class_dirs.get(y_train.loc[idx, 'Class'])
    if class_dir is None:
        # Rows with an unexpected label are skipped (nothing is written).
        continue

    # Take the file name from the path itself rather than slicing by the length
    # of the 'Dog' folder path, which was only correct for cat images because
    # 'Dog' and 'Cat' happen to have the same length.
    file_name = Path(row['Path']).name

    # The context manager releases the source file even if saving fails.
    with Image.open(row['Path']) as image:
        image.save(str(Path(SPLIT, 'Train', DIM, class_dir, file_name)))



17498it [00:56, 308.62it/s]


# Test

In [44]:
# Dogs and cats
# Write every test image into SPLIT/Test/DIM/<Dog|Cat>/<original file name>.

# Numeric label -> class sub-folder (0 = Dog, 1 = Cat, matching `dataset`).
class_dirs = {0: 'Dog', 1: 'Cat'}

# Create the destination folders up front so image.save() cannot fail on a
# missing directory during a fresh run.
for class_dir in class_dirs.values():
    Path(SPLIT, 'Test', DIM, class_dir).mkdir(parents=True, exist_ok=True)

for idx, row in tqdm(X_test.iterrows()):
    class_dir = class_dirs.get(y_test.loc[idx, 'Class'])
    if class_dir is None:
        # Rows with an unexpected label are skipped (nothing is written).
        continue

    # Path.name yields the bare file name without depending on the length of
    # the source folder path.
    file_name = Path(row['Path']).name

    # The context manager releases the source file even if saving fails.
    with Image.open(row['Path']) as image:
        image.save(str(Path(SPLIT, 'Test', DIM, class_dir, file_name)))

2250it [00:06, 330.93it/s]


# Val

In [45]:
# Dogs and cats
# Write every validation image into SPLIT/Val/DIM/<Dog|Cat>/<original file name>.

# Numeric label -> class sub-folder (0 = Dog, 1 = Cat, matching `dataset`).
class_dirs = {0: 'Dog', 1: 'Cat'}

# Create the destination folders up front so image.save() cannot fail on a
# missing directory during a fresh run.
for class_dir in class_dirs.values():
    Path(SPLIT, 'Val', DIM, class_dir).mkdir(parents=True, exist_ok=True)

for idx, row in tqdm(X_val.iterrows()):
    class_dir = class_dirs.get(y_val.loc[idx, 'Class'])
    if class_dir is None:
        # Rows with an unexpected label are skipped (nothing is written).
        continue

    # Path.name yields the bare file name without depending on the length of
    # the source folder path.
    file_name = Path(row['Path']).name

    # The context manager releases the source file even if saving fails.
    with Image.open(row['Path']) as image:
        image.save(str(Path(SPLIT, 'Val', DIM, class_dir, file_name)))

5250it [00:16, 321.02it/s]
