In [None]:
# Keep in mind that this below is the structure of the data directory

archive/
├── train/
│   ├── Dog/ dog_image1, ... 
│   ├── Cat/ cat_image1, ... 
│   └── Bird/ bird_image1, ... 
└── test/
    ├── Dog/ dog_image1, ... 
    ├── Cat/ cat_image1, ... 
    └── Bird/ bird_image1, ... 

In [None]:
# These are the requirement pkgs for this notebook 

pandas                    2.2.2
scikit-learn              1.5.1
tensorflow                2.17.0

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_dataset(directory):
    
    image_paths = []
    labels = []
    
    directory_ = os.listdir(directory)
    
    for i in directory_:
        if i.startswith('.') or i.startswith('_'):
            directory_.remove(i)
        
    for folder in directory_:
        for filename in os.listdir(directory+folder):
            image_path = os.path.join(directory, folder, filename)

            if filename.startswith('.') or filename.startswith('_') or filename[-4:] != '.jpg':
                pass

            else:
                image_paths.append(image_path)
                labels.append(folder)
        
    return image_paths, labels

In [3]:
# Train
train_directory ="archive/train/"
train = pd.DataFrame()
train['image'], train['label'] = load_dataset(train_directory)

# shuffle the dataset
control = 'label'
random_order = np.random.permutation(len(train))
train['RandomOrder'] = random_order
train = train.sort_values(by=['RandomOrder', control]).reset_index(drop=True)
train = train.drop(columns=['RandomOrder'])

In [4]:
train

Unnamed: 0,image,label
0,archive/train/cats/Abyssinian_160.jpg,cats
1,archive/train/dogs/german_shorthaired_97.jpg,dogs
2,archive/train/cats/Birman_188.jpg,cats
3,archive/train/birds/NORTHERN_CARDINAL_114.jpg,birds
4,archive/train/birds/GYRFALCON_026.jpg,birds
...,...,...
5236,archive/train/birds/CINNAMON_TEAL_019.jpg,birds
5237,archive/train/cats/Bengal_5.jpg,cats
5238,archive/train/birds/MALAGASY_WHITE_EYE_100.jpg,birds
5239,archive/train/birds/RED_TAILED_THRUSH_077.jpg,birds


In [5]:
train['label']

0        cats
1        dogs
2        cats
3       birds
4       birds
        ...  
5236    birds
5237     cats
5238    birds
5239    birds
5240     dogs
Name: label, Length: 5241, dtype: object

In [6]:
train['image']

0                archive/train/cats/Abyssinian_160.jpg
1         archive/train/dogs/german_shorthaired_97.jpg
2                    archive/train/cats/Birman_188.jpg
3        archive/train/birds/NORTHERN_CARDINAL_114.jpg
4                archive/train/birds/GYRFALCON_026.jpg
                             ...                      
5236         archive/train/birds/CINNAMON_TEAL_019.jpg
5237                   archive/train/cats/Bengal_5.jpg
5238    archive/train/birds/MALAGASY_WHITE_EYE_100.jpg
5239     archive/train/birds/RED_TAILED_THRUSH_077.jpg
5240                  archive/train/dogs/boxer_199.jpg
Name: image, Length: 5241, dtype: object

In [7]:
encoder = LabelEncoder()
encoder.fit(train['label'])
y_train = encoder.transform(train['label'])

In [8]:
y_train

array([1, 2, 1, ..., 0, 0, 2])

In [9]:
train_feature_path = train['image'].values

In [10]:
train_feature_path

array(['archive/train/cats/Abyssinian_160.jpg',
       'archive/train/dogs/german_shorthaired_97.jpg',
       'archive/train/cats/Birman_188.jpg', ...,
       'archive/train/birds/MALAGASY_WHITE_EYE_100.jpg',
       'archive/train/birds/RED_TAILED_THRUSH_077.jpg',
       'archive/train/dogs/boxer_199.jpg'], dtype=object)

In [11]:
# Function that read and preprocess the image dir and return image and label
def _parse_data(train_feature_path, y_train):
    image_string = tf.io.read_file(train_feature_path)
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image_resized = tf.image.resize(image_decoded, [224, 224])  # Resize if necessary
    return image_resized, y_train

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((train_feature_path, y_train))
dataset = dataset.map(_parse_data)

In [13]:
# Shuffle and batch the dataset
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(batch_size=32)

In [14]:
# Prefetch data
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [25]:
# Create simple model
model = tf.keras.Sequential([tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
                             tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
                             tf.keras.layers.Flatten(),
                             tf.keras.layers.Dense(128, activation='relu'),
                             tf.keras.layers.Dense(3, activation='softmax')])

In [23]:
# Compile the model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [24]:
# Model summary
model.summary()

In [18]:
# Train the model
model.fit(dataset, epochs=10)

Epoch 1/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 331ms/step - accuracy: 0.4856 - loss: 2679.7000
Epoch 2/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 327ms/step - accuracy: 0.8422 - loss: 3.0030
Epoch 3/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 339ms/step - accuracy: 0.9303 - loss: 0.9540
Epoch 4/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 335ms/step - accuracy: 0.9830 - loss: 0.1691
Epoch 5/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 332ms/step - accuracy: 0.9829 - loss: 0.2170
Epoch 6/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 323ms/step - accuracy: 0.9872 - loss: 0.1671
Epoch 7/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 328ms/step - accuracy: 0.9933 - loss: 0.0375
Epoch 8/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 337ms/step - accuracy: 0.9914 - loss: 0.0683
Epoch 9/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 338ms/step - accuracy: 0.9859 - loss: 0.1330
Epoch 10/10


Corrupt JPEG data: premature end of data segment
Corrupt JPEG data: 240 extraneous bytes before marker 0xd9


[1m164/164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 337ms/step - accuracy: 0.9764 - loss: 0.4070


<keras.src.callbacks.history.History at 0x17afbba70>