In [2]:
import pandas as pd
import numpy as np
from PIL import Image
import os
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import pickle


In [None]:
# Root directory that holds one subfolder per class label
root_folder = r"Resources"
# os.listdir returns entries in arbitrary order, so sort them to keep the
# label order (and everything derived from it) the same on every machine.
sub_folder_names = sorted(os.listdir(root_folder))
sub_folder_names

In [None]:
# Build one (image_id, label) dataframe per class subfolder, then stack them.
df_list = []

# Sorted iteration keeps the row order identical across machines, because
# os.listdir returns entries in arbitrary order.
for sub_folder_name in sorted(sub_folder_names):
    subfolder_path = os.path.join(root_folder, sub_folder_name)
    if not os.path.isdir(subfolder_path):
        continue  # skip stray files at the root, such as .DS_Store

    # Keep regular, non-hidden files only: a hidden entry such as .DS_Store
    # (or a nested directory) inside a class folder would otherwise be
    # labelled as an image and fail when the images are opened later.
    file_names = sorted(
        name for name in os.listdir(subfolder_path)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(subfolder_path, name))
    )
    df = pd.DataFrame(file_names, columns=['image_id'])
    df['label'] = sub_folder_name
    df_list.append(df)

# Fail with a readable message instead of pandas' generic
# "No objects to concatenate" when no class folder was found.
if not df_list:
    raise ValueError(f"No class subfolders found under {root_folder!r}")

# ignore_index gives every row a unique 0..n-1 index instead of restarting
# the index at 0 for each subfolder.
concat_df = pd.concat(df_list, ignore_index=True)
concat_df

In [None]:
# Summarise the frame (dtypes and non-null counts per column) to check for null values
concat_df.info()

In [None]:
# Number of distinct image ids; compare it with the row count to confirm
# that no file name occurs twice (missing values, if any, are counted too)
concat_df['image_id'].nunique(dropna=False)

In [None]:
# Count how many rows (images) carry each label
concat_df['label'].value_counts()

In [None]:
# Horizontal bar chart of images per label, smallest class first
label_counts = concat_df['label'].value_counts()
label_counts.sort_values().plot.barh()

In [8]:
# Loaded images, in the same row order as concat_df
imgs = []
# File format PIL reports for each image, in the same order
imgs_format = []

# Walk the label/image_id columns in lockstep and rebuild each file path
for img_label, img_name in zip(concat_df['label'], concat_df['image_id']):
    file_path = os.path.join(root_folder, img_label, img_name)
    with Image.open(file_path) as img:
        imgs_format.append(img.format)
        # Store a copy, since img itself is closed when the with-block exits
        imgs.append(img.copy())
    


In [None]:
# Distinct file formats seen while loading; a single entry means all files share one format
{fmt for fmt in imgs_format}

In [None]:
# Collect the distinct sizes (img.size) across the loaded images
sizes = {picture.size for picture in imgs}
sizes

In [None]:
# Histogram of image sizes, keyed by the string form of each size tuple
img_size_count = {}

for picture in imgs:
    size_key = str(picture.size)
    # Start from 0 the first time a size is seen, then add one
    img_size_count[size_key] = img_size_count.get(size_key, 0) + 1

# Display the counts to see which size is most common in the dataset
img_size_count

In [None]:
# Common size every image is resized to (chosen by the author as the most
# frequent size in the dataset)
target_size = (150, 150)

# Resize each image with Lanczos resampling
resized_imgs = []
for original in imgs:
    resized_imgs.append(original.resize(target_size, resample=Image.LANCZOS))

# Display one resized image as a visual check
resized_imgs[1]

In [None]:
# Verify the resize worked: only target_size should remain in this set
sizes = {picture.size for picture in resized_imgs}
sizes

In [None]:
# Distinct colour modes among the resized images; this only reports them
# (nothing is converted), so the expected result is {'RGB'}
{picture.mode for picture in resized_imgs}

In [None]:
# Turn every resized image into a float32 numpy array
float_images = []
for picture in resized_imgs:
    pixel_array = np.array(picture)
    float_images.append(pixel_array.astype(np.float32))

# Show the raw pixel values of the first image
print("Pixel Values:")
print(float_images[0])

In [None]:
# Scale pixel values into the 0-1 range by dividing by 255, the value the
# author treats as the maximum pixel value
normalized_images = []
for pixel_array in float_images:
    normalized_images.append(pixel_array / 255)

# Show the scaled pixel values of the first image
print("Pixel Values:")
print(normalized_images[0])

In [None]:
# Stack the list of normalized images into one numpy array (the model input)
X = np.array(
    normalized_images
)
X

In [None]:
# Dimensions of X, the array built from the normalized images
X.shape

In [None]:
# Target vector: the label column of concat_df
y = concat_df['label']
# Distinct label values present in the target
set(y.tolist())

In [None]:
# Label encode the y data.
# Fit on the label column itself rather than on y: y is overwritten below,
# so fitting on y would refit the encoder on integer codes if this cell is
# re-run, and y_encoder.classes_ would no longer hold the label names.
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(concat_df['label'])
set(y)


In [21]:
# Hold out 20% of the samples as a test set; random_state fixes the shuffle
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Dimensions of the training portion returned by train_test_split
X_train.shape

In [25]:
# unique, frequency = np.unique(y_train_aug, 
#                               return_counts = True)

# print(unique)
# print(frequency)

In [26]:
# unique, frequency = np.unique(y_test, 
#                               return_counts = True)

# print(unique)
# print(frequency)

In [None]:
# # Define the augmentation pipeline
# data_augmentation = tf.keras.Sequential([
#     tf.keras.layers.RandomRotation(0.05),        # Random rotation (factor 0.05 of a full turn, i.e. up to ±18 degrees)
#     tf.keras.layers.RandomZoom(0.2),             # Random zoom
#     tf.keras.layers.RandomFlip('horizontal')     # Random horizontal flip

# ])


In [None]:
# # Create an empty list for both X and y augmentations
# X_train_aug = []
# y_train_aug = []

# # Loop through each image in the training data
# for i in range(len(X_train)):
#     # Select the image and its y label
#     img = X_train[i]
#     label = y_train[i]

#     # Add the batch dimension
#     img = np.expand_dims(img, axis=0)

#     # Use a loop to create 1 new image per original (range(1) runs once)
#     # Append each to X_train_aug
#     # For each image, append the correct label to y_train_aug
#     for j in range(1):
#         X_train_aug.append(data_augmentation(img, training=True)[0].numpy())
#         y_train_aug.append(label)

# # Print the lengths of both augmented sets to ensure they are the same length
# print(len(X_train_aug))
# print(len(y_train_aug))

In [22]:
# with open('X_test.pkl', 'wb') as file:
#     pickle.dump(X_test, file)

In [23]:
# with open('y_test.pkl', 'wb') as file:
#     pickle.dump(y_test, file)

In [4]:
def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: pickle.load can execute arbitrary code; only load files you created.
# X_test / y_test loaded here replace any values produced by the split above.
X_train_aug = _read_pickle('X_train_aug.pkl')
y_train_aug = _read_pickle('y_train_aug.pkl')

y_test = _read_pickle('y_test.pkl')
X_test = _read_pickle('X_test.pkl')

In [67]:
# # Visualize the original and augmented images
# plt.figure(figsize=(12, 6))
# for i in range(3):
#     plt.subplot(1, 3, i + 1)
#     if i == 0:
#         plt.imshow((reshaped_image_array[0, :, :, 0]*255).astype('uint8'), cmap='gray')  # Original image
#     else:
#         plt.imshow((augmented_images[i - 1][:, :, 0]*255).astype('uint8'), cmap='gray')
#     plt.axis('off')

# plt.show()

In [None]:
# Echo the training images loaded from X_train_aug.pkl (displays the whole object)
X_train_aug

In [None]:
# Make numpy-array versions of the four loaded datasets in one pass
X_train_aug_np, X_test_np, y_train_aug_np, y_test_np = (
    np.array(dataset)
    for dataset in (X_train_aug, X_test, y_train_aug, y_test)
)

In [None]:
# Use the numpy version: the unpickled X_train_aug may be a plain list,
# which has no .shape attribute
X_train_aug_np.shape

In [None]:
# Use the numpy version: the unpickled y_train_aug may be a plain list,
# which has no .shape attribute
y_train_aug_np.shape

In [None]:
# Dimensions of the test images (numpy version created above)
X_test_np.shape

In [None]:
# Dimensions of the test labels (numpy version created above)
y_test_np.shape

In [None]:
# Define a CNN model: one conv/pool stage followed by a dense classifier head
model = keras.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    # layers.Conv2D(64, (3, 3), activation='relu'),
    # layers.MaxPooling2D((2, 2)),
    # layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(6, activation='softmax')  #6 classes
])

# fit() expects arrays; the unpickled data may be plain Python lists, which
# Keras would interpret as several separate model inputs. np.asarray leaves
# existing arrays untouched and converts lists.
X_fit = np.asarray(X_train_aug)
y_fit = np.asarray(y_train_aug)
X_val = np.asarray(X_test)
y_val = np.asarray(y_test)

# Pick the loss that matches the label layout: 1-D integer class ids (what
# LabelEncoder produces) need the sparse variant, while
# 'categorical_crossentropy' only fits one-hot targets of shape (n, 6).
if y_fit.ndim == 1:
    loss_name = 'sparse_categorical_crossentropy'
else:
    loss_name = 'categorical_crossentropy'

# Compile the model
model.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])

# Train the model
# NOTE: the held-out test set doubles as validation data here, so the
# reported val_accuracy is not an independent estimate if it is used for tuning.
epochs = 10
history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=32
)