In [1]:
import pandas as pd
import numpy as np
# Training index: one row per image, with its path and integer class label.
df = pd.read_csv('train_images.csv')
# Drop leading slashes so the stored paths are relative.
df['image_path'] = df['image_path'].str.lstrip('/')

# Class metadata. The name mapping is stored as a pickled dict, hence
# allow_pickle=True and .item().
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
class_names = np.load("class_names.npy", allow_pickle=True).item()
class_labels = [*class_names.values()]
class_attributes = np.load("attributes.npy")

# Attach each image's class attribute vector; the position of the label in
# class_labels selects the row of class_attributes.
df['attributes'] = df['label'].apply(
    lambda label: class_attributes[class_labels.index(label)]
)

In [2]:
from PIL import Image, ImageOps

def preprocess_image(image_path):
    """Load an image, letterbox it to a square and scale its pixels by 1/255.

    The image is padded with black to 500x500 via ``ImageOps.pad`` and then
    resized to 250x250 with Lanczos resampling.

    Args:
        image_path: Anything accepted by ``PIL.Image.open``.

    Returns:
        A float array of shape (250, 250, 3), or the string ``'error'`` when
        the decoded array has any other shape (for example a single-channel
        image). Callers filter rows on that string sentinel.
    """
    # The context manager closes the underlying file deterministically, even
    # if padding or resizing raises part-way through the dataset.
    with Image.open(image_path) as image:
        padded_image = ImageOps.pad(image, (500, 500), color='black')
        resized_image = padded_image.resize((250, 250), Image.LANCZOS)
        # Materialise the pixels while the file is still open.
        image_array = np.array(resized_image) / 255.0

    if image_array.shape == (250, 250, 3):
        return image_array
    return 'error'


In [3]:
# Run the preprocessing on every path; failures come back as the string 'error'.
image_paths = df['image_path']
df['processed_image'] = image_paths.apply(preprocess_image)


In [4]:
# Preprocessing marks failures with a string sentinel instead of an array,
# so any string entry in 'processed_image' identifies a failed row.
def _is_failed(entry):
    return isinstance(entry, str)

failed_mask = df['processed_image'].apply(_is_failed)
errors = df[failed_mask]

print(errors)


                 image_path  label  \
255    train_images/256.jpg      9   
740    train_images/741.jpg     25   
1825  train_images/1826.jpg     63   
1835  train_images/1836.jpg     63   
2818  train_images/2819.jpg    108   

                                             attributes processed_image  
255   [0.004741491045300055, 0.0, 0.0094829820905760...           error  
740   [0.01095033675287154, 0.014600449003846848, 0....           error  
1825  [0.013956050085295966, 0.1011813631185804, 0.0...           error  
1835  [0.013956050085295966, 0.1011813631185804, 0.0...           error  
2818  [0.0, 0.01547173546312857, 0.0, 0.0, 0.0, 0.0,...           error  


In [5]:
# Keep only the rows whose 'processed_image' is a real array, i.e. drop the
# entries that hold the 'error' string.
is_valid_image = df['processed_image'].apply(lambda entry: not isinstance(entry, str))
df = df[is_valid_image]


           image_path  label  \
0  train_images/1.jpg      1   
1  train_images/2.jpg      1   
2  train_images/3.jpg      1   
3  train_images/4.jpg      1   
4  train_images/5.jpg      1   

                                          attributes  \
0  [0.010638400403539122, 0.010638400403539122, 0...   
1  [0.010638400403539122, 0.010638400403539122, 0...   
2  [0.010638400403539122, 0.010638400403539122, 0...   
3  [0.010638400403539122, 0.010638400403539122, 0...   
4  [0.010638400403539122, 0.010638400403539122, 0...   

                                     processed_image  
0  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
1  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
2  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
3  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  
4  [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...  


In [6]:
# Samples per class, most frequent first; shows how imbalanced the labels are.
label_column = df['label']
label_column.value_counts()

label
1      35
2      35
4      35
10     34
16     33
       ..
196     6
197     5
198     5
199     5
200     5
Name: count, Length: 200, dtype: int64

In [7]:
# True when every remaining image array has the same shape.
len({image.shape for image in df['processed_image']}) == 1

True

In [8]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Classes with fewer than 35 samples are the ones that need synthetic examples.
class_counts = df['label'].value_counts()
needs_upsampling = class_counts < 35
underrepresented_classes = class_counts[needs_upsampling].index

# Random transformations used to synthesise new samples.
augmentation_settings = {
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
datagen = ImageDataGenerator(**augmentation_settings)

# Function to augment images
def augment_images(images, target_count):
    """Create ``target_count`` augmented images, drawing evenly from ``images``.

    The previous implementation generated ``target_count`` augmentations for
    every source image and then truncated the list, so every returned sample
    was derived from the first image and the remaining work was thrown away.
    Here the sources are visited round-robin, one augmentation per visit, so
    only ``target_count`` augmentations are generated in total.

    Args:
        images: Sequence of equally shaped image arrays (height, width, channels).
        target_count: Number of augmented images to produce.

    Returns:
        A list of ``target_count`` augmented image arrays, or an empty list
        when ``target_count`` is not positive or ``images`` is empty.
    """
    if target_count <= 0 or len(images) == 0:
        return []

    # Stack the sources into one (n, height, width, channels) batch so a single
    # iterator can serve all of them.
    source_stack = np.stack(images)

    augmented_images = []
    # shuffle=False makes the iterator walk the sources in order and wrap
    # around, which gives an even round-robin over the class's images.
    for batch in datagen.flow(source_stack, batch_size=1, shuffle=False):
        augmented_images.append(batch[0])
        if len(augmented_images) >= target_count:
            break  # the iterator is endless; stop once enough were drawn
    return augmented_images

# Upsample underrepresented classes
# Every class is topped up to this many samples (the size of the largest classes).
TARGET_PER_CLASS = 35

# Collect one frame per class and concatenate once at the end; concatenating
# inside the loop re-copies the whole (image-heavy) frame on every iteration.
augmented_frames = []
for label in underrepresented_classes:
    images = df.loc[df['label'] == label, 'processed_image'].tolist()
    target_count = TARGET_PER_CLASS - len(images)
    augmented_images = augment_images(images, target_count)

    # Verify the shapes
    for img in augmented_images:
        if img.shape != (250, 250, 3):
            print(f"Error: Augmented image has incorrect shape {img.shape}")

    # Ensure the number of augmented images matches the target count
    if len(augmented_images) != target_count:
        print(f"Error: Expected {target_count} augmented images, but got {len(augmented_images)}")

    # Size the label list from what was actually returned so a short batch is
    # reported above instead of crashing the DataFrame constructor.
    # The new rows carry no 'image_path' or 'attributes'; those columns are
    # NaN for augmented samples after the concat.
    augmented_frames.append(pd.DataFrame({
        'processed_image': augmented_images,
        'label': [label] * len(augmented_images),
    }))

if augmented_frames:
    df = pd.concat([df, *augmented_frames], ignore_index=True)

print(df['label'].value_counts())

label
1      35
138    35
128    35
129    35
130    35
       ..
70     35
71     35
72     35
73     35
200    35
Name: count, Length: 200, dtype: int64


In [10]:
# True when every class now has exactly 35 samples.
df['label'].value_counts().eq(35).all()

True

In [12]:
# df.to_csv('upsampled_np_arryed_images.csv', index=False)

In [13]:
# Persist the image column (an object array holding one image array per row).
processed_images = df['processed_image'].values
np.save('processed_images.npy', processed_images)

In [32]:
# For each label, remember the first 'attributes' value that is not missing.
attributes_dict = {
    label: df[df['label'] == label]['attributes'].dropna().iloc[0]
    for label in df['label'].unique()
}

# Function to fill NaN values in 'attributes' column based on 'label'
def fill_nan_attributes(row):
    """Return the row's attributes, or its label's stored value if any are NaN.

    ``row`` must provide ``'attributes'`` and ``'label'`` entries; the
    replacement is looked up in the module-level ``attributes_dict``.
    """
    current = row['attributes']
    contains_nan = np.isnan(current).any()
    return attributes_dict[row['label']] if contains_nan else current


# Row-wise pass: rows with missing attributes take their label's stored value.
filled_attributes = df.apply(fill_nan_attributes, axis=1)
df['attributes'] = filled_attributes

In [53]:
# Persist the attribute column, aligned row-for-row with the upsampled frame.
upsampled_attributes = df['attributes'].values
np.save('upsampled_attributes.npy', upsampled_attributes)