## About: Use paperclip images to regress the number of paperclips
## Date: 31/03/23

An attempt to predict, by regression, the number of paperclips visible in each image. A simple CNN is used, and the images are fed in with all three colour channels.

Dataset author: Jeff Heaton <br/>
Code inspired by: Jeff Heaton


In [None]:
# importing necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import numpy as np
import pandas as pd
import os

In [None]:
# Mount Google Drive inside the Colab VM; the dataset paths used by the
# following cells all live under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Load the label table ('NA' and '?' are treated as missing values) and
# derive each image's file name from its id: clips-<id>.png
df = pd.read_csv(
    "/content/drive/My Drive/paperclips_dataset/train.csv",
    na_values=['NA', '?'],
).assign(filename=lambda frame: 'clips-' + frame['id'].astype(str) + '.png')
df.head()

Unnamed: 0,id,clip_count,filename
0,30001,11,clips-30001.png
1,30002,2,clips-30002.png
2,30003,26,clips-30003.png
3,30004,41,clips-30004.png
4,30005,49,clips-30005.png


In [None]:
# Sequential (unshuffled) three-way split of df:
#   1. the first TRAIN_PCT of the rows form a pool, the remainder is validation;
#   2. the first TEST_PCT of that pool is the final training set, the rest is test.
TRAIN_PCT = 0.9
TRAIN_CUT = int(len(df) * TRAIN_PCT)

# NOTE: despite its name, TEST_PCT is the fraction of the pool KEPT for
# training; the test set receives the remaining (1 - TEST_PCT).
TEST_PCT = 0.9

# .copy() makes each split an independent frame instead of a slice of df, so
# adding columns to a split later (e.g. predictions on df_test) does not raise
# SettingWithCopyWarning.
df_validate = df.iloc[TRAIN_CUT:].copy()
df_train = df.iloc[:TRAIN_CUT]

TEST_CUT = int(len(df_train) * TEST_PCT)

df_test = df_train.iloc[TEST_CUT:].copy()
df_train = df_train.iloc[:TEST_CUT].copy()

# the three splits must partition df exactly
assert len(df_train) + len(df_validate) + len(df_test) == len(df)

# print their sizes
print(f'Train size: {len(df_train)}')
print(f'Validation size: {len(df_validate)}')
print(f'Test size: {len(df_test)}')

Train size: 16200
Validation size: 2000
Test size: 1800


In [None]:
# Settings shared by the image generators and the CNN model.
IMAGES_DIR = '/content/drive/My Drive/paperclips_dataset/clips-data-2020/clips'  # directory passed to the generators
Y_COL = 'clip_count'    # target column of the dataframe
HEIGHT = WIDTH = 100    # size the images are resized to / model input size
BATCH = 32              # generator batch size
EPOCHS = 10             # upper bound on training epochs

In [None]:
# Training generator: rescale pixel values to [0, 1] and augment the images
# with horizontal / vertical flips.
training_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

train_generator = training_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory=IMAGES_DIR,
    x_col="filename",
    y_col=Y_COL,
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH,
    # 'raw' yields the numeric y_col values unchanged (regression target);
    # it replaces the deprecated alias 'other'.
    class_mode='raw'
)

# Validation generator: only rescaling, no augmentation.
validation_datagen = ImageDataGenerator(rescale=1./255)

val_generator = validation_datagen.flow_from_dataframe(
    dataframe=df_validate,
    directory=IMAGES_DIR,
    x_col="filename",
    y_col=Y_COL,
    target_size=(HEIGHT, WIDTH),
    # explicit, so validation follows BATCH instead of the library default
    batch_size=BATCH,
    class_mode='raw'
)

Found 16199 validated image filenames.




Found 2000 validated image filenames.


In [None]:
# Build, compile and train the CNN regressor with early stopping.
from tensorflow.keras.callbacks import EarlyStopping

model = tf.keras.models.Sequential([
    # Input is a HEIGHT x WIDTH image with 3 colour channels.
    # First convolution + pooling stage
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(HEIGHT, WIDTH, 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    # Second convolution + pooling stage
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    # 512 neuron hidden layer
    tf.keras.layers.Dense(512, activation='relu'),
    # Single linear unit: the predicted clip count
    tf.keras.layers.Dense(1, activation='linear')
])

model.summary()

model.compile(loss='mean_squared_error', optimizer='adam')

# Stop once val_loss has not improved by at least 1e-3 for 5 consecutive
# epochs, then roll the model back to the best weights seen.
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
    restore_best_weights=True
)

# steps_per_epoch / validation_steps are intentionally not passed: each epoch
# runs over the full length of the generators.
history = model.fit(
    train_generator,
    verbose=1,
    validation_data=val_generator,
    callbacks=[monitor],
    epochs=EPOCHS
)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 98, 98, 64)        1792      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 49, 49, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 47, 47, 64)        36928     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 23, 23, 64)       0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 33856)             0         
                                                                 
 dense (Dense)               (None, 512)               1

In [None]:
# DISABLED CODE - kept for reference only.
# Alternative way of building df_test from a separate image folder instead of
# the split of df made above. It is written as comments rather than a string
# literal so that the cell no longer echoes it as output.
# NOTE(review): `load_images_from_folder` is not defined in the visible part of
# this notebook and the path points at 'tox_files' - confirm both before
# re-enabling.
#
# # loading test images paths into df
# # drop duplicates
# # add the column with target values
# TEST_IMAGES_DIR = '/content/drive/My Drive/tox_files/test_images'
# test_images = load_images_from_folder(TEST_IMAGES_DIR)
# df_test = pd.DataFrame (test_images, columns = ['filename', 'id'])
# df_test = df_test.drop_duplicates(subset=['id'])
# df_test = pd.merge(df_test,df[['id',Y_COL]],on='id', how='left')
# df_test.head(5)

In [None]:
# Generator over the held-out test rows: same rescaling as training, no
# augmentation and no labels (class_mode=None yields images only).
test_datagen = ImageDataGenerator(rescale=1. / 255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    directory=IMAGES_DIR,
    x_col="filename",
    target_size=(HEIGHT, WIDTH),
    class_mode=None,
    # one image per batch, in dataframe order
    batch_size=1,
    shuffle=False,
)

Found 1800 validated image filenames.


In [None]:
# Rewind the generator to its first batch, then predict over the test set.
test_generator.reset()
# len(test_generator) is the number of batches in one full pass, so the step
# count stays correct even if the generator's batch_size is changed
# (len(df_test) was only right because batch_size happens to be 1).
pred = model.predict(test_generator, steps=len(test_generator))



In [None]:
# Put the predicted values beside the actual clip_count values.
# Fail early with a clear message if the number of predictions does not match
# the number of test rows.
assert len(pred) == len(df_test), (
    f'{len(pred)} predictions for {len(df_test)} test rows'
)
# .assign returns a new frame, so nothing is written into a slice of df
# (no SettingWithCopyWarning) and the cell can be re-run safely.
df_test = df_test.assign(**{'Predicted values': pred.flatten()})
df_test

Unnamed: 0,id,clip_count,filename,Predicted values
16200,46201,7,clips-46201.png,7.295506
16201,46202,56,clips-46202.png,55.417664
16202,46203,5,clips-46203.png,5.841433
16203,46204,36,clips-46204.png,36.890785
16204,46205,71,clips-46205.png,67.102493
...,...,...,...,...
17995,47996,9,clips-47996.png,8.777569
17996,47997,45,clips-47997.png,50.643719
17997,47998,52,clips-47998.png,50.076843
17998,47999,63,clips-47999.png,49.328999
