# Deep Learning final project

#### Load packages

In [24]:
#pip uninstall keras
#!pip install keras
#!pip install tensorflow
#!pip install sklearn

In [101]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import keras
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from keras.optimizers import RMSprop

In [102]:
#get working directory
os. getcwd()

'/home/akarunakaran1/BotanistData'

In [54]:
#change working directory to
working_di_path = "/home/akarunakaran1/BotanistData/"
os. chdir(working_di_path)
os.getcwd()

'/home/akarunakaran1/BotanistData'

In [103]:
#read csv file
botanist_csv = pd.read_csv("/home/akarunakaran1/BotanistData/Botanist_Training_Set.csv",dtype=str)
print("Number of rows in .csv file is: ", botanist_csv.shape)

Number of rows in .csv file is:  (50000, 2)


In [104]:
#split into train and test data
train, test = train_test_split(botanist_csv, test_size=0.1)
print("Number of records for train", len(train))
print("Number of records for test", len(test))

Number of records for train 45000
Number of records for test 5000


In [32]:
train.columns

Index(['FileName', 'Label'], dtype='object')

In [105]:
#append the Filename in .csv file with .jpg
def append_ext(fn):
    return fn+".jpg"

train_df = train.copy()
test_df = test.copy()

In [106]:
train_df["FileName"] = train_df["FileName"].apply(append_ext)
print(train_df.head(5))

test_df["FileName"] = test_df["FileName"].apply(append_ext)
print(test_df.head(5))

           FileName Label
17988  45475314.jpg    26
6838   75420537.jpg    11
3612   28844103.jpg    25
648    51844002.jpg    20
32978  26973763.jpg     7
           FileName Label
12805  76528282.jpg    11
19020  13689463.jpg    34
22850  77606856.jpg    17
37423  35148993.jpg    33
37330  69622618.jpg     7


In [107]:
#object for ImageDataGenerator
datagen = ImageDataGenerator(rescale=1./255.,validation_split=0.25)

In [108]:
#setting 
#batch_size = 32
#seed = 42

#split into train-validation data
train_generator = datagen.flow_from_dataframe( dataframe=train_df, 
                                              directory="/home/akarunakaran1/BotanistData/Trainfiles/",
                                              x_col="FileName",y_col="Label",subset="training",batch_size=32,seed=42,shuffle=True,
                                              class_mode="categorical",target_size=(32,32))

valid_generator = datagen.flow_from_dataframe(dataframe=train_df, 
                                              directory="/home/akarunakaran1/BotanistData/Trainfiles/",
                                              x_col="FileName",y_col="Label",subset="validation",batch_size=32,seed=42,shuffle=True,
                                              class_mode="categorical",target_size=(32,32))

Found 33750 validated image filenames belonging to 38 classes.
Found 11250 validated image filenames belonging to 38 classes.


In [109]:
#fit model
model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',input_shape=(32,32,3)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
#we have 38 label classes
model.add(Dense(38, activation='softmax'))

# Compile model
model.compile(optimizers.RMSprop(lr=0.0001, decay=1e-6),
              loss="categorical_crossentropy",metrics=["accuracy"])

In [110]:
#setting values
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size

#fitting the model
model.fit_generator(generator=train_generator,steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,validation_steps=STEP_SIZE_VALID,epochs=10)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4f448e65e0>

In [111]:
#evaluate against validation data
score = model.evaluate_generator(generator=valid_generator,steps=STEP_SIZE_VALID)
print('Test loss:', score[0])
print('Test accuracy:', score[1])



Test loss: 0.4969847500324249
Test accuracy: 0.8475783467292786


In [112]:
#test generator
test_datagen = ImageDataGenerator(rescale=1./255.)

test_generator = datagen.flow_from_dataframe(dataframe=test_df, directory="/home/akarunakaran1/BotanistData/Trainfiles/",
                                             x_col="FileName",y_col="Label",subset="validation",batch_size=2,seed=42,shuffle=False,
                                             class_mode="categorical",target_size=(32,32))

Found 1250 validated image filenames belonging to 38 classes.


In [113]:
test_generator.reset()
STEP_SIZE_TEST = test_generator.n//test_generator.batch_size
STEP_SIZE_TEST

625

In [114]:
#prediction
pred = model.predict_generator(test_generator,steps=STEP_SIZE_TEST,verbose=1)
print(len(pred))
print("predictions on test data",pred)

predicted_class_indices = np.argmax(pred,axis=1)
print("predicted_class_indices",predicted_class_indices)



1250
predictions on test data [[6.91463763e-04 1.68432493e-03 3.68226141e-01 ... 5.05015603e-04
  1.54414281e-01 2.77566258e-04]
 [2.04891970e-04 3.65200670e-09 1.85727607e-04 ... 7.26991057e-05
  5.66264043e-06 1.63769101e-07]
 [1.11551285e-01 8.24605286e-06 3.18547990e-03 ... 2.33017560e-03
  8.38558096e-03 1.46670820e-04]
 ...
 [3.57120916e-05 3.95739221e-07 1.94174372e-05 ... 1.86047598e-03
  4.90963794e-06 1.82916352e-07]
 [1.03989134e-06 9.23319399e-01 6.06155314e-04 ... 1.58049977e-06
  1.60980606e-04 2.47811414e-02]
 [1.26086780e-10 2.22576645e-12 4.17999619e-12 ... 1.75431717e-06
  5.96850347e-12 1.25386914e-11]]
predicted_class_indices [18 27  8 ...  7  1 31]


In [115]:
#assigning corresponding labels to test data
labels = (train_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]
len(predictions)

1250

In [116]:
#creating dataframe with filename and prediction values
filenames = test_generator.filenames
#len(filenames)
results = pd.DataFrame({"FileName":filenames, "Predictions":predictions})
print(results.head(5))

       FileName Predictions
0  76528282.jpg          26
1  13689463.jpg          34
2  77606856.jpg          17
3  35148993.jpg          34
4  69622618.jpg           7


In [117]:
print("Working directory",os.getcwd())

#save result
results.to_csv("testdata_prediction.csv",index=False)

Working directory /home/akarunakaran1/BotanistData


In [121]:
test_df.head(5)

Unnamed: 0,FileName,Label
12805,76528282.jpg,11
19020,13689463.jpg,34
22850,77606856.jpg,17
37423,35148993.jpg,33
37330,69622618.jpg,7


In [119]:
#find accuracy on testdata

atcual_prediction = pd.merge(test_df, results, on='FileName')
atcual_prediction.shape

(1250, 3)

In [120]:
#find accuracy
print(accuracy_score(atcual_prediction.Label, atcual_prediction.Predictions))


0.8312


In [122]:
#fit new model with more layers
model1 = Sequential()
model1.add(Conv2D(32, (3, 3), padding='same',input_shape=(32,32,3)))
model1.add(Activation('relu'))
model1.add(Conv2D(32, (3, 3)))
model1.add(Activation('relu'))
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.25))
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.25))
model1.add(Conv2D(64, (3, 3), padding='same'))
model1.add(Activation('relu'))
model1.add(Conv2D(64, (3, 3)))
model1.add(Activation('relu'))
model1.add(MaxPooling2D(pool_size=(2, 2)))
model1.add(Dropout(0.25))
model1.add(Flatten())
model1.add(Dense(512))
model1.add(Activation('relu'))
model1.add(Dropout(0.5))
#we have 38 label classes
model1.add(Dense(38, activation='softmax'))

# Compile model
model1.compile(optimizers.RMSprop(lr=0.0002, decay=1e-6),
               loss="categorical_crossentropy",metrics=["accuracy"])

In [123]:
#setting values
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size

#fitting the model
model1.fit_generator(generator=train_generator,steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,validation_steps=STEP_SIZE_VALID,epochs=10)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4de0223ee0>

In [124]:
#evaluate against validation data
score = model1.evaluate_generator(generator=valid_generator,steps=STEP_SIZE_VALID)
print('Test loss:', score[0])
print('Test accuracy:', score[1])



Test loss: 0.6202677488327026
Test accuracy: 0.8226495981216431


In [125]:
#prediction
pred = model1.predict_generator(test_generator,steps=STEP_SIZE_TEST,verbose=1)
print(len(pred))
print("predictions on test data",pred)

predicted_class_indices = np.argmax(pred,axis=1)
print("predicted_class_indices",predicted_class_indices)

  1/625 [..............................] - ETA: 1:14



1250
predictions on test data [[1.43880767e-04 1.42836943e-02 4.40297216e-01 ... 3.51437158e-03
  3.33768189e-01 4.99110785e-04]
 [2.40372564e-03 1.50626793e-05 6.91432087e-03 ... 4.47757542e-03
  2.05724942e-03 3.98780576e-05]
 [5.00525115e-03 4.18799800e-07 8.73054450e-05 ... 1.84918725e-04
  3.20935069e-05 1.11485417e-06]
 ...
 [1.04129940e-05 3.03051165e-05 5.36562547e-05 ... 1.09939417e-03
  4.88062142e-06 1.31222407e-06]
 [8.94228094e-07 8.33474278e-01 9.19489656e-03 ... 3.15876445e-04
  1.57263167e-02 6.73807785e-03]
 [1.37353018e-09 1.05127285e-07 1.92731076e-09 ... 6.90359059e-07
  7.48440920e-10 3.01349843e-08]]
predicted_class_indices [ 2 25  8 ...  7  1 31]


In [126]:
#assigning corresponding labels to test data
labels = (train_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]
len(predictions)

1250

In [127]:
#creating dataframe with filename and prediction values
filenames = test_generator.filenames
#len(filenames)
results1 = pd.DataFrame({"FileName":filenames, "Predictions":predictions})
print(results1.head(5))

       FileName Predictions
0  76528282.jpg          11
1  13689463.jpg          32
2  77606856.jpg          17
3  35148993.jpg          34
4  69622618.jpg           7


In [128]:
print("Working directory",os.getcwd())

#save result
results1.to_csv("testdata_prediction_newModel.csv",index=False)

Working directory /home/akarunakaran1/BotanistData


In [129]:
test_df.head(5)

Unnamed: 0,FileName,Label
12805,76528282.jpg,11
19020,13689463.jpg,34
22850,77606856.jpg,17
37423,35148993.jpg,33
37330,69622618.jpg,7


In [130]:
#find accuracy on testdata

atcual_prediction1 = pd.merge(test_df, results1, on='FileName')
atcual_prediction1.shape

(1250, 3)

In [131]:
#find accuracy
print(accuracy_score(atcual_prediction1.Label, atcual_prediction1.Predictions))


0.8104


#### REFERENCES:
+ https://vijayabhaskar96.medium.com/tutorial-on-keras-flow-from-dataframe-1fd4493d237c
+ https://www.kaggle.com/yassineghouzam/introduction-to-cnn-keras-0-997-top-6
+ https://stackoverflow.com/questions/62218611/how-to-train-a-model-with-a-dataset-in-which-image-dataset-is-given-and-label-fo
+ https://studymachinelearning.com/keras-imagedatagenerator-with-flow_from_dataframe/

