In [1]:
import os
import shutil
import tensorflow
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from PIL import Image

MUSHROOMS_PATH = 'mushrooms_dataset'

# Directory for the images and its subdirectories
images_dir = os.path.join(MUSHROOMS_PATH, 'images')
subdirs = [os.path.join(images_dir, subdir) for subdir in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, subdir))]

# Check if in each subdirectory there are more than 5 images
for subdir in subdirs:
    # list of all image files in subdirectory
    images = [img for img in os.listdir(subdir) if img.endswith('.jpg')]

    # if there are less than 5 images in the subdirectory, create a randomly flipped copy of each image in this subdirectory
    if len(images) <= 4:
        for img in images:
            img_path = os.path.join(subdir, img)
            flipped_img = Image.open(img_path).transpose(Image.FLIP_LEFT_RIGHT)
            flipped_img.save(os.path.join(subdir, img.replace('.jpg', '_flipped.jpg')))
            print(f'Created flipped image for {img_path} in {subdir}')



Created flipped image for mushrooms_dataset\images\Amanita_flavella\517934.jpg in mushrooms_dataset\images\Amanita_flavella
Created flipped image for mushrooms_dataset\images\Amanita_flavella\517934_flipped.jpg in mushrooms_dataset\images\Amanita_flavella
Created flipped image for mushrooms_dataset\images\Amanita_flavella\517935.jpg in mushrooms_dataset\images\Amanita_flavella
Created flipped image for mushrooms_dataset\images\Amanita_flavella\517935_flipped.jpg in mushrooms_dataset\images\Amanita_flavella
Created flipped image for mushrooms_dataset\images\Amanita_flavescens\540535.jpg in mushrooms_dataset\images\Amanita_flavescens
Created flipped image for mushrooms_dataset\images\Amanita_flavescens\540535_flipped.jpg in mushrooms_dataset\images\Amanita_flavescens
Created flipped image for mushrooms_dataset\images\Amanita_friabilis\28102.jpg in mushrooms_dataset\images\Amanita_friabilis
Created flipped image for mushrooms_dataset\images\Amanita_friabilis\28102_flipped.jpg in mushroom

In [5]:
# if you ran previous cell again, there will be duplicates named *_flipped_flipped.jpg, we cant have that so we remove them

import glob
path = images_dir
files = glob.glob(os.path.join(path, '**/*_flipped_flipped.jpg'), recursive=True)

for f in files:
    os.remove(f)

print('Removed all double flipped images')

Removed all double flipped images


In [7]:
#creating labels for the images
# labels = [os.path.basename(d) for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]
# labels

['Abortiporus_biennis',
 'Abundisporus_fuscopurpureus',
 'Acanthobasidium_delicatum',
 'Acanthobasidium_penicillatum',
 'Acanthobasidium_phragmitis',
 'Acanthophysellum_canadense',
 'Acanthophysellum_lividocoeruleum',
 'Acarospora',
 'Acarospora_boulderensis',
 'Acarospora_fuscata',
 'Acarospora_glaucocarpa',
 'Acarospora_nodulosa',
 'Acarospora_obnubila',
 'Acarospora_obpallens',
 'Acarospora_placodiiformis',
 'Acarospora_robiniae',
 'Acarospora_schleicheri',
 'Acarospora_socialis',
 'Acarospora_stapfiana',
 'Acarospora_strigata',
 'Acervus_epispartius',
 'Achlya_ambisexualis',
 'Achroomyces',
 'Achroomyces_vestitus',
 'Acrocordia',
 'Acrocordia_conoidea',
 'Acrocordia_gemmata',
 'Acrocordia_macrospora',
 'Acrospermum_compressum',
 'Aculops_rhois',
 'Adelges_tsugae',
 'Aegerita_candida',
 'Aegerita_webberi',
 'Agaricus',
 'Agaricus_abruptibulbus',
 'Agaricus_agrinferus',
 'Agaricus_alboargillascens',
 'Agaricus_albolutescens',
 'Agaricus_altipes',
 'Agaricus_amicosus',
 'Agaricus_andr

In [9]:
#Now we have some ideas for dividing the dataset into training and testing sets. We can use the train_test_split function from scikit-learn to divide the dataset into training and testing sets.
#But for that we will have to put the every image into array and then into a dataframe
#Then we will have to use ImageDataGenerator and flow_from_dataframe to load the images from the dataframe

#Second idea is to manually create the test set by taking 20% of the images from each class and putting them into a separate directory.
#We will then use ImageDataGenerator and flow_from_directory to load the images from the directory.

#In both ideas we need to take in consider stratification, so that the distribution of classes in the training and testing sets is similar.
#For example, if in one class there are 10 images and in another one there are 8 images, we want both  of them to have the same percentage of images in the training and testing sets.

#Third idea is to use the splitfolders library to divide the dataset into training and testing sets.
#But again we have to stratify the dataset which is not supported by that library.

#So the first idea might require a lot of memory usage, the second idea needs us to well do this manually which is not very efficient.
#And the third idea is not supporting stratification.

#So for now we will use the first idea and divide the dataset into training and testing sets using the train_test_split function from scikit-learn which has the stratify parameter.


In [12]:
#So the process with the first idea is as follows:
#1. Load the images and its corresponding labels into a dataframe.
#2. Divide the dataset into training and testing sets using the train_test_split function from scikit-learn with stratification.
#3. Use ImageDataGenerator and flow_from_dataframe to load the images from the dataframe.


'Abortiporus_biennis'

In [15]:
data = []
for subdir in subdirs:
    label = os.path.basename(subdir) #we specify the label for each image
    for filename in os.listdir(subdir):
        if filename.endswith('.jpg'):
            data.append((os.path.join(subdir, filename), label)) #we need to include whole path of the image for using flow_from_dataframe because it reads the images directly from the file system using the paths provided in the DataFrame.
data_df = pd.DataFrame(data, columns=['filename', 'label'])

In [16]:
data_df.head()

Unnamed: 0,filename,label
0,mushrooms_dataset\images\Amanita_farinosa\1201...,Amanita_farinosa
1,mushrooms_dataset\images\Amanita_farinosa\1201...,Amanita_farinosa
2,mushrooms_dataset\images\Amanita_farinosa\1207...,Amanita_farinosa
3,mushrooms_dataset\images\Amanita_farinosa\1207...,Amanita_farinosa
4,mushrooms_dataset\images\Amanita_farinosa\1227...,Amanita_farinosa


In [17]:
train_df, test_df = train_test_split(data_df, test_size=0.2, stratify=data_df['label'], random_state=42)

In [21]:
datagen = ImageDataGenerator(rescale=1./255, zoom_range=0.2, shear_range=0.2, validation_split=0.25,) #we use 25% from the 80% of the training set as the validation set which will be the same amount as the testing set

train_data = datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='filename',
    y_col='label',
    target_size=(128, 128),
    class_mode='categorical',
    batch_size=64,
    subset='training'
)


val_data = datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='filename',
    y_col='label',
    target_size=(128, 128),
    class_mode='categorical',
    batch_size=64,
    subset='validation'
)

datagen_test = ImageDataGenerator(rescale=1./255)

test_data = datagen_test.flow_from_dataframe(
    dataframe=test_df,
    x_col='filename',
    y_col='label',
    target_size=(128, 128),
    class_mode='categorical',
    batch_size=64
)

Found 150082 validated image filenames belonging to 7504 classes.
Found 50027 validated image filenames belonging to 7504 classes.
Found 50028 validated image filenames belonging to 6903 classes.


In [None]:
#It worked, but why do we have only 6903 classes in the test set and 7504 in training and validation sets? 
# Perhaps there are not enough images in some classes???

#//TODO
# It is probably true beacause when we use stratify parameter in train_test_split function, it tries to keep the distribution of classes in the training and testing sets similar.
# But if there are not enough images in some classes, it will not be able to keep the distribution of classes similar in the training and testing sets.
# So we have few solutions to this
# 1. Ensure that each class has a minimum number of instances before splitting the data into training and testing sets
# 2. Use tge stratify sampling only on the classes with sufficient instances, and randomly split the ones with too few instances

//TODO
Przemyśleć czy potrzebujemy tak dużej ilości gatunków grzybów oraz jak usunąć zdjęcia niebędące grzybami zbieralnymi, np mamy tutaj parę pleśni nie grzybów, dlatego trzeba by sprawdzić mniej więcej jakie grzyby czy inne twory tam mamy za pomocą dodatkowych plików csv które MushroomObserver udostępnia


In [22]:
#Creating the first model
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(7504, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(train_data, validation_data=val_data, epochs=20)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

KeyboardInterrupt: 

In [None]:
model.evaluate(test_data)