In [2]:
import os
import shutil
import tensorflow
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from PIL import Image

MUSHROOMS_PATH = 'mushrooms_dataset'

# Directory holding the images; its subdirectories are collected below.
images_dir = os.path.join(MUSHROOMS_PATH, 'images')

# Full paths of all subdirectories of images_dir (plain files are ignored).
subdirs = list(
    filter(
        os.path.isdir,
        (os.path.join(images_dir, name) for name in os.listdir(images_dir)),
    )
)






In [3]:
# Number of subdirectories found in images_dir.
len(subdirs)

7728

Next, we check whether every entry in the dataset is actually a fungus, i.e. whether it belongs to the kingdom Fungi.

In [4]:
# Taxonomic classification table (tab-separated), one row per name_id.
fungus_types_csv = pd.read_csv("name_classifications.csv", sep="\t")
fungus_types_df = pd.DataFrame(data=fungus_types_csv)
fungus_types_df

Unnamed: 0,name_id,domain,kingdom,phylum,class,order,family
0,1,Eukarya,,,,,
1,2,Eukarya,Fungi,Ascomycota,Sordariomycetes,Xylariales,Xylariaceae
2,3,Eukarya,Fungi,Ascomycota,Sordariomycetes,Xylariales,Xylariaceae
3,4,Eukarya,Fungi,Ascomycota,Sordariomycetes,Xylariales,Xylariaceae
4,5,Eukarya,Fungi,Basidiomycota,Agaricomycetes,Agaricales,Mycenaceae
...,...,...,...,...,...,...,...
32325,113684,Eukarya,Protozoa,Myxomycota,,Physarales,Didymiaceae
32326,113685,Eukarya,Fungi,Basidiomycota,Agaricomycetes,Agaricales,Clavariaceae
32327,113686,Eukarya,Fungi,Ascomycota,Sordariomycetes,Hypocreales,Ophiocordycipitaceae
32328,113687,Eukarya,Fungi,Basidiomycota,Agaricomycetes,Russulales,Russulaceae


In [5]:
# Keep every row whose kingdom is not 'Fungi'. Rows with a missing kingdom (NaN)
# also compare as "not equal" and are therefore part of this selection.
non_fungus_types = fungus_types_df.loc[fungus_types_df['kingdom'] != 'Fungi']

# The distinct kingdoms show that the dataset contains non-fungus entries.
non_fungus_types['kingdom'].unique()

array([nan, 'Protozoa', 'Plantae', 'Animalia', 'Amoebozoa', 'Chromista',
       'Eubacteria', 'Anamalia', 'Bacteria'], dtype=object)

In [6]:
# Collect the distinct family names of all entries that are not in the kingdom Fungi.
non_fungus_families = non_fungus_types.loc[:, 'family'].unique()
non_fungus_families

array([nan, 'Physaraceae', 'Stemonitidaceae', 'Trichiaceae',
       'Tubiferaceae', 'Ceratiomyxaceae', 'Didymiaceae', 'Arcyriaceae',
       'Sarcosomataceae', 'Ericaceae', 'Orchidaceae', 'Teloschistaceae',
       'Saprolegniaceae', 'Peronosporaceae', 'Dictyosteliaceae',
       'Acrasiaceae', 'Thraustochytriaceae', 'Labyrinthulaceae',
       'Albuginaceae', 'Nostocaceae', 'Dictydiaethaliaceae',
       'Cribrariaceae', 'Nectriaceae', 'Cortinariaceae', 'Boletaceae',
       'Bolbitiaceae', 'Pythiaceae', 'Orobanchaceae', 'Liceaceae',
       'Stereaceae', 'Hygrophoraceae', 'Pinaceae', 'Eriophyidae',
       'Clastodermataceae', 'Dianemataceae', 'Echinosteliaceae',
       'Elaeomyxaceae', 'Listerellaceae', 'Guttulinaceae', 'Gurleyidae',
       'Adelgidae', 'Burmanniaceae', 'Fagaceae', 'Triuridaceae',
       'Cynipidae', 'Clavicipitaceae', 'Frankiaceae',
       'Enterobacteriaceae', 'Aytoniaceae', 'Rhizobiaceae',
       'Asterinaceae', 'Taphrinaceae', 'Tubeufiaceae', 'Pleomassariaceae',
       

In [7]:
# Remove the row with name_id == 1 (in the table shown above it only has a domain
# and no kingdom), then display the remaining non-fungus rows.
non_fungus_types = non_fungus_types.drop(
    index=non_fungus_types.index[non_fungus_types['name_id'] == 1]
)
non_fungus_types

Unnamed: 0,name_id,domain,kingdom,phylum,class,order,family
7,13,Eukarya,Protozoa,,,,
124,189,Eukarya,Protozoa,Myxomycota,,Physarales,Physaraceae
187,267,Eukarya,Protozoa,Myxomycota,,Physarales,Physaraceae
364,498,Eukarya,Protozoa,Myxomycota,Myxomycetes,Stemonitidales,Stemonitidaceae
561,789,Eukarya,Protozoa,Myxomycota,,Physarales,Physaraceae
...,...,...,...,...,...,...,...
32314,113673,,Protozoa,Amoebozoa,Myxogastrea,Trichiida,Trichiaceae
32315,113674,Eukarya,Protozoa,Myxomycota,Myxomycetes,Stemonitidales,Stemonitidaceae
32322,113681,Eukarya,Protozoa,Mycetozoa,Myxogastria,Physarales,Didymiaceae
32325,113684,Eukarya,Protozoa,Myxomycota,,Physarales,Didymiaceae


In [8]:
# The rows above are described by family, but the folders in our dataset are named
# after the genus/species name, so the family names cannot be used directly.
# names.csv maps every id (the name_id of non_fungus_types) to its text name.
# We load it here so that the non-fungus names can be looked up and then used
# to filter the non-fungus entries out of the dataset.
fungus_genus_names = pd.read_csv("names.csv", sep="\t")
fungus_genus_names_df = pd.DataFrame(data=fungus_genus_names)

# Remove the row with id == 1, mirroring the name_id == 1 row removed from non_fungus_types.
fungus_genus_names_df = fungus_genus_names_df.drop(
    index=fungus_genus_names_df.index[fungus_genus_names_df['id'] == 1]
)
fungus_genus_names_df


Unnamed: 0,id,text_name,author,deprecated,correct_spelling_id,synonym_id,rank
1,2,Xylaria polymorpha group,J.D. Rogers,0,,8975.0,16
2,3,Xylaria magnoliae,J.D. Rogers,0,,,4
3,4,Xylaria hypoxylon group,J.D. Rogers,0,,3692.0,16
4,5,Xeromphalina,Kühner & Maire,0,,6577.0,9
5,6,Xerocomus zelleri,(Murrill) Snell,1,,505.0,4
...,...,...,...,...,...,...,...
67277,113841,Armillaria rhizomorphs,,0,,,4
67278,113842,"Clavaria ""sp-MI01""",S.D. Russell crypt. temp.,0,,,4
67279,113843,"Entoloma ""sp-IN46""",S.D. Russell crypt. temp.,0,,,4
67280,113844,"Tulostoma ""sp-TAC625""",crypt. temp.,0,,,4


In [9]:
# Build the list of non-fungus names: every name_id of non_fungus_types is looked up
# in fungus_genus_names_df (column 'id') and its 'text_name' is collected.
#
# A dictionary is built once instead of scanning the whole DataFrame for every id.
# Ids that have no row in names.csv are skipped and reported; the previous
# `.values[0]` lookup raised IndexError for them.
unique_names_df = fungus_genus_names_df.drop_duplicates(subset='id', keep='first')
id_to_text_name = dict(zip(unique_names_df['id'], unique_names_df['text_name']))

non_fungus_genus_names = []
missing_name_ids = []
for name_id in non_fungus_types['name_id']:
    if name_id in id_to_text_name:
        non_fungus_genus_names.append(id_to_text_name[name_id])
    else:
        missing_name_ids.append(name_id)

if missing_name_ids:
    print(f"Skipped {len(missing_name_ids)} name_id(s) without an entry in names.csv: {missing_name_ids[:10]}")

non_fungus_genus_names

['Myxomycota',
 'Leocarpus fragilis',
 'Fuligo septica',
 'Stemonitis fusca',
 'Fuligo',
 'Leocarpus',
 'Stemonitis',
 'Trichia varia',
 'Lycogala epidendrum',
 'Marchantia',
 'Enteridium',
 'Enteridium lycoperdon',
 'Ceratiomyxa',
 'Ceratiomyxa fruticulosa',
 'Diderma',
 'Brefeldia',
 'Brefeldia maxima',
 'Hemitrichia',
 'Hemitrichia clavata',
 'Protozoa',
 'Liceales',
 'Lycogalaceae',
 'Lycogala',
 'Arcyria',
 'Arcyria denudata',
 'Diachea',
 'Diachea leucopodia',
 'Didymium',
 'Didymium verrucosporum',
 'Hemitrichia calyculata',
 'Hemitrichia serpula',
 'Lamproderma',
 'Lamproderma arcyrionema',
 'Lamproderma scintillins',
 'Physarum',
 'Physarum globuliferum',
 'Physarum melleum',
 'Physarum nutans',
 'Trichia',
 'Trichia favoginea',
 'Tubifera',
 'Tubifera microsperma',
 'Sarcosoma mexicanum',
 'Reticularia',
 'Reticularia lycoperdon',
 'Oligonema',
 'Craterium',
 'Craterium minutum',
 'Badhamia',
 'Badhamia panicea',
 'Mucilago',
 'Sarcodes sanguinea',
 'Pityopus',
 'Pityopus cal

In [10]:
# Some names carry extra tokens such as "var.", "f." or "subsp." after the actual name.
# The relevant part is always the first one or two words, so longer names are
# reduced to their first two words.
def clean_genus_names(genus_names):
    """Return a new list in which every name has at most two words.

    Names consisting of one or two whitespace-separated words are kept exactly
    as they are; longer names are replaced by their first two words joined
    with a single space.
    """
    def first_two_words(name):
        parts = name.split()
        return name if len(parts) <= 2 else ' '.join(parts[:2])

    return [first_two_words(name) for name in genus_names]

# Overwrite the list with its cleaned version (names cut to at most two words) and display it.
non_fungus_genus_names = clean_genus_names(non_fungus_genus_names)
non_fungus_genus_names

['Myxomycota',
 'Leocarpus fragilis',
 'Fuligo septica',
 'Stemonitis fusca',
 'Fuligo',
 'Leocarpus',
 'Stemonitis',
 'Trichia varia',
 'Lycogala epidendrum',
 'Marchantia',
 'Enteridium',
 'Enteridium lycoperdon',
 'Ceratiomyxa',
 'Ceratiomyxa fruticulosa',
 'Diderma',
 'Brefeldia',
 'Brefeldia maxima',
 'Hemitrichia',
 'Hemitrichia clavata',
 'Protozoa',
 'Liceales',
 'Lycogalaceae',
 'Lycogala',
 'Arcyria',
 'Arcyria denudata',
 'Diachea',
 'Diachea leucopodia',
 'Didymium',
 'Didymium verrucosporum',
 'Hemitrichia calyculata',
 'Hemitrichia serpula',
 'Lamproderma',
 'Lamproderma arcyrionema',
 'Lamproderma scintillins',
 'Physarum',
 'Physarum globuliferum',
 'Physarum melleum',
 'Physarum nutans',
 'Trichia',
 'Trichia favoginea',
 'Tubifera',
 'Tubifera microsperma',
 'Sarcosoma mexicanum',
 'Reticularia',
 'Reticularia lycoperdon',
 'Oligonema',
 'Craterium',
 'Craterium minutum',
 'Badhamia',
 'Badhamia panicea',
 'Mucilago',
 'Sarcodes sanguinea',
 'Pityopus',
 'Pityopus cal

In [11]:
# With the list of non-fungus names we can remove the non-fungus entries from the dataset:
# every folder in the image directory whose name is in that list gets deleted.

def delete_non_fungus_types(non_fungus_genus_names, images_dir):
    """Delete the subdirectories of images_dir that are named like a non-fungus entry.

    Args:
        non_fungus_genus_names: iterable of names; a directory is deleted when
            its name is exactly equal to one of them.
        images_dir: directory whose direct subdirectories are checked.

    Returns:
        None. A "Deleted <name>" line is printed for every removed directory.

    Only directories are removed. A plain file whose name happens to match is
    left alone, because shutil.rmtree would raise NotADirectoryError on it.
    """
    # A set gives constant-time membership tests instead of a list scan per entry.
    names_to_delete = set(non_fungus_genus_names)
    for subdir in os.listdir(images_dir):
        subdir_path = os.path.join(images_dir, subdir)
        if subdir in names_to_delete and os.path.isdir(subdir_path):
            shutil.rmtree(subdir_path)
            print(f"Deleted {subdir}")
# Run the deletion on the dataset folder. This removes directories from disk via shutil.rmtree.
delete_non_fungus_types(non_fungus_genus_names, images_dir)


In [12]:
# Re-scan the image directory and count the subdirectories that are left.
subdirs = [
    path
    for path in (os.path.join(images_dir, name) for name in os.listdir(images_dir))
    if os.path.isdir(path)
]
len(subdirs)

7728

In [14]:
# The deletion did not work as expected: the number of subdirectories is unchanged.
# It seems the folder names in our dataset do not exactly match the non-fungus names we collected.
# As an alternative we could filter the entries by their phylum instead of their kingdom.
# As a first step, list the unique phylum names that occur in the classification table.
# NOTE(review): only the unique names are listed here; the number of entries per phylum is not computed in this cell.

fungus_phyllums = fungus_types_df['phylum'].unique()
fungus_phyllums

array([nan, 'Ascomycota', 'Basidiomycota', 'Myxomycota', 'Mucoromycota',
       'Mycetozoa', 'Zygomycota', 'Glomeromycota', 'Amoebozoa',
       'Tracheophyta', 'Blastocladiomycota', 'Oomycota', 'Percolozoa',
       'Labyrinthista', 'Zoopagomycota', 'Cyanobacteria', 'Deuteromycota',
       'Chytridiomycota', 'Marchantiophyta', 'Pinophyta', 'Arthropoda',
       'Magnoliophyta', 'Microspora', 'Microsporidia', 'Fossil',
       'Bryophyta', 'Kickxellomycota', 'Basidiomycetes', 'Olpidiomycota',
       'Plasmodiophoromycota', 'Entomophthoromycota', 'Athropoda',
       'Proteobacteria'], dtype=object)

In [None]:
# Make sure small class folders get more data: every subdirectory with fewer than
# MIN_IMAGES_PER_CLASS original .jpg images receives a horizontally mirrored copy
# of each of its original images.
#
# The cell is safe to re-run: files that already are mirrored copies
# ("*_flipped.jpg") are neither counted nor mirrored again, and an existing
# mirrored copy is not regenerated, so no "*_flipped_flipped.jpg" files appear.
MIN_IMAGES_PER_CLASS = 5
JPG_SUFFIX = '.jpg'
FLIPPED_SUFFIX = '_flipped' + JPG_SUFFIX

for subdir in subdirs:
    # Original (not mirrored) .jpg files in this subdirectory.
    images = [
        img for img in os.listdir(subdir)
        if img.endswith(JPG_SUFFIX) and not img.endswith(FLIPPED_SUFFIX)
    ]

    if len(images) >= MIN_IMAGES_PER_CLASS:
        continue

    for img in images:
        img_path = os.path.join(subdir, img)
        # Replace only the trailing extension; str.replace would also rewrite
        # a '.jpg' that occurs in the middle of the file name.
        flipped_path = os.path.join(subdir, img[:-len(JPG_SUFFIX)] + FLIPPED_SUFFIX)
        if os.path.exists(flipped_path):
            continue

        # The context manager closes the source file once the copy is written.
        with Image.open(img_path) as source_img:
            source_img.transpose(Image.FLIP_LEFT_RIGHT).save(flipped_path)
        print(f'Created flipped image for {img_path} in {subdir}')


In [5]:
# If the flipping cell was executed more than once, it also mirrored the already
# mirrored copies and left files named *_flipped_flipped.jpg behind.
# Those duplicates are removed here.

import glob

path = images_dir
double_flipped_pattern = os.path.join(path, '**/*_flipped_flipped.jpg')
files = glob.glob(double_flipped_pattern, recursive=True)

for double_flipped_file in files:
    os.remove(double_flipped_file)

print('Removed all double flipped images')

Removed all double flipped images
