In [None]:
import os
import shutil
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

MUSHROOMS_PATH = 'mushrooms_dataset'

# Root image folder and the full paths of its immediate sub-directories
images_dir = os.path.join(MUSHROOMS_PATH, 'images')
subdirs = [
    candidate
    for candidate in (os.path.join(images_dir, entry) for entry in os.listdir(images_dir))
    if os.path.isdir(candidate)
]

In [None]:
len(subdirs) #7751

Now we check whether the entries in the dataset are actually fungi (i.e. belong to the Fungi kingdom).

In [None]:
# Tab-separated classification table; later cells read its kingdom, phylum,
# class, family and name_id columns.
fungus_types_csv = pd.read_csv("mushrooms_dataset/name_classifications.csv", sep="\t")
fungus_types_df = pd.DataFrame(data=fungus_types_csv)
fungus_types_df

In [None]:
# Rows whose kingdom is anything other than 'Fungi' (a missing kingdom also counts as "not Fungi")
non_fungus_types = fungus_types_df.loc[fungus_types_df['kingdom'].ne('Fungi')]
non_fungus_types['kingdom'].unique()  # shows that the dataset does contain non-fungus kingdoms

In [None]:
# Distinct family values found among the non-Fungi rows
non_fungus_families = pd.unique(non_fungus_types['family'])
non_fungus_families

In [None]:
# Leave out the entry with name_id 1 from the non-Fungi rows
non_fungus_types = non_fungus_types[non_fungus_types['name_id'] != 1]
non_fungus_types

In [None]:
# The classification table identifies entries by name_id and taxonomy columns, while the
# image folders are matched by text name. names.csv maps each id to its text_name, so it is
# loaded here to translate the name_id values of unwanted rows into folder names.
fungus_genus_names = pd.read_csv("mushrooms_dataset/names.csv", sep="\t")
fungus_genus_names_df = pd.DataFrame(data=fungus_genus_names)
# Leave out the entry with id 1, as was done for non_fungus_types
fungus_genus_names_df = fungus_genus_names_df[fungus_genus_names_df['id'] != 1]
fungus_genus_names_df


In [None]:
# Translate every name_id of the non-fungus rows into its text name from names.csv.
# A single id -> text_name lookup table replaces one full DataFrame scan per id;
# drop_duplicates keeps the first row per id, matching the previous `.values[0]` choice.
# An id without a names.csv entry raises KeyError (previously IndexError).
name_by_id = fungus_genus_names_df.drop_duplicates('id').set_index('id')['text_name']
non_fungus_genus_names = name_by_id.loc[non_fungus_types['name_id'].tolist()].tolist()
non_fungus_genus_names

In [None]:
# We can see that the names sometimes contanins something like "var." or "f." or "subsp." so we need to remove the unnecessary part it from the names
# It always contains 1 or 2 words that are genus names, so we split the name by space and take the first 2 words and join them by '_' sign
def clean_genus_names(genus_names):
    """Reduce each name to its first two words joined by an underscore.

    A name with two or more whitespace-separated words becomes
    ``"<first>_<second>"``, which drops trailing parts such as "var.",
    "f." or "subsp." and whatever follows them. A name with fewer than
    two words is returned unchanged.

    Args:
        genus_names: iterable of name strings.

    Returns:
        A new list with one cleaned name per input name, in input order.
    """
    def _first_two_words(name):
        parts = name.split()
        return '_'.join(parts[:2]) if len(parts) >= 2 else name

    return [_first_two_words(name) for name in genus_names]

non_fungus_genus_names = clean_genus_names(non_fungus_genus_names)
non_fungus_genus_names

In [None]:
# Now that we have a list with the genus names of the non-fungus types we can filter out the non-fungus types from our dataset
# We will delete the non-fungus types based on the genus names of the non-fungus types from our directory

def delete_fungus(fungus, dir):
    """Delete every sub-directory of ``dir`` whose name appears in ``fungus``.

    Args:
        fungus: iterable of directory names to remove.
        dir: path of the directory whose immediate entries are checked.
            (The name shadows the ``dir`` builtin; it is kept so existing
            calls keep working.)

    Prints one "Deleted <name>" line per removed directory and returns None.
    Entries that match a name but are not directories are left untouched.
    """
    # Set membership is O(1); the name lists passed in hold tens of thousands of entries
    names_to_delete = set(fungus)
    for subdir in os.listdir(dir):
        if subdir not in names_to_delete:
            continue
        target = os.path.join(dir, subdir)
        # shutil.rmtree raises on plain files, so only directories are removed
        if not os.path.isdir(target):
            continue
        shutil.rmtree(target)
        print(f"Deleted {subdir}")
delete_fungus(non_fungus_genus_names, images_dir) # About 100 non-fungus types were deleted

In [None]:
subdirs = [os.path.join(images_dir, subdir) for subdir in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, subdir))]
len(subdirs) #7644

In [None]:
# The kingdom-based deletion removed fewer folders than expected; apparently the exact names
# of many non-fungus entries do not occur as folder names. As a next step we filter by phylum
# instead of kingdom, so first list the distinct phylum values present in the table.
fungus_phylums = pd.unique(fungus_types_df['phylum'])
fungus_phylums

In [None]:
fungus_phylums_counts = fungus_types_df['phylum'].value_counts()
fungus_phylums_counts

The most common phyla are Basidiomycota and Ascomycota, and they are the ones we need, because many mushrooms in these phyla are suitable for collecting; the remaining phyla are not needed.

In [None]:
# Same approach as for the non-Fungi kingdoms: collect the text names of every entry whose
# phylum is not Basidiomycota or Ascomycota, so that their image folders can be deleted.
# The phyla to keep are named explicitly instead of being taken as "everything after the two
# most frequent values" (index[2:]), which silently picks the wrong phyla if the counts change.
phylums_to_keep = ['Basidiomycota', 'Ascomycota']
del_phylum_names = fungus_phylums_counts.index[~fungus_phylums_counts.index.isin(phylums_to_keep)]
del_phylum_names_df = fungus_types_df[fungus_types_df['phylum'].isin(del_phylum_names)]

# One id -> text_name lookup instead of scanning the names table once per id.
# drop_duplicates keeps the first row per id; an id without an entry raises KeyError.
name_by_id = fungus_genus_names_df.drop_duplicates('id').set_index('id')['text_name']
del_phylum_list = name_by_id.loc[del_phylum_names_df['name_id'].tolist()].tolist()
del_phylum_list

In [None]:
# Now we have a lost of genus names of the fungi types that are not Basidiomycota or Ascomycota. We can delete them from our dataset, but first we have to clean the names

del_phylum_list = clean_genus_names(del_phylum_list)
del_phylum_list

In [None]:
# We delete the fungis in out dataset based on their genus names in our list
delete_fungus(del_phylum_list, images_dir) # About 10 fungi types were deleted

In [None]:
subdirs = [os.path.join(images_dir, subdir) for subdir in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, subdir))]
len(subdirs) #7626

This still does not shrink our dataset enough: there are still a lot of mushrooms that are not useful for our app and not suitable for common collecting. So far we have deleted the non-Fungi kingdoms and the uncommon phyla, which leaves us with 7626 classes of fungi that belong to 2 phyla - Basidiomycota and Ascomycota. Maybe it is time to go one level deeper, to the taxonomic classes within these phyla, and check whether they are the solution to our problem.

In [None]:
fungus_types_df['class'].unique()

In [None]:
# Restrict the classification table to the rows still of interest: kingdom Fungi and a phylum
# that was not selected for deletion.
# The kingdom is tested directly instead of going through non_fungus_families: isin() treats
# NaN as equal to NaN, so if any non-Fungi row lacks a family, matching on family would also
# remove every Fungi row with a missing family (and any Fungi row sharing a family value).
fungus_types_df = fungus_types_df[fungus_types_df['kingdom'] == 'Fungi']
fungus_types_df = fungus_types_df[~fungus_types_df['phylum'].isin(del_phylum_names)]
fungus_types_df

In [None]:
# Sorted array of the distinct class values, with missing values removed
fungus_classes = pd.Series(fungus_types_df['class'].unique()).dropna().sort_values().values
fungus_classes

In [None]:
# Bar chart of how many classification rows fall into each class, classes ordered by name
fungus_classes_counts = fungus_types_df['class'].value_counts().sort_index()
fungus_classes_counts.plot(
    kind='bar',
    figsize=(12, 10),
    title='Number of fungis in each class',
    xlabel='Class',
    ylabel='Number of fungis',
    color='green',
)

Chcemy teraz spośród tych wszystkich klas wyodrębnić te które występują najczęściej i są chętnie zbierane

Po researchu wyodrębniłem klasy, które występują często oraz są pożądane przez zbieraczy grzybów:
1. Ascomycota:
    - Pezizomycetes (dużo grzybów jadalnych jest w tej klasie, które są poszukiwane)
2. Basidiomycota:
    - Agaricomycetes (bardzo dużo grzybów jadalnych, jest też sporo dziwnych odmian ale jest to nasza główna klasa więc zostawiamy, potencjalnie do dalszego filtrowania)
    
Takie, które mogą być przydatne do rozpoznania, choć nie są powszechnie zbierane:
1. Ascomycota:
    - Arthoniomycetes (porosty)
    - Ascomycetes (chyba to klasa taka ogólna, parę grzybów z tej klasy w naszym datasecie jest rzadka ale mają dobry potencjał do trenowania - dobrej jakości zdjęcia)
    - Leotiomycetes (patyczniaki, jest ich dużo ale są bardzo zdywersyfikowane, większość się nie zbiera i mają zdjęcia średniej jakości ale trzymam je tu bo mamy ich sporo)
2. Basidiomycota:
    - Dacrymycetes (popularne grzyby aczkolwiek są małe i pasożytnicze, niezbieralne spożywczo)
    - Geoglossomycetes (mała klasa, bardzo charakterystyczna, często zbiera się ją kolekcjonersko ze względu na jej wygląd i całkiem rzadkie występowanie)
    - Tremellomycetes (charakterystyczne i niektóre gatunki są wykorzystywane w przemyśle spożywczym oraz farmaceutycznym, raczej nie zbiera się ich na codzień ale można pomyśleć nad pozostawieniem ich jako ciekawy dodatek)
    
Takie, których na spokojnie można się pozbyć, nikt poza ekstremalnymi fanami nie będzie ich zbierał
1. Ascomycota:
    - Dothideomycetes (głównie pasożyty)
    - Sordariomycetes (mało popularne, czasem się je zbiera do wykorzystania w medycynie tradycyjnej ale nie mają wartości kolekcjonerskiej/spożywczej)
    - Eurotiomycetes (nie nadają się do zbierania za bardzo, niespotykane)
    - Lecanoromycetes (porosty)
    - Laboulbeniomycetes (owadorośla - nieprzydatne nam)
    - Saccharomycetes (drożdże)
    - Euascomycetes (niespotykana klasa, w naszym datasecie nawet nie ma grzybów z niej - do usunięcia)
    - Euascomycotina (mała klasa, nie będąca obiektem poszukiwań przeciętnych grzybiarzy)
    - Neolectomycetes (mała klasa, rzadko występuje w Europie, choć ma całkiem ciekawe grzyby, jednak nie przydadzą nam się)
    - Orbiliomycetes (klasa grzybów drapieżnych, niezbieralne)
    - Plectomycetes (przestarzała klasa workowców, głównie pleśnie - niezbieralne)
    - Pneumocystidomycetes (drożdżopodobne grzyby pasożytnicze, mogą być w ludzkich płucach :O )
    - Sareomycetes (bardzo mało info więc zakładam że nieprzydatna z punktu widzenia zbieractwa)
    - Schizosaccharomycetes (dziwadła, niepopularne i niezbieralne)
    - Xylobotromycetes (pasożyty)
    - Collemopsidiomycetes (rzadkie grzyby, niespotykane raczej w Europie)
    - Coniocybomycetes (żyją w symbiozie z drzewami, mała klasa, raczej nie jest celem grzybiarzy)
    - Lichinomycetes (żyją z sinicami i tworzą porosty, do usunięcia)
2. Basidiomycota:
    - Atractiellomycetes (mała klasa, zawiera pasożyty i grzyby do bardziej naukowych badań)
    - Malasseziomycetes (podobnie jak powyższa, nie nadaje się do zbieractwa)
    - Tritirachiomycetes (mała klasa o jednym gatunku, niezbieralny spożywczo)
    - Hyphomycetes (dużo pleśni, całkiem często spotykane ale nie są zbierane ani wykorzystywane spożywczo)
    - Urediniomycetes (podklasa rdzy)
    - Ustilaginomycetes (pasożyty, niepotrzebne nam)
    - Agaricostilbomycetes (całkiem nowa klasa, mało przykładów, niepotrzebne)
    - Exobasidiomycetes (pasożyty roślinne)
    - Wallemiomycetes (malutkie grzyby, nowa klasa, spotykane w jedzeniu)
    - Pucciniomycetes (rdze, bleh)
    - Cystobasidiomycetes (też coś wspólnego ze rdzą mają to nie bierzemy pod uwagę)

In [None]:
# Classes we keep: Pezizomycetes, Agaricomycetes, Arthoniomycetes, Ascomycetes,
# Leotiomycetes, Dacrymycetes, Geoglossomycetes, Tremellomycetes.
# Every other class present in fungus_classes goes on the deletion list.

delete_fungus_classes = fungus_classes[
    np.isin(
        fungus_classes,
        ['Pezizomycetes', 'Agaricomycetes', 'Arthoniomycetes', 'Ascomycetes',
         'Leotiomycetes', 'Dacrymycetes', 'Geoglossomycetes', 'Tremellomycetes'],
        invert=True,
    )
]
delete_fungus_classes

In [None]:
# Same procedure as for the non-fungus rows and the unwanted phyla: translate the name_id of
# every row whose class is on the deletion list into its text name.
del_class_df = fungus_types_df[fungus_types_df['class'].isin(delete_fungus_classes)]

# One id -> text_name lookup instead of scanning the names table once per id (there are
# thousands of ids here). This also avoids a loop variable called `id`, which shadowed
# the builtin for the rest of the session.
# drop_duplicates keeps the first row per id; an id without an entry raises KeyError.
name_by_id = fungus_genus_names_df.drop_duplicates('id').set_index('id')['text_name']
del_class_list = name_by_id.loc[del_class_df['name_id'].tolist()].tolist()
del_class_list

In [None]:
len(del_class_list) # 10559

In [None]:
del_class_list = clean_genus_names(del_class_list)
del_class_list

In [None]:
# Preview how many image folders the class-based deletion will remove.
# del_class_list can contain the same cleaned name several times, so the match is done on
# unique names: counting list entries reported 837 while the folder count only dropped from
# 7626 to 6859, presumably because of such repeats.
catalog_names = os.listdir(images_dir)

matching_values = sorted(set(del_class_list).intersection(catalog_names))
count = len(matching_values)
print(f"The number of values in your list that match the names of the catalogs in your images_dir is: {count}")

In [None]:
# Delete the fungis from our dataset based on their names in the classes that we want to delete
delete_fungus(del_class_list, images_dir)

In [None]:
subdirs = [os.path.join(images_dir, subdir) for subdir in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, subdir))]
len(subdirs) #6859

In [None]:
# Fewer than 1000 folders were deleted, which is not enough to make a big difference.
# Some entries of names.csv have no row in name_classifications.csv at all, so they can never
# be selected through kingdom, phylum or class. Collect the names.csv ids that never occur
# as a name_id in the classification table.
name_ids = pd.DataFrame(fungus_types_csv['name_id'])
ids = pd.DataFrame(fungus_genus_names['id'])

# Vectorised membership test instead of scanning the name_id array once per id
# (and without a loop variable named `id`, which shadowed the builtin).
missing_ids = ids.loc[~ids['id'].isin(name_ids['name_id']), 'id'].tolist()
missing_ids

In [None]:
# missing_ids holds the names.csv ids that have no classification row. Look up their text
# names so the corresponding image folders can be reviewed and, where appropriate, deleted.
# Ids that are absent from fungus_genus_names_df are skipped, as before.

# One id -> text_name lookup (first row per id) instead of one DataFrame scan per id
name_by_id = fungus_genus_names_df.drop_duplicates('id').set_index('id')['text_name']
known_missing_ids = [name_id for name_id in missing_ids if name_id in name_by_id.index]
unmatched_names = name_by_id.loc[known_missing_ids].tolist()
unmatched_names

In [None]:
len(unmatched_names)

In [None]:
# Dump the unmatched names, one per line, so they can be reviewed by hand to decide
# which ones may be deleted.
with open('list.txt', 'w') as out_file:
    out_file.writelines("%s\n" % name for name in unmatched_names)

I used ChatGPT (4o model) to determine which mushrooms from the list I should keep. I saved the names to keep in another text file called mushrooms_to_keep.txt.

In [None]:
# Before cleaning the names, load the list of mushrooms we want to keep
# (one name per line in mushrooms_to_keep.txt).
# The with-block guarantees that the file handle is closed again.
with open('mushrooms_to_keep.txt', 'r') as keep_file:
    mushrooms_to_keep = [name.strip() for name in keep_file]
mushrooms_to_keep

In [None]:
# We can now clean both the unmatched names and the names that we want to keep
unmatched_names = clean_genus_names(unmatched_names)
mushrooms_to_keep = clean_genus_names(mushrooms_to_keep)


In [None]:
# Before removing the names to keep, make unmatched_names unique: cleaning maps several
# entries onto the same name (34953 entries, 33662 distinct).
# The two sizes are printed explicitly, because a bare expression in the middle of a cell is not displayed.
print(f"{len(unmatched_names)} names, {len(set(unmatched_names))} unique")
# sorted() keeps the order reproducible between runs, unlike iterating a set of strings directly
unmatched_names = sorted(set(unmatched_names))
len(unmatched_names) # 33662


In [None]:
# Remove the names we want to keep from unmatched_names, so that only deletable names remain.
# Sets give O(1) membership; unmatched_names holds tens of thousands of names.
names_to_keep = set(mushrooms_to_keep)
unmatched_lookup = set(unmatched_names)

# How many of the names to keep actually occur in unmatched_names (35 on the original data);
# printed because a bare expression in the middle of a cell is not displayed.
matching_mushrooms = [name for name in mushrooms_to_keep if name in unmatched_lookup]
print(len(matching_mushrooms))

unmatched_names = [name for name in unmatched_names if name not in names_to_keep]
len(unmatched_names)

In [None]:
# We have the names of the mushrooms that are not in the names.csv file, we can try to delete them from our dataset to see if it makes a difference
delete_fungus(unmatched_names, images_dir)

In [None]:
subdirs = [os.path.join(images_dir, subdir) for subdir in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, subdir))]
len(subdirs) # 3976

*NOTE!*   Run the cells below only after the Fungi detection notebook!

Now we have a smaller dataset, but some directories contain only a small number of images, so the first step would be to create modified copies of those images.
If that does not work efficiently, we can delete these directories and still create modified copies later.

In [None]:
# From here on images_dir points at the 'images_FasterRCNN/images_correct' folder
# (presumably the output of the detection notebook); subdirs lists its sub-directories.
images_dir = os.path.join(MUSHROOMS_PATH, 'images_FasterRCNN', 'images_correct')
subdirs = [
    candidate
    for candidate in (os.path.join(images_dir, entry) for entry in os.listdir(images_dir))
    if os.path.isdir(candidate)
]

In [None]:
# The dataset still has a lot of mushroom types, so every folder holding fewer than
# MIN_IMAGES_PER_CLASS images is deleted; afterwards we check how many folders are left.
MIN_IMAGES_PER_CLASS = 20
# Extensions are compared case-insensitively: this step deletes whole folders, so a folder
# full of '.JPG' or '.jpeg' files must not be mistaken for an (almost) empty one.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

for subdir in subdirs:
    images = [img for img in os.listdir(subdir) if img.lower().endswith(IMAGE_EXTENSIONS)]

    if len(images) < MIN_IMAGES_PER_CLASS:
        shutil.rmtree(subdir)
        print(f"Deleted {subdir}")

subdirs = [os.path.join(images_dir, subdir) for subdir in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, subdir))]
len(subdirs) # 1002

Since we rotate the images directly in the Mushrooms recognition model via ImageDataGenerator, we do not need to do it here; it would only create unnecessary copies that take up more space and decrease the model's performance.

It turned out that the species in our dataset occur mostly in North America. Since our goal is to cover species that can be seen in Europe, we had to change our dataset considerably. After some research we ended up with 189 species that can be found in Poland and Europe.
For 87 of them we already have images, but for the rest we need to get them. We will try to obtain them from the iNaturalist database using the iNaturalist API.

First we create a function that connects to the iNaturalist API and downloads images (e.g. 50) of a species that we need.

In [None]:
from pyinaturalist.node_api import get_observations
import os
import requests
import random

def get_inaturalist_images(specie_name, max_images, save_dir):
    """Download up to ``max_images`` iNaturalist photos for a search term.

    Requests the most recently created observations matching ``specie_name``
    (free-text ``q`` search) and saves their photos in medium size to
    ``<save_dir>/<specie_name with '-' replaced by '_'>/<number>.jpg``.

    Args:
        specie_name: search term passed to iNaturalist; also determines the
            name of the target folder.
        max_images: number of observations requested and upper bound on the
            number of photos saved (an observation can carry several photos).
        save_dir: directory in which the species folder is created.

    Returns:
        None. Progress and failed downloads are reported with ``print``.
    """
    # Parameters for the API request
    params = {
        'q': specie_name,
        'per_page': max_images,
        'order_by': 'created_at',
        'order': 'desc'
    }

    # Make the API request using pyinaturalist to get the observation data including the photos
    response = get_observations(**params)
    results = response['results']

    # Folder names use '_' where the search term uses '-'
    specie_dir = os.path.join(save_dir, specie_name.replace('-', '_'))

    saved_count = 0
    for result in results:
        for photo in result.get('photos') or []:
            if saved_count >= max_images:
                return

            photo_url = photo['url'].replace('square', 'medium')
            try:
                # The timeout keeps one stalled download from hanging the whole run
                photo_response = requests.get(photo_url, stream=True, timeout=30)
            except requests.RequestException:
                print(f"Failed to download image from {photo_url}")
                continue

            # The context manager releases the streamed connection in every case
            with photo_response:
                if photo_response.status_code != 200:
                    print(f"Failed to download image from {photo_url}")
                    continue

                # The folder is created only once a download succeeded, so a species
                # without any downloaded image does not leave an empty folder behind.
                if not os.path.exists(specie_dir):
                    os.makedirs(specie_dir, exist_ok=True)
                    print(f"Created directory {specie_dir}")

                photo_path = _unused_photo_path(specie_dir)
                with open(photo_path, 'wb') as f:
                    for chunk in photo_response.iter_content(1024):
                        f.write(chunk)

            saved_count += 1
            print(f"Saved {photo_path}")


def _unused_photo_path(specie_dir):
    """Return a random '<number>.jpg' path inside ``specie_dir`` that does not exist yet."""
    while True:
        candidate = os.path.join(specie_dir, f"{random.randint(1, 199999)}.jpg")
        # Retry on a collision instead of overwriting an image that is already there
        if not os.path.exists(candidate):
            return candidate

Now we have to prepare our list of mushroom species. I prepared one that combines and compares the species already in our dataset with new ones that mostly occur in Poland and Europe.

In [None]:
# The species list is stored in mushrooms_europe.txt, one name per line.
# Load it (the with-block closes the file again) and look at a few examples.
with open('mushrooms_dataset/mushrooms_europe.txt', 'r') as europe_file:
    mushrooms_europe = [name.strip() for name in europe_file]
mushrooms_europe[:20]

Now we also need a list with only those species that we have to download. I prepared one called mushrooms_europe_download.txt.

In [None]:
# Load the names of the species that still have to be downloaded (one per line) and show
# a few examples. The with-block closes the file again.
with open('mushrooms_dataset/mushrooms_europe_download.txt', 'r') as download_file:
    mushrooms_europe_to_download = [name.strip() for name in download_file]
mushrooms_europe_to_download[:20]

In [None]:
# Only mushrooms from the mushrooms_europe list should stay in the dataset.
# Determine which entries of images_dir are on that list.
list_of_mushrooms = os.listdir(images_dir)
matching_mushrooms = list(filter(lambda name: name in mushrooms_europe, list_of_mushrooms))
len(matching_mushrooms)

In [None]:
len(list_of_mushrooms)

In [None]:
# Delete every folder in images_dir that is not on the mushrooms_europe list
folders_to_keep = set(matching_mushrooms)
for specie in list_of_mushrooms:
    specie_path = os.path.join(images_dir, specie)
    # Skip kept species, stray files (rmtree only handles directories) and folders that
    # no longer exist, so the cell can be re-run with a stale listing.
    if specie in folders_to_keep or not os.path.isdir(specie_path):
        continue
    shutil.rmtree(specie_path)
    print(f"Deleted {specie}")

In [None]:
# We can now check how many mushrooms we have left in our dataset
list_of_mushrooms = os.listdir(images_dir)
len(list_of_mushrooms)

In [None]:
# We can now download the images of the mushrooms that are in the mushrooms_europe_downaload list
# First we need to adjust the names of the mushrooms in the list, we have to replace '_' with '-'
mushrooms_europe_to_download = [name.replace('_', '-') for name in mushrooms_europe_to_download]
mushrooms_europe_to_download[:20]

In [None]:
len(mushrooms_europe_to_download)

In [None]:
for specie in mushrooms_europe_to_download:
    # A species whose folder already exists is treated as downloaded and skipped
    specie_dir = os.path.join(images_dir, specie.replace('-', '_'))
    if not os.path.exists(specie_dir):
        get_inaturalist_images(specie, max_images=30, save_dir=images_dir)
    else:
        print(f"Images for {specie} already exist")

In [None]:
# Count the images and mushroom species currently in the dataset.
# Only directories count as species: a stray file in images_dir would otherwise be reported
# as a species and make os.listdir() fail.
list_of_mushrooms = [name for name in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, name))]
images_count = sum(len(os.listdir(os.path.join(images_dir, specie))) for specie in list_of_mushrooms)
print(f"We have {images_count} images of {len(list_of_mushrooms)} mushroom species in our dataset")

In [None]:
# Write the final list of mushrooms in the dataset, one name per line.
# The with-block closes the file even if a write fails.
with open('mushrooms_dataset/final_mushroom_list.txt', 'w') as final_mushroom_list:
    for specie in list_of_mushrooms:
        final_mushroom_list.write(f"{specie}\n")