In [None]:
import os
import shutil
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Directory that holds the dataset file and, under `images/`, the downloaded pictures.
MUSHROOMS_PATH = 'mushrooms_dataset'
# Tab-separated dataset file; the notebook reads its 'image' and 'name' columns.
MUSHROOM_DATASET = 'Mushrooms_images_names.tsv'

In [None]:
def create_mushrooms_path(path):
    """Make sure ``path`` exists on disk and report what happened.

    Args:
        path: Directory to create. Missing parent directories are created
            as well.

    Prints "Path created: ..." when the directory had to be made and
    "Path already exists: ..." when something is already present at ``path``.
    """
    # Guard clause: nothing on disk yet, so create it and stop here.
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Path created: {path}")
        return

    print(f"Path already exists: {path}")
        
# Create the dataset directory, or report that it is already there.
create_mushrooms_path(MUSHROOMS_PATH)

In [None]:
def add_dataset_to_path(path, dspath):
    """Move the dataset file ``dspath`` into the directory ``path``.

    The file keeps its own name: the destination is
    ``os.path.join(path, os.path.basename(dspath))``, so ``dspath`` may be a
    bare file name, a relative path or an absolute path.

    Args:
        path: Destination directory. It is created if it does not exist.
        dspath: Current location of the dataset file.

    Nothing is raised when there is nothing to move; instead one of two
    messages is printed, depending on whether the file is already at the
    destination or cannot be found at all.
    """
    # Use only the file name. Joining the full ``dspath`` would nest any
    # directory components under ``path``, and an absolute ``dspath`` would
    # make os.path.join discard ``path`` entirely.
    destination = os.path.join(path, os.path.basename(dspath))

    if os.path.exists(dspath):
        # The caller may already be pointing at the file in its final place.
        if os.path.exists(destination) and os.path.samefile(dspath, destination):
            print(f"Dataset already in the right path: {path}")
            return
        os.makedirs(path, exist_ok=True)
        shutil.move(dspath, destination)
    elif os.path.exists(destination):
        print(f"Dataset already in the right path: {path}")
    else:
        print(f"Dataset not found: {dspath} does not exist.")
# Move the TSV into the dataset directory if it is still in the working directory.
add_dataset_to_path(MUSHROOMS_PATH, MUSHROOM_DATASET)

In [None]:
# Load the tab-separated dataset from the dataset directory and preview the first rows.
mushrooms = pd.read_csv(
    os.path.join(MUSHROOMS_PATH, MUSHROOM_DATASET),
    sep='\t',
)
mushrooms.head()

In [None]:
import urllib.request
from PIL import Image

# Preview grid: fetch the first (up to) nine images and show them 3x3 with
# the mushroom name as the title of each panel.

# Request header sent with every image fetch; built once, outside the loop.
headers = {'User-Agent': 'Chrome/122.0.6261.129'}

# head(9) selects rows by position, so this works with fewer than nine rows
# and with an index that is not 0..n-1 (e.g. after rows have been dropped).
preview = mushrooms.head(9)

plt.figure(figsize=(10,10))
for i, (img_path, img_title) in enumerate(zip(preview['image'], preview['name'])):
    request = urllib.request.Request(img_path, headers=headers)
    # `with` closes the HTTP response even if decoding fails; the timeout
    # (seconds) keeps a stalled server from hanging the cell indefinitely.
    with urllib.request.urlopen(request, timeout=30) as response:
        # Convert to an array while the response is still open.
        img = np.array(Image.open(response))
    plt.subplot(3,3,i+1)
    plt.imshow(img)
    plt.title(img_title)
plt.show()

In [None]:
# Count the missing values in each column.
mushrooms.isnull().sum()

In [None]:
# Drop every row that has at least one missing value, then re-count nulls
# per column to confirm that none remain.
mushrooms_without_nulls = mushrooms.dropna()
mushrooms_without_nulls.isnull().sum()

In [None]:
# Rebind `mushrooms` to the null-free frame; the cells below use this name.
# NOTE(review): dropna() does not reset the index, so row labels may have gaps
# from here on — prefer positional access (.iloc / .head) over .loc[i].
mushrooms = mushrooms_without_nulls
mushrooms

In [None]:
import requests
from concurrent.futures import ThreadPoolExecutor

# Request header sent with every image download.
headers = {'User-Agent': 'Chrome/122.0.6261.129'}

# Short alias for the frame whose images are downloaded.
df = mushrooms

# Optional while testing: process the rows in name order.
# df = df.sort_values('name')

# Parallel lists: urls[k] is the image URL of the mushroom called names[k].
urls, names = df['image'].tolist(), df['name'].tolist()

def download_image(url_name, timeout=30):
    """Download one mushroom image into its per-name folder.

    The image is stored at
    ``{MUSHROOMS_PATH}/images/<name with spaces replaced by "_">/<basename of url>``.

    Args:
        url_name: ``(url, name)`` pair, as produced by ``zip(urls, names)``.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled host would block a
            worker thread forever.

    Returns:
        None. Progress and failures are reported with ``print``; request
        errors (connection problems, timeouts, HTTP error statuses) are
        reported as a failed download instead of being raised.
    """
    url, name = url_name
    species_dir = f'{MUSHROOMS_PATH}/images/{name.replace(" ", "_")}'
    filename = f'{species_dir}/{os.path.basename(url)}'

    # Check before touching the network so re-runs do not re-download
    # images that are already on disk.
    if os.path.exists(filename):
        print(f'Skipping download of {name} image  - {filename} (already exists)')
        return

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Failed to download {name}, {url} ({exc})')
        return
    # raise_for_status() only rejects 4xx/5xx; anything other than 200 is
    # still treated as a failed download.
    if response.status_code != 200:
        print(f'Failed to download {name}, {url}')
        return

    # Several worker threads may handle images of the same mushroom at once,
    # so another thread can create the folder between a check and makedirs().
    # Attempt the creation and treat "already exists" as success.
    try:
        os.makedirs(species_dir)
        print("Created directory: ", species_dir)
    except FileExistsError:
        pass

    with open(filename, 'wb') as f:
        f.write(response.content)
        
# Fetch all images on a pool of 12 worker threads. Each task is submitted
# individually so its Future can be inspected afterwards; leaving the `with`
# block waits until every task has finished.
with ThreadPoolExecutor(max_workers=12) as executor:
    download_futures = [
        (executor.submit(download_image, url_name), url_name)
        for url_name in zip(urls, names)
    ]

# An exception raised inside a worker is stored on its Future and stays
# invisible unless it is asked for. Report every task that ended with an
# exception, without aborting the remaining ones.
for download_future, (failed_url, failed_name) in download_futures:
    download_error = download_future.exception()
    if download_error is not None:
        print(f'Failed to download {failed_name}, {failed_url}: {download_error!r}')