In [22]:
import matplotlib.pyplot as plt
import numpy             as np
import pandas            as pd
import cv2               as cv

import os
import sys, tarfile

# To do with reading files as bits
import io

import PIL

from os.path                  import join, dirname, abspath
from mushroom_learning.params import BUCKET_NAME, STORAGE_LOCATION
from google.cloud             import storage



# Dowload Relevant directories

In my case I already have many downloaded, they are located at:

In [5]:
!ls -rtlh ../raw_data

total 1.5M
-rw-r--r-- 1 noel noel 366K Sep 20  2019 UCI_mushrooms.csv
drwxr-xr-x 8 noel noel 4.0K Feb 28 14:53 David_Harper_species_pictures
drwxr-xr-x 2 noel noel 4.0K Feb 28 15:10 Dinahar_P_properties
drwxr-xr-x 4 noel noel 4.0K Feb 28 15:43 Stepan_Dupliak_edibleOrPoison_Photo
-rw-r--r-- 1 noel noel 546K Feb 28 16:26 categoricals
-rw-r--r-- 1 noel noel 546K Feb 28 16:26 categoricals.tgz
-rw-r--r-- 1 noel noel  707 Feb 28 17:05 README
drwxr-xr-x 5 noel noel 4.0K Feb 28 20:09 Ilya_kondrusevich_mushrooms
drwxr-xr-x 8 noel noel 4.0K Mar  4 07:50 David_Harper_Labelled_Cleaned
drwxr-xr-x 8 noel noel 4.0K Mar  4 11:36 Six_Species_Clean
drwxr-xr-x 4 noel noel 4.0K Mar  6 16:21 fungi_identification


Otherwise, use the links provided in the Github README <br>
https://github.com/DSP-Tan/mushroom_learning

Note: In the case of the david Harper data set, use the data set on google cloud, download like this:

## First see what is on the cloud

**First consult with others incase the preprocessing has already been done, and things are on the cloud**

In [12]:
print(BUCKET_NAME)
print(STORAGE_LOCATION)

mushroom-bucket-le-wagon
models/first_model


In [13]:
#gcloud alpha storage ls --recursive gs://BUCKET_NAME/PREFIX**
!gcloud alpha storage ls  gs://mushroom-bucket-le-wagon/

gs://mushroom-bucket-le-wagon/6_species.tgz
gs://mushroom-bucket-le-wagon/David_Harper_Labelled_Cleaned.tgz
gs://mushroom-bucket-le-wagon/Six_Species_Clean.tgz
gs://mushroom-bucket-le-wagon/fungi_test.tgz
gs://mushroom-bucket-le-wagon/fungi_train_val.tgz
gs://mushroom-bucket-le-wagon/test.json
gs://mushroom-bucket-le-wagon/train.json
gs://mushroom-bucket-le-wagon/val.json
gs://mushroom-bucket-le-wagon/data/
gs://mushroom-bucket-le-wagon/images/
gs://mushroom-bucket-le-wagon/models/
gs://mushroom-bucket-le-wagon/unzipped/


## Downloaded needed directories from cloud

In [15]:
STORAGE_LOCATION='David_Harper_Labelled_Cleaned.tgz'

client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(STORAGE_LOCATION)
blob.download_to_filename('David_Harper.tgz')

We now have David Harper in our current working directory.

In [34]:
# Confirm download
print('Confirm download')
! ls -rtlh David_Harper.tgz
print('\nWe are in folder:')
! pwd
print('\nThis folder now contains')
! ls -rtlh

Confirm download
-rw-r--r-- 1 noel noel 54M Mar  7 07:57 David_Harper.tgz

We are in folder:
/home/noel/code/DSP-Tan/mushroom_learning/notebooks

This folder now contains
total 58M
-rw-r--r-- 1 noel noel  59K Mar  3 18:08 02_UCIDataset.ipynb
-rw-r--r-- 1 noel noel  73K Mar  3 18:08 01_DinaharDataset_primary.ipynb
-rw-r--r-- 1 noel noel 601K Mar  3 18:08 mushrooms.ipynb
-rw-r--r-- 1 noel noel  54M Mar  7 07:57 David_Harper.tgz
-rw-r--r-- 1 noel noel 2.8M Mar  7 08:01 SpeciesDatatSetExploration.ipynb
-rw-r--r-- 1 noel noel 9.0K Mar  7 08:07 DataSet_Merging_Cleaning.ipynb


### Function to extract tar files in python

In [24]:
def extract(tar_url, extract_path='.'):
    print(tar_url)
    tar = tarfile.open(tar_url, 'r')
    for item in tar:
        tar.extract(item, extract_path)
        if item.name.find(".tgz") != -1 or item.name.find(".tar") != -1:
            extract(item.name, "./" + item.name[:item.name.rfind('/')])
try:

    extract(sys.argv[1] + '.tgz')
    print('Done.')
except:
    name = os.path.basename(sys.argv[0])
    print(name[:name.rfind('.')], '<filename>')

-f.tgz
ipykernel_launcher <filename>


In [35]:
extract('./David_Harper.tgz','.')

./David_Harper.tgz


In [36]:
!ls -rtlh

total 58M
-rw-r--r-- 1 noel noel  59K Mar  3 18:08 02_UCIDataset.ipynb
-rw-r--r-- 1 noel noel  73K Mar  3 18:08 01_DinaharDataset_primary.ipynb
-rw-r--r-- 1 noel noel 601K Mar  3 18:08 mushrooms.ipynb
-rw-r--r-- 1 noel noel  54M Mar  7 07:57 David_Harper.tgz
-rw-r--r-- 1 noel noel 2.8M Mar  7 08:01 SpeciesDatatSetExploration.ipynb
-rw-r--r-- 1 noel noel 9.0K Mar  7 08:07 DataSet_Merging_Cleaning.ipynb
drwxr-xr-x 8 noel noel 4.0K Mar  7 08:08 David_Harper_Labelled_Cleaned


**The unzipped downloaded folder is now sitting in our directory, it is named "David_Harper_Labelled_Cleaned". Note the name of the unzipped folder can often be different to the name of the zipped one. We could have sent it anywhere we wanted using the parameter 'extract_path'**

## Examine contents of folder

In [49]:
path_to_imgs='./David_Harper_Labelled_Cleaned'

# listing directories
dir_list=os.listdir(path_to_imgs)
dir_list

['Boletus_edulis',
 'Amanita_bisporigera',
 'rename_sort_by_size.sh',
 'Cantharellus',
 'Omphalotus_olearius',
 'Amanita_muscaria',
 'Russula_mariae']

### Remove extraneous files from directory list

In [77]:
dir_list=[i for i in dir_list if '.sh' not in i]
print(f'There are {len(dir_list)} species in the David Hardper dataset')
dir_list

There are 6 species in the David Hardper dataset


['Boletus_edulis',
 'Amanita_bisporigera',
 'Cantharellus',
 'Omphalotus_olearius',
 'Amanita_muscaria',
 'Russula_mariae']

### Examine subfolders

In [76]:
# Make path string using string concatanation
bolete_path=path_to_imgs+'/'+dir_list[0]
print('Path to boletes:')
print(bolete_path)

boletes=os.listdir(bolete_path)
print(f'\nWe have {len(boletes)} pictures of Boletus Edulis')
boletes[0:10]

Path to boletes:
./David_Harper_Labelled_Cleaned/Boletus_edulis

We have 439 pictures of Boletus Edulis


['440_Boletus_edulis_Harper.jpg',
 '443_Boletus_edulis_Harper.jpg',
 '371_Boletus_edulis_Harper.jpg',
 '392_Boletus_edulis_Harper.jpg',
 '192_Boletus_edulis_Harper.jpg',
 '351_Boletus_edulis_Harper.jpg',
 '263_Boletus_edulis_Harper.jpg',
 '78_Boletus_edulis_Harper.jpg',
 '171_Boletus_edulis_Harper.jpg',
 '99_Boletus_edulis_Harper.jpg']

**Note os.listdir gives unordered directory list**

**This folder has already been labelled and cleaned. The images are numbered in accordance with their image size, this made it easier to get rid of the microscope pictures, which are not also no longer there.**

**This sorting has been done by the bash script "rename_sort_by_size.sh". Just delete this script if it creates problems looping over the folder later**

In [53]:
# To list the images in order of their filesise we can use ls -rSlh
! ls -rSlh David_Harper_Labelled_Cleaned/Boletus_edulis

total 9.8M
-rw-r--r-- 1 noel noel 7.6K Feb 28 16:45 4_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 7.7K Feb 28 16:45 5_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 8.3K Feb 28 16:45 6_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 8.8K Feb 28 16:45 7_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.1K Feb 28 16:45 8_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.2K Feb 28 16:45 9_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.2K Feb 28 16:45 10_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.2K Feb 28 16:45 11_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.2K Feb 28 16:45 12_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.5K Feb 28 16:45 13_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.6K Feb 28 16:45 14_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.8K Feb 28 16:45 15_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.9K Feb 28 16:45 16_Boletus_edulis_Harper.jpg
-rw-r--r-- 1 noel noel 9.9K Feb 28 16:45 17_Boletus_edulis_Harper.jpg


Note how there are missing entries at the start, due to missing image files. There are also higher missing entries because sometimes there were microscope/other irrelevant pictures in larger files too.

### Sort the python list according to existing labels

In [66]:
boletes.sort(reverse=False,key= lambda bolete: int(bolete.split('_')[0]))
boletes[:10]

['4_Boletus_edulis_Harper.jpg',
 '5_Boletus_edulis_Harper.jpg',
 '6_Boletus_edulis_Harper.jpg',
 '7_Boletus_edulis_Harper.jpg',
 '8_Boletus_edulis_Harper.jpg',
 '9_Boletus_edulis_Harper.jpg',
 '10_Boletus_edulis_Harper.jpg',
 '11_Boletus_edulis_Harper.jpg',
 '12_Boletus_edulis_Harper.jpg',
 '13_Boletus_edulis_Harper.jpg']

## Generalise procedure with functions

In this case we just have 6 species, so this is not really a necessary step, but in folders where we have thousands of subfolders, we may want an easy way to find a species/genus by its name. The following function does this.

### Step 1 find species directory

In [84]:
dir_list

['Boletus_edulis',
 'Amanita_bisporigera',
 'Cantharellus',
 'Omphalotus_olearius',
 'Amanita_muscaria',
 'Russula_mariae']

In [79]:
boletes[0].replace('_', ' ')

'440 Boletus edulis Harper.jpg'

In [82]:
def find_in_list(name,dir_list):
    for i in dir_list:
        if name.lower() in i.replace('_', ' ').lower():
            species_dir=i
    return species_dir

In [83]:
print(find_in_list('amanita muscaria',dir_list))

Amanita_muscaria


### Step 2: make path and sort/clean list

In [96]:
# Make path string using string concatanation
amusc_path= path_to_imgs + '/' + find_in_list('amanita muscaria',dir_list)
print('Path to A_muscara:')
print(amusc_path)

amuscs=os.listdir(amusc_path)
print(f'\nWe have {len(amuscs)} pictures of amanita muscaria')
amuscs.sort(reverse=False,key= lambda bolete: int(bolete.split('_')[0]))

amuscs[0:10]

Path to A_muscara:
./David_Harper_Labelled_Cleaned/Amanita_muscaria

We have 363 pictures of amanita muscaria


['1_Amanita_muscaria_Harper.jpg',
 '2_Amanita_muscaria_Harper.jpg',
 '3_Amanita_muscaria_Harper.jpg',
 '4_Amanita_muscaria_Harper.jpg',
 '5_Amanita_muscaria_Harper.jpg',
 '6_Amanita_muscaria_Harper.jpg',
 '7_Amanita_muscaria_Harper.jpg',
 '8_Amanita_muscaria_Harper.jpg',
 '9_Amanita_muscaria_Harper.jpg',
 '10_Amanita_muscaria_Harper.jpg']

### Examine for corrupted files

In [93]:
def find_corrupted(directory):
    for filename in listdir('./'):
        if filename.endswith('.png') or filename.endswith('.jpg') or filename.endswith('.JPG'):
            try:img = Image.open('./'+filename) # open the image file
                img.verify() # verify that it is, in fact an image
            except (IOError, SyntaxError) as e:
                print('Bad file:', filename) # print out the names of corrupt files

IndentationError: expected an indented block (1174283270.py, line 4)

In [97]:
def find_corrupted(directory):
    for filename in listdir(directory):
        png=filename.endswith('.png') or filename.endswith('.PNG')
        jpg=filename.endswith('.jpg') or filename.endswith('.JPG')
        jpeg=filename.endswith('.jpeg') or filename.endswith('.JPEG')
        if png or jpg or jpeg:
            try:
                img=Image.open('./'+filename)
                img.verify()
            except (IOError, SyntaxError) as e:
                print('Bad file:',filename)

In [95]:
amuscs

['1_Amanita_muscaria_Harper.jpg',
 '2_Amanita_muscaria_Harper.jpg',
 '3_Amanita_muscaria_Harper.jpg',
 '4_Amanita_muscaria_Harper.jpg',
 '5_Amanita_muscaria_Harper.jpg',
 '6_Amanita_muscaria_Harper.jpg',
 '7_Amanita_muscaria_Harper.jpg',
 '8_Amanita_muscaria_Harper.jpg',
 '9_Amanita_muscaria_Harper.jpg',
 '10_Amanita_muscaria_Harper.jpg',
 '11_Amanita_muscaria_Harper.jpg',
 '12_Amanita_muscaria_Harper.jpg',
 '13_Amanita_muscaria_Harper.jpg',
 '14_Amanita_muscaria_Harper.jpg',
 '15_Amanita_muscaria_Harper.jpg',
 '16_Amanita_muscaria_Harper.jpg',
 '17_Amanita_muscaria_Harper.jpg',
 '18_Amanita_muscaria_Harper.jpg',
 '19_Amanita_muscaria_Harper.jpg',
 '20_Amanita_muscaria_Harper.jpg',
 '21_Amanita_muscaria_Harper.jpg',
 '22_Amanita_muscaria_Harper.jpg',
 '23_Amanita_muscaria_Harper.jpg',
 '24_Amanita_muscaria_Harper.jpg',
 '25_Amanita_muscaria_Harper.jpg',
 '26_Amanita_muscaria_Harper.jpg',
 '27_Amanita_muscaria_Harper.jpg',
 '28_Amanita_muscaria_Harper.jpg',
 '29_Amanita_muscaria_Harper.