In [None]:
!pip install tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from keras.layers import Dropout

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import numpy as np
from PIL import Image

# Load Data
# NOTE(review): both paths below are absolute paths on a single machine; point them at
# your own copy of metadata.csv and of the unzipped image folder before running.
og_df = pd.read_csv(r"C:\Users\inesb\Downloads\Deep-Learning-project\metadata.csv")

# Add the root directory with the unzipped images.
data_root_path = r'C:\Users\inesb\Downloads\rare_species'
og_df['full_path'] = og_df['file_path'].apply(lambda x: os.path.join(data_root_path, x))

# Remove all variables that aren't necessary
# Since we just want to predict the family variable, we'll only keep that and the full_path to the images
print(og_df["kingdom"].nunique()) # 1, "animalia"
# Selecting several columns needs a list of names (double brackets); a bare
# "full_path", "family" pair is looked up as one tuple key and raises KeyError.
# .copy() makes `data` an independent frame, so adding a column to it below
# does not trigger SettingWithCopyWarning.
data = og_df[["full_path", "family"]].copy()

# Check for null values
# info() prints its report itself and returns None, so it is not wrapped in print()
data.info()
# There are no null values

# Encode each category in the target variable
data['fam_enc'] = pd.factorize(data['family'])[0]
print(data['family'].nunique())
print(data["fam_enc"].nunique())
# A bare expression is only rendered when it is the last line of a cell, so print it explicitly
print(data.head(3))

# Check the class distribution throughout the dataset
class_distribution = data['full_path'].groupby(data["fam_enc"]).count()
print(class_distribution.describe())
# The count of images per class varies from 29 to 300, so we can consider this an imbalanced dataset

# Check for duplicates
# drop_duplicates() returns a new frame instead of modifying `data`, so the
# duplicates are counted explicitly and the de-duplicated result is assigned back.
n_duplicates = int(data.duplicated(keep='first').sum())
print(f"Duplicated rows: {n_duplicates}")
data = data.drop_duplicates(keep='first').reset_index(drop=True)
data.info()
# When the duplicate count above is 0, the row count reported by info() is unchanged

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11983 entries, 0 to 11982
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   rare_species_id  11983 non-null  object
 1   eol_content_id   11983 non-null  int64 
 2   eol_page_id      11983 non-null  int64 
 3   kingdom          11983 non-null  object
 4   phylum           11983 non-null  object
 5   family           11983 non-null  object
 6   file_path        11983 non-null  object
 7   full_path        11983 non-null  object
dtypes: int64(2), object(6)
memory usage: 749.1+ KB


Unnamed: 0,rare_species_id,eol_content_id,eol_page_id,kingdom,phylum,family,file_path,full_path,label
0,75fd91cb-2881-41cd-88e6-de451e8b60e2,12853737,449393,animalia,mollusca,unionidae,mollusca_unionidae/12853737_449393_eol-full-si...,C:\Users\inesb\Downloads\rare_species\mollusca...,mollusca_unionidae
1,28c508bc-63ff-4e60-9c8f-1934367e1528,20969394,793083,animalia,chordata,geoemydidae,chordata_geoemydidae/20969394_793083_eol-full-...,C:\Users\inesb\Downloads\rare_species\chordata...,chordata_geoemydidae
2,00372441-588c-4af8-9665-29bee20822c0,28895411,319982,animalia,chordata,cryptobranchidae,chordata_cryptobranchidae/28895411_319982_eol-...,C:\Users\inesb\Downloads\rare_species\chordata...,chordata_cryptobranchidae


In [None]:
# Function to find the largest and smallest images in a directory
def find_extreme_images(data_root_path):
    """Locate the images with the largest and smallest pixel area under a root folder.

    Only files sitting directly inside the immediate sub-folders of
    ``data_root_path`` are examined; top-level entries that are not
    directories are ignored. Entries that cannot be opened as an image
    (stray non-image files, nested folders, unreadable files) are skipped
    instead of aborting the scan. Folders and files are visited in sorted
    name order so that ties between equally sized images resolve the same
    way on every run.

    Args:
        data_root_path: Directory whose sub-folders contain the image files.

    Returns:
        A ``(largest_image, smallest_image)`` pair. Each element is a
        ``(file_path, (width, height))`` tuple, or ``None`` when no readable
        image was found.
    """
    # Running extremes: best candidates seen so far and their pixel areas
    largest_image = None
    smallest_image = None
    largest_size = 0
    smallest_size = float('inf')  # infinity, so the first image always becomes the smallest

    # Visit each sub-folder and compare the area of every image it contains
    for folder in sorted(os.listdir(data_root_path)):
        folder_path = os.path.join(data_root_path, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            try:
                with Image.open(file_path) as img:
                    dimensions = img.size
            except OSError:
                # Raised for entries that are not readable images; skip them
                continue
            img_size = dimensions[0] * dimensions[1]
            if img_size > largest_size:
                largest_size = img_size
                largest_image = file_path, dimensions
            if img_size < smallest_size:
                smallest_size = img_size
                smallest_image = file_path, dimensions

    return largest_image, smallest_image

# Scan the image root once and keep both extremes for the report below
largest_image, smallest_image = find_extreme_images(data_root_path)

# Each entry pairs a heading with the (file_path, (width, height)) result it describes
extremes_report = (
    ("Largest image in the directory:", largest_image),
    ("\nSmallest image in the directory:", smallest_image),
)
for heading, found in extremes_report:
    print(heading)
    print("File Path:", found[0])
    print("Size (Width x Height):", found[1])



Largest image in the directory:
File Path: C:\Users\inesb\Downloads\rare_species\mollusca_cardiidae\30003931_46473744_eol-full-size-copy.jpg
Size (Width x Height): (17000, 6800)

Smallest image in the directory:
File Path: C:\Users\inesb\Downloads\rare_species\chordata_delphinidae\9484689_46559297_eol-full-size-copy.jpg
Size (Width x Height): (193, 129)


In [24]:
# Names of every sub-directory found directly under the image root
# (plain files at that level are left out)
list_folders = [
    entry
    for entry in os.listdir(data_root_path)
    if os.path.isdir(os.path.join(data_root_path, entry))
]

print(list_folders)

['arthropoda_apidae', 'arthropoda_attelabidae', 'arthropoda_carabidae', 'arthropoda_cerambycidae', 'arthropoda_coenagrionidae', 'arthropoda_formicidae', 'arthropoda_gomphidae', 'arthropoda_lucanidae', 'arthropoda_nymphalidae', 'arthropoda_palinuridae', 'arthropoda_papilionidae', 'arthropoda_pisauridae', 'arthropoda_platystictidae', 'arthropoda_pseudophasmatidae', 'arthropoda_tettigoniidae', 'arthropoda_theraphosidae', 'arthropoda_triopsidae', 'chordata_accipitridae', 'chordata_acipenseridae', 'chordata_agamidae', 'chordata_albulidae', 'chordata_alcedinidae', 'chordata_alligatoridae', 'chordata_alopiidae', 'chordata_ambystomatidae', 'chordata_anatidae', 'chordata_anguidae', 'chordata_aotidae', 'chordata_ardeidae', 'chordata_arthroleptidae', 'chordata_atelidae', 'chordata_balaenicipitidae', 'chordata_balaenidae', 'chordata_balaenopteridae', 'chordata_balistidae', 'chordata_bombycillidae', 'chordata_bovidae', 'chordata_brachypteraciidae', 'chordata_bucerotidae', 'chordata_bufonidae', 'cho