# Download Datasets
Datasets sourced from Kaggle, make sure to prepare your Kaggle API key (kaggle.json).

In [1]:
# Initializing Kaggle's Python package

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                                   title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
arnabchaki/data-science-salaries-2023                                 Data Science Salaries 2023 💸                         25KB  2023-04-13 09:55:16          26915        750  1.0              
tawfikelmetwally/automobile-dataset                                   Car information dataset                               6KB  2023-05-28 18:26:48           1681         47  0.9411765        
fatihb/coffee-quality-data-cqi                                        Coffee Quality Data (CQI May-2023)                   22KB  2023-05-12 13:06:39           4237         92  1.0              
mohithsairamreddy/salary-data 

## Fruits and Vegetables Dataset 
Source: https://www.kaggle.com/datasets/moltean/fruits

In [2]:
# Download Fruits360 dataset
! kaggle datasets download moltean/fruits

Downloading fruits.zip to /work
100%|██████████████████████████████████████▉| 1.28G/1.28G [00:04<00:00, 325MB/s]
100%|██████████████████████████████████████| 1.28G/1.28G [00:14<00:00, 94.0MB/s]


In [3]:
# Unzip
! unzip -qq fruits

## Flowers Dataset
Source: https://www.kaggle.com/datasets/l3llff/flowers

In [4]:
# Download Flowers dataset
! kaggle datasets download l3llff/flowers

Downloading flowers.zip to /work
 92%|████████████████████████████████████▋   | 209M/228M [00:02<00:00, 91.6MB/s]
100%|████████████████████████████████████████| 228M/228M [00:04<00:00, 53.8MB/s]


In [5]:
# Unzip
! unzip -qq flowers

## Animals Dataset
Source: https://www.kaggle.com/datasets/iamsouravbanerjee/animal-image-dataset-90-different-animals

In [6]:
# Download 90 Animals dataset
! kaggle datasets download iamsouravbanerjee/animal-image-dataset-90-different-animals

Downloading animal-image-dataset-90-different-animals.zip to /work
 98%|████████████████████████████████████████▏| 643M/656M [00:03<00:00, 266MB/s]
100%|████████████████████████████████████████| 656M/656M [00:07<00:00, 90.3MB/s]


In [7]:
# Unzip
! unzip -qq animal-image-dataset-90-different-animals

# Structuring Datasets
Structuring directories for each dataset.

## Raw Dataset
Preparing dataset directories and moving raw dataset.

In [5]:
import shutil
import os

# Downloaded datasets
ANIMAL_DL = "../animals/animals"
FRUITS_DL = "../fruits-360_dataset"
FLOWER_DL = "../flowers"

# New directories
DATASET_PATH = "../datasets/"
RAW_PATH = "raw/"
FLORA_PATH = "flora/"
FAUNA_PATH = "fauna/"

# Create two dataset directories
try:
    os.makedirs(DATASET_PATH + RAW_PATH)
    os.makedirs(DATASET_PATH + FLORA_PATH)
    os.makedirs(DATASET_PATH + FAUNA_PATH)
except FileExistsError:
    print("File already exists")

# Removing unused files
os.remove("../animal-image-dataset-90-different-animals.zip")
os.remove("../fruits.zip")
os.remove("../flowers.zip")
shutil.rmtree("../fruits-360-original-size")

In [9]:
# Moving downloaded datasets
ANIMAL_RAW = f"{DATASET_PATH}{RAW_PATH}/animals"
FRUITS_RAW = f"{DATASET_PATH}{RAW_PATH}/fruits"
FLOWER_RAW = f"{DATASET_PATH}{RAW_PATH}/flowers"

shutil.copytree(ANIMAL_DL, ANIMAL_RAW)
shutil.rmtree(ANIMAL_DL[:10])

FRUITS_DL = f"{FRUITS_DL}/fruits-360"
shutil.copytree(FRUITS_DL, FRUITS_RAW)
shutil.rmtree(FRUITS_DL)

shutil.copytree(FLOWER_DL, FLOWER_RAW)
shutil.rmtree(FLOWER_DL)

os.rename("../name of the animals.txt", f"{ANIMAL_RAW}/animal_names.txt")

## Processed Dataset
Creating processed dataset, ready for model development.

### Animals Dataset
Using splitfolders to split 80/20

In [10]:
import splitfolders

animal_input_folder = ANIMAL_RAW
animal_output = DATASET_PATH + FAUNA_PATH

splitfolders.ratio(animal_input_folder, output=animal_output, seed=1337, ratio=(.8, .2))

Copying files: 5400 files [00:07, 697.43 files/s]


### Fruits-Vegetables Dataset
Using Training for training set, Test for validation set, and test-multiple_fruits for optional testing.

In [11]:
train_src = f"{FRUITS_RAW}/Training"
val_src = f"{FRUITS_RAW}/Test"
test_src = f"{FRUITS_RAW}/test-multiple_fruits"

fruits_train = f"{DATASET_PATH}{FLORA_PATH}/train"
fruits_val = f"{DATASET_PATH}{FLORA_PATH}/val"
fruits_test = f"{DATASET_PATH}{FLORA_PATH}/test"

shutil.copytree(train_src, fruits_train)
shutil.copytree(val_src, fruits_val)
shutil.copytree(test_src, fruits_test)

'../datasets/flora//test'

### Flowers Dataset
Splitting and combining with fruits-vegetables

In [20]:
ft_count = sum([len(files) for r, d, files in os.walk(fruits_train)])
fv_count = sum([len(files) for r, d, files in os.walk(fruits_val)])
print(f"training count: {ft_count}\nvalidation count: {fv_count}")
print(f"split ratio: {ft_count / (ft_count + fv_count)}")

training count: 67692
validation count: 22688
split ratio: 0.7489710112856827


In [21]:
flower_input_folder = FLOWER_RAW
flower_output = DATASET_PATH + FLORA_PATH + "flowers/"

splitfolders.ratio(flower_input_folder, output=flower_output, seed=1337, ratio=(.75, .25))

Copying files: 15740 files [00:21, 745.76 files/s]
