In [1]:
import numpy as np 
import pandas as pd 
import os

# Input data files are available in the read-only "/kaggle/input/" directory
# Output files are in the "/kaggle/working/" directory 

This Jupyter Notebook is created as a Kaggle Notebook. This is because I will be compiling multiple datasets from Kaggle and it is easier to connect all the datasets using Kaggle Notebook (without downloading them). 

This will be used for the image captioning part of the project. We aim to use images that fit within the context of general themes present in SongCi. Each dataset has a different number of images, so I aimed for roughly the same number of images (about 300) for each category and subcategory to keep the combined dataset balanced. 

Here is a breakdown of the general categories for the images (the number after each category is the number of images for that category):

* flowers/leaves
    * Chrysanthemum (300)
    * Peony (300)
    * Orchid (300)
    * Leaves (300)
* weather
    * Sunrise (300)
    * Cloudy (300)
    * Shine (253)
    * Rime/Snow (300)
* landscape
    * Forest (300)
    * Mountain (300)
    * River/Lake/Sea (92 lake, 98 river, and 110 sea = 300 in total)
    * Waterfall (300)


The names of the datasets that I am using and links:
1. boat-vs-sea-images-dataset: https://www.kaggle.com/datasets/waqasahmedbasharat/boat-vs-sea-images-dataset?select=sea
    * taking 110 images from the sea folder
2. flower299: https://www.kaggle.com/datasets/bogdancretu/flower299
    * taking 300 images from each of the Chrysanthemum, Peony, and Orchid folders
3. river-vs-lake: https://www.kaggle.com/datasets/hariharanalm/river-vs-lake
    * taking all the images: 190 images total
4. visual-china: https://www.kaggle.com/datasets/protectoryao/visual-china
    * taking 300 images of the waterfall folder
5. landscape-recognition-image-dataset-12k-images: https://www.kaggle.com/datasets/utkarshsaxenadn/landscape-recognition-image-dataset-12k-images
   * taking 300 images from the forest folder and 300 from the mountain folder
6. multiclass-weather-dataset: https://www.kaggle.com/datasets/pratik2901/multiclass-weather-dataset/data
    * take images from cloudy (300 images), sunrise (300 images), and shine folders (all 253 images)
7. weather-dataset: https://www.kaggle.com/datasets/jehanbhathena/weather-dataset 
    * take 300 images from the rime folder
8. leaf-detection: https://www.kaggle.com/datasets/alexo98/leaf-detection?select=train
    * take 300 images

In [3]:
import shutil

# Output folders for the three splits. Every 300-image category contributes
# 50 images to test, 50 to valid, and the remaining 200 to train.
output_train = "/kaggle/working/output_train/"
output_valid = "/kaggle/working/output_valid/"
output_test = "/kaggle/working/output_test/"

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# the second time the cell was executed in the same session.
# NOTE: files left over from an earlier run are not removed here.
for output_dir in (output_train, output_valid, output_test):
    os.makedirs(output_dir, exist_ok=True)

# Running indices used to number the copied files in each split folder.
output_train_count = 0
output_valid_count = 0
output_test_count = 0

In [4]:
def _copy_numbered(names, src_dir, dest_dir, start):
    """Copy ``names`` from ``src_dir`` into ``dest_dir`` under sequential names.

    Each copy is named with a zero-padded 4-digit index (beginning at
    ``start``) followed by the source file's own extension.
    Returns the next unused index.
    """
    os.makedirs(dest_dir, exist_ok=True)
    index = start
    for name in names:
        # Keep each file's own extension so a folder mixing .jpg/.jpeg/.png
        # does not end up with mislabeled copies.
        extension = os.path.splitext(name)[1]
        shutil.copy(
            os.path.join(src_dir, name),
            os.path.join(dest_dir, str(index).zfill(4) + extension),
        )
        index += 1
    return index


# function to copy images
def copy_image(input_dir, output_train_count, output_valid_count, output_test_count,
               train=200, valid=50, test=50, *,
               train_dir=None, valid_dir=None, test_dir=None):
    """Split the images of one folder into the test / valid / train output folders.

    The files directly inside ``input_dir`` are taken in sorted name order:
    the first ``test`` go to the test folder, the next ``valid`` to the valid
    folder, and up to ``train`` more to the train folder. Files beyond
    ``train + valid + test`` are ignored; a folder with fewer files simply
    yields a smaller train portion. Copies are renamed to a running
    zero-padded index so the output folders stay numerically ordered.

    Parameters
    ----------
    input_dir : str
        Folder holding the source images. Subfolders are not descended into.
    output_train_count, output_valid_count, output_test_count : int
        Next free file index for each output folder.
    train, valid, test : int
        Maximum number of files to place in each split.
    train_dir, valid_dir, test_dir : str, optional
        Destination folders. When omitted, the notebook-level ``output_train``,
        ``output_valid`` and ``output_test`` paths are used.

    Returns
    -------
    tuple of int
        Updated ``(output_train_count, output_valid_count, output_test_count)``.
        An empty ``input_dir`` copies nothing and returns the counts unchanged.

    Raises
    ------
    FileNotFoundError
        If ``input_dir`` does not exist.
    """
    # Fall back to the notebook-level output folders when none are passed.
    train_dir = output_train if train_dir is None else train_dir
    valid_dir = output_valid if valid_dir is None else valid_dir
    test_dir = output_test if test_dir is None else test_dir

    # Only regular files directly inside input_dir, sorted so the split is
    # reproducible regardless of filesystem listing order.
    with os.scandir(input_dir) as entries:
        files = sorted(entry.name for entry in entries if entry.is_file())

    valid_test = test + valid
    total = valid_test + train

    output_test_count = _copy_numbered(files[:test], input_dir, test_dir, output_test_count)
    output_valid_count = _copy_numbered(files[test:valid_test], input_dir, valid_dir, output_valid_count)
    # Slicing clamps to the list length, so short folders need no special case.
    output_train_count = _copy_numbered(files[valid_test:total], input_dir, train_dir, output_train_count)

    # return it so it can be updated globally
    return output_train_count, output_valid_count, output_test_count

In [5]:
dirname = "/kaggle/input"

# Each entry pairs a dataset root (relative to dirname) with the category
# folders taken from it. The order is flowers/leaves, then weather, then landscape.
category_sources = [
    # flowers / leaves: 300 images from each of the three flower299 folders
    ('flower299/Flowers299', ["Chrysanthemum", "Peony", "Orchid"]),
    # leaves: 300 images from the leaf-detection training folder
    ('leaf-detection', ["train"]),
    # weather: 300 images from Cloudy and Sunrise, and all 253 images from Shine
    ('multiclass-weather-dataset/Multi-class Weather Dataset', ["Cloudy", "Shine", "Sunrise"]),
    # rime is used rather than snow because its images relate more closely to SongCi themes
    ("weather-dataset/dataset", ["rime"]),
    # landscape: 300 images from each of the Forest and Mountain folders
    (
        "landscape-recognition-image-dataset-12k-images/Landscape Classification/Landscape Classification/Training Data",
        ["Forest", "Mountain"],
    ),
    # waterfall: Huangguoshu is a single waterfall, but the images vary in
    # angle and lighting, so the 300 images taken are still diverse
    ("visual-china/Visual_China/train", ["Huangguoshu"]),
]

input_dirs = [
    os.path.join(dirname, dataset_root, folder)
    for dataset_root, folders in category_sources
    for folder in folders
]


In [6]:
# Copy every standard category with copy_image's default split sizes. The three
# counters are threaded through each call so file numbering continues across
# categories. The loop variable is not called `dir`, which would shadow the
# builtin dir() for the rest of the notebook session.
for input_dir in input_dirs:
    output_train_count, output_valid_count, output_test_count = copy_image(
        input_dir, output_train_count, output_valid_count, output_test_count
    )

In [7]:
# Special case for the combined River/Lake/Sea landscape category.
#
# River/Lake/Sea (92 lake, 98 river, and 110 sea = 300 in total)
# Each source gets its own quota so all three are represented approximately
# evenly in train/test/valid:
#   test and valid (each): 15 lake, 16 river, 19 sea
#   train:                 62 lake, 66 river, 72 sea
water_sources = [
    # (source folder, train, valid, test)
    (os.path.join(dirname, "river-vs-lake/River vs Lake", "lake water"), 62, 15, 15),
    (os.path.join(dirname, "river-vs-lake/River vs Lake", "river water"), 66, 16, 16),
    (os.path.join(dirname, "boat-vs-sea-images-dataset", "sea"), 72, 19, 19),
]

for water_dir, n_train, n_valid, n_test in water_sources:
    output_train_count, output_valid_count, output_test_count = copy_image(
        water_dir, output_train_count, output_valid_count, output_test_count,
        train=n_train, valid=n_valid, test=n_test,
    )

In [8]:
# Print the number of files in each output folder, in the order train, test, valid.
# Expected: 2353 images in the training set and 600 images each in the test
# and validation sets.
for split_dir in (output_train, output_test, output_valid):
    for _root, _subdirs, split_files in os.walk(split_dir):
        print(len(split_files))

2353
600
600
