## Obtaining the dataset

We use the same code as the Capstone Project to download the dataset from Kaggle

In [1]:
#Delete sample folder
import shutil
import os
if os.path.exists("sample_data"):
  shutil.rmtree("sample_data")


#In order to use the kaggle dataset, here are the user credentials to download the *dataset*
import json
data = {"username": "eglqwevffjksjenodb","key": "e6eb0ddd811634e7292a46ce35ee14de"}
with open("kaggle.json", "w") as outfile:
    json.dump(data, outfile)

In [2]:
!mkdir ~/.kaggle

In [3]:
!mv kaggle.json ~/.kaggle/

In [4]:
!chmod 600 /root/.kaggle/kaggle.json

Download the dataset

In [5]:
!kaggle datasets download -d jehanbhathena/weather-dataset

Downloading weather-dataset.zip to /content
100% 586M/587M [00:26<00:00, 21.1MB/s]
100% 587M/587M [00:26<00:00, 23.6MB/s]


Unzip the file

In [6]:
!unzip -q weather-dataset.zip

Remove the zip file

In [7]:
os.remove("weather-dataset.zip")

## Exploratory Data Analysis

In [8]:
import os
from PIL import Image
import matplotlib.pyplot as plt

In [9]:
subfolders = [folder for folder in os.listdir('/content/dataset') if os.path.isdir(os.path.join('/content/dataset', folder))]
subfolders

['rainbow',
 'glaze',
 'rime',
 'hail',
 'sandstorm',
 'lightning',
 'rain',
 'frost',
 'fogsmog',
 'dew',
 'snow']

In [10]:
num_images = []
total = 0

for subfolder in subfolders:
    subfolder_path = os.path.join('/content/dataset', subfolder)
    images = [image for image in os.listdir(subfolder_path) if image.endswith('.jpg')]
    num_images.append(len(images))
    print(f"{subfolder}: {len(images)}")
    total += len(images)
print(f'Total images: {total}')

rainbow: 232
glaze: 639
rime: 1160
hail: 591
sandstorm: 692
lightning: 377
rain: 526
frost: 475
fogsmog: 851
dew: 698
snow: 621
Total images: 6862


## Creating the folders train, validation, and test

Copying the logic from the Capstone project codes

In [11]:
# Split Percentage (0 or 100 mean no spliting)
train_percentage = 0.70
validation_percentage = 0.20
test_percentage = 0.10

In [12]:
import random

def split_data(src_folder, train_folder, test_folder, validation_folder, train_percentage, test_percentage, validation_percentage, seed=None):
    # Set the random seed
    random.seed(seed)

    # Create train, test, and validation folders
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)
    os.makedirs(validation_folder, exist_ok=True)

    # Iterate over the folders in the source folder
    for folder_name in os.listdir(src_folder):
        folder_path = os.path.join(src_folder, folder_name)

        # Skip non-folder files
        if not os.path.isdir(folder_path):
            continue

        # Create the corresponding train, test, and validation subfolders
        train_subfolder = os.path.join(train_folder, folder_name)
        test_subfolder = os.path.join(test_folder, folder_name)
        validation_subfolder = os.path.join(validation_folder, folder_name)
        os.makedirs(train_subfolder, exist_ok=True)
        os.makedirs(test_subfolder, exist_ok=True)
        os.makedirs(validation_subfolder, exist_ok=True)

        # Iterate over the files in the current folder
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            # Decide whether to place the file in the train, test, or validation subfolder
            rand_value = random.uniform(0, 1)
            if rand_value < train_percentage:
                destination_folder = train_subfolder
            elif rand_value < train_percentage + test_percentage:
                destination_folder = test_subfolder
            else:
                destination_folder = validation_subfolder

            # Copy the file to the appropriate subfolder
            shutil.copy(file_path, destination_folder)

In [13]:
if os.path.exists("weather_dataset"):
  shutil.rmtree("weather_dataset")
os.makedirs(os.path.join('weather_dataset'))

src_folder = '/content/dataset'
train_folder = 'weather_dataset/train'
test_folder = 'weather_dataset/test'
validation_folder = 'weather_dataset/validation'

seed = 4840

split_data(src_folder, train_folder, test_folder, validation_folder, train_percentage, test_percentage, validation_percentage, seed)

print("train dew",len(os.listdir("weather_dataset/train/dew")))
print("train fogsmog",len(os.listdir("weather_dataset/train/fogsmog")))
print("train frost",len(os.listdir("weather_dataset/train/frost")))
print("train glaze",len(os.listdir("weather_dataset/train/glaze")))
print("train hail",len(os.listdir("weather_dataset/train/hail")))
print("train lightning",len(os.listdir("weather_dataset/train/lightning")))
print("train rain",len(os.listdir("weather_dataset/train/rain")))
print("train rainbow",len(os.listdir("weather_dataset/train/rainbow")))
print("train rime",len(os.listdir("weather_dataset/train/rime")))
print("train sandstorm",len(os.listdir("weather_dataset/train/sandstorm")))
print("train snow",len(os.listdir("weather_dataset/train/snow")))
print(".....")
print("validation dew",len(os.listdir("weather_dataset/validation/dew")))
print("validation fogsmog",len(os.listdir("weather_dataset/validation/fogsmog")))
print("validation frost",len(os.listdir("weather_dataset/validation/frost")))
print("validation glaze",len(os.listdir("weather_dataset/validation/glaze")))
print("validation hail",len(os.listdir("weather_dataset/validation/hail")))
print("validation lightning",len(os.listdir("weather_dataset/validation/lightning")))
print("validation rain",len(os.listdir("weather_dataset/validation/rain")))
print("validation rainbow",len(os.listdir("weather_dataset/validation/rainbow")))
print("validation rime",len(os.listdir("weather_dataset/validation/rime")))
print("validation sandstorm",len(os.listdir("weather_dataset/validation/sandstorm")))
print("validation snow",len(os.listdir("weather_dataset/validation/snow")))
print(".....")
print("test dew",len(os.listdir("weather_dataset/test/dew")))
print("test fogsmog",len(os.listdir("weather_dataset/test/fogsmog")))
print("test frost",len(os.listdir("weather_dataset/test/frost")))
print("test glaze",len(os.listdir("weather_dataset/test/glaze")))
print("test hail",len(os.listdir("weather_dataset/test/hail")))
print("test lightning",len(os.listdir("weather_dataset/test/lightning")))
print("test rain",len(os.listdir("weather_dataset/test/rain")))
print("test rainbow",len(os.listdir("weather_dataset/test/rainbow")))
print("test rime",len(os.listdir("weather_dataset/test/rime")))
print("test sandstorm",len(os.listdir("weather_dataset/test/sandstorm")))
print("test snow",len(os.listdir("weather_dataset/test/snow")))

train dew 476
train fogsmog 592
train frost 340
train glaze 475
train hail 435
train lightning 256
train rain 365
train rainbow 161
train rime 824
train sandstorm 483
train snow 457
.....
validation dew 161
validation fogsmog 187
validation frost 92
validation glaze 112
validation hail 104
validation lightning 84
validation rain 111
validation rainbow 40
validation rime 212
validation sandstorm 140
validation snow 111
.....
test dew 61
test fogsmog 72
test frost 43
test glaze 52
test hail 52
test lightning 37
test rain 50
test rainbow 31
test rime 124
test sandstorm 69
test snow 53


In [14]:
#os.removedirs("/content/weather_dataset")

## Running pre trained models

### ResNet50

In [14]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [15]:
train_data_dir = '/content/weather_dataset/train'
valid_data_dir = '/content/weather_dataset/validation'
image_size = (256, 256)
batch_size = 32

Including some augmentations

In [17]:
# Preprocessing
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)

valid_datagen = ImageDataGenerator(rescale=1.0/255)

In [18]:
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical'
)

valid_generator = valid_datagen.flow_from_directory(
    valid_data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False
)

Found 4864 images belonging to 11 classes.
Found 1354 images belonging to 11 classes.


In [20]:
# Loading the model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

In [21]:
# Customize the last layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)
# Define the number of classes
predictions = Dense(11, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

In [22]:
# Training
# 224, 224, epochs=10
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(train_generator, epochs=10, validation_data=valid_generator)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f48762abdc0>

In [23]:
# Training
# 256, 256, epochs=15
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(train_generator, epochs=15, validation_data=valid_generator)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7a66d81f78b0>

In [24]:
# Evaluation
loss, accuracy = model.evaluate(valid_generator)
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

Validation Loss: 0.8061, Validation Accuracy: 0.7637


Making predictions

In [25]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define the path to your test images folder
test_images_folder = '/content/weather_dataset/test'

# Get class labels
class_labels = list(train_generator.class_indices.keys())

# Preprocess and load the test images
test_datagen = ImageDataGenerator(rescale=1.0/255)
test_generator = test_datagen.flow_from_directory(
    test_images_folder,
    target_size=image_size,
    batch_size=1,
    class_mode='categorical',
    shuffle=False
)

# Predictions
predictions = model.predict(test_generator)

# Convert predictions to class labels
predicted_labels = [class_labels[np.argmax(pred)] for pred in predictions]

# Get ground truth labels
ground_truth_labels = test_generator.filenames
ground_truth_labels = [label.split('/')[0] for label in ground_truth_labels]  # Extract folder names

# Compare predictions with ground truth
for i in range(len(predicted_labels)):
    print(f"True Label: {ground_truth_labels[i]}, Predicted Label: {predicted_labels[i]}")

Found 644 images belonging to 11 classes.
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: glaze
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: hail
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True Label: dew, Predicted Label: dew
True 

Save the model

In [26]:
model.save(f"model-ResNet50.h5", save_format="h5")

Get the class labels

In [27]:
class_labels = list(train_generator.class_indices.keys())
class_labels

['dew',
 'fogsmog',
 'frost',
 'glaze',
 'hail',
 'lightning',
 'rain',
 'rainbow',
 'rime',
 'sandstorm',
 'snow']

Test the h5 model

In [26]:
import tensorflow as tf
from tensorflow import keras

# Load the model from the HDF5 file
model = keras.models.load_model('model-ResNet50.h5')

# Predictions
predictions = model.predict(test_generator)
predictions



array([[9.9740940e-01, 5.2697469e-06, 2.8281269e-04, ..., 2.3562036e-06,
        8.0061352e-07, 1.9143856e-06],
       [9.8543233e-01, 1.1448239e-03, 4.9290789e-04, ..., 2.1062298e-03,
        7.2225688e-05, 3.3877481e-04],
       [9.9292034e-01, 1.0776655e-03, 5.3092308e-04, ..., 3.5548161e-05,
        1.7504526e-04, 7.7518838e-05],
       ...,
       [1.6271093e-05, 9.6450351e-02, 8.5594959e-04, ..., 5.9069814e-03,
        3.5921868e-02, 6.2574142e-01],
       [5.0849398e-03, 6.9157422e-01, 4.9142103e-04, ..., 9.2166453e-04,
        1.6638269e-01, 2.1682715e-02],
       [4.6798750e-06, 2.3246500e-09, 9.9127167e-01, ..., 1.0506363e-04,
        1.4849947e-09, 8.1758043e-03]], dtype=float32)