In [1]:
# Import packages
import os
import random

# Silence TensorFlow's native (C++) log output. The variable is set *before*
# TensorFlow is imported because the native logger reads it when it emits its
# first message, which can already happen during the import; setting it
# afterwards may therefore have no effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import tensorflow as tf

from matplotlib import image
from sklearn.metrics import confusion_matrix
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam

In [2]:
def load_metadata(data_folder, train_file, validation_file, test_file, column_names):
    """Reads the .csv files containing metadata (e.g. file location, class label) about the data images.

    The first line of every file is skipped (presumably the file's own header row)
    and the columns are named after `column_names` instead, so all three frames
    share the same column names.

    Parameters:
    data_folder (str): path to the folder containing the csv files. A trailing
        path separator is optional.
    train_file (str): file name of the file containing metadata for the training set.
    validation_file (str): file name of the file containing metadata for the validation set.
    test_file (str): file name of the file containing metadata for the test set.
    column_names (list): column names to assign to the csv files.

    Returns:
    tuple: the training, validation, and test dataframes, in that order.
    """
    def read_split(file_name):
        # os.path.join inserts the separator only when it is missing, so the
        # folder may be given with or without a trailing slash.
        return pd.read_csv(os.path.join(data_folder, file_name),
                           names=column_names,
                           skiprows=1)

    return read_split(train_file), read_split(validation_file), read_split(test_file)


In [3]:
# Where the EuroSAT dataset lives under the Kaggle input directory.
input_folder = "/kaggle/input/"
dataset_folder = input_folder + "eurosat-dataset/" + "EuroSAT/"

# Column names imposed on each of the metadata CSV files.
column_names = ["id", "path", "class_id", "class_name"]

# Read the metadata of the three data splits.
training_df, validation_df, test_df = load_metadata(
    data_folder=dataset_folder,
    train_file="train.csv",
    validation_file="validation.csv",
    test_file="test.csv",
    column_names=column_names,
)

In [4]:
# Distinct class names of the training set, in order of first appearance.
labels = list(training_df["class_name"].unique())

# Undersampling and Oversampling approach

In [5]:
# Define the functions

def extract_unique_names(df, column=str):
    """
    Return the distinct values of one dataframe column as a list.

    df: the dataframe to read the labels from
    column: str, name of the column holding the label information
            (the `=str` default only hints at the expected type; pass the
            column name explicitly)

    Returns: list of the unique values, in the order `Series.unique()` yields them
    """
    return list(df[column].unique())

def get_min(df, target_column=str, classes=list):
    """
    Get the class with the least amount of instances in the dataset (minority class).

    df: the dataframe for which to find the minority class
    target_column: str, df column containing information about classes/labels
    classes: list, a list of all unique classes contained in the dataframe

    The `=str` / `=list` defaults only hint at the expected types; both
    arguments have to be passed explicitly.

    Returns: a tuple (min_count, min_variable) with the smallest per-class count
    and the class it belongs to. On ties the class listed first in `classes`
    wins. If `classes` is empty, (len(df), None) is returned.

    Raises: KeyError if a class in `classes` does not occur in `target_column`.
    """
    # Count every class once; the counts do not change between iterations.
    counts = df[target_column].value_counts()

    min_count = len(df)
    min_variable = None
    for label in classes:
        count = counts[label]
        # The first class is always accepted. Otherwise a class that makes up
        # the whole dataframe (count == len(df)) would never be selected and
        # min_variable would stay unbound.
        if min_variable is None or count < min_count:
            min_count = count
            min_variable = label
    return min_count, min_variable

def get_max(df, target_column=str, classes=list):
    """
    Get the class with the most instances in the dataset (majority class).

    df: the dataframe for which to find the majority class
    target_column: str, df column containing information about classes/labels
    classes: list, a list of all unique classes contained in the dataframe

    The `=str` / `=list` defaults only hint at the expected types; both
    arguments have to be passed explicitly.

    Returns: a tuple (max_count, max_variable) with the largest per-class count
    and the class it belongs to. On ties the class listed first in `classes`
    wins. If `classes` is empty, (0, None) is returned.

    Raises: KeyError if a class in `classes` does not occur in `target_column`.
    """
    # Count every class once; the counts do not change between iterations.
    counts = df[target_column].value_counts()

    max_count = 0
    max_variable = None
    for label in classes:
        count = counts[label]
        # The first class is always accepted so that max_variable is bound
        # even when no count exceeds the initial 0.
        if max_variable is None or count > max_count:
            max_count = count
            max_variable = label
    return max_count, max_variable

def undersample(df, target_column=str, classes=list, random_state=None):
    """
    Undersample the dataset so that all labels have the same amount of instances.
    This is recommended if one has a lot of data.

    df: the dataframe to undersample
    target_column: str, the name of the column containing class/label information
    classes: list, a list containing all unique classes present in the dataset
    random_state: optional seed (or random generator) forwarded to
        `DataFrame.sample` for every class, so the selection can be reproduced.
        The default None keeps the sampling non-deterministic, as before.

    The `=str` / `=list` defaults only hint at the expected types; both
    arguments have to be passed explicitly.

    Returns: a dataframe containing as many instances per class as the amount of instances of the
    minority class (the class with the least instances among `classes`) in the original data frame.
    The rows are those returned by `groupby(target_column).apply(...)`.
    """
    # Size of the smallest class; the class name itself is not needed here.
    min_count, _ = get_min(df, target_column, classes)

    def sample_group(group):
        # Never ask for more rows than the group holds (a group that is not
        # listed in `classes` may be smaller than min_count).
        return group.sample(n=min(min_count, len(group)), random_state=random_state)

    # NOTE(review): recent pandas releases deprecate handing the grouping
    # column to the applied function - verify against the installed version.
    return df.groupby(target_column).apply(sample_group)

## Undersampling

In [6]:
# Unique class names of the training, validation, and test sets

train_classes = extract_unique_names(training_df, "class_name")
val_classes   = extract_unique_names(validation_df, "class_name")
test_classes  = extract_unique_names(test_df, "class_name")

# list.sort() sorts in place and returns None, so comparing its return values
# is always "equal". Sort the three lists first, then compare the lists.
for split_classes in (train_classes, val_classes, test_classes):
    split_classes.sort()

# `classes` is defined unconditionally so that the statements below can run
# even when the warning is printed.
classes = train_classes
if not (train_classes == val_classes == test_classes):
    print("!!Warning: Classes are different for the training, validation, and test sets!!")

# Minority class (and its size) of every split
train_min_count, train_min_variable = get_min(training_df, "class_name", classes)
val_min_count, val_min_variable = get_min(validation_df, "class_name", classes)
test_min_count, test_min_variable = get_min(test_df, "class_name", classes)

print("Min count, Min variable")
print("Train", train_min_count, train_min_variable)
print("Valid", val_min_count, val_min_variable)
print("Test ", test_min_count, test_min_variable)


Min count, Min variable
Train 1400 Pasture
Valid 400 Pasture
Test  200 Pasture


In [7]:
# Balance every split by undersampling its classes.
train_df_under = undersample(df=training_df, target_column="class_name", classes=classes)
val_df_under = undersample(df=validation_df, target_column="class_name", classes=classes)
test_df_under = undersample(df=test_df, target_column="class_name", classes=classes)

# Report the size of each balanced split.
# (Use <dataframe>["class_name"].value_counts() to inspect the per-class counts.)
print(f"Undersampled training dataframe length: {len(train_df_under)}")
print(f"\nUndersampled validation dataframe length: {len(val_df_under)}")
print(f"\nUndersampled test dataframe length: {len(test_df_under)}\n")


Undersampled training dataframe length: 14000

Undersampled validation dataframe length: 4000

Undersampled test dataframe length: 2000



### Start with the CNN part

In [8]:
# Pixel values are stored in the range [0, 255]; multiplying them by 1/255
# rescales them to [0, 1].
rescaling_factor = 1.0 / 255

# Generators that produce batches of image tensors. To perform augmentation,
# pass the additional parameters to training_generator.
training_generator = ImageDataGenerator(rescale=rescaling_factor)
test_generator = ImageDataGenerator(rescale=rescaling_factor)

# The input images are 64x64 pixels.
image_width = image_height = 64

# Number of images per batch (128 is a common choice) - presumably used when
# the generators are wired to the dataframes further down.
batch_size = 128
