In [1]:
%%capture
!pip install rasterio

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Milestone 1: Data Collection, Exploration, and Preprocessing 

### 2. Data Exploration:

#### 1 Perform exploratory data analysis (EDA) to understand the composition of the images, 
including the number of bands and their relevance to land type classification. 
### Needed Bands:
[Sentinel 2 Bands and Combinations](https://gisgeography.com/sentinel-2-bands-combinations/)

### Load the data
#### 2 Inspect the dataset for potential issues, such as imbalanced classes, missing data, or mislabeled images.
- Class Imbalance: Check if some land types have significantly more images than others.
- *Solution:* Use data augmentation.

## Data Augmentation and Loading

In [3]:
# Root folder of the all-bands EuroSAT images and of the split CSV files
data_path = '/kaggle/input/eurosat-dataset/EuroSATallBands/'

# Read each split's CSV and cast its 'Label' column to str in one pass;
# the generators below are built with class_mode='sparse' on these labels.
train_df, validation_df, test_df = (
    pd.read_csv(data_path + split_name + '.csv').assign(
        Label=lambda frame: frame['Label'].astype(str)
    )
    for split_name in ('train', 'validation', 'test')
)

In [4]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Train Data Generator


In [5]:
# Random augmentations are applied on the fly, to the training split only.
train_datagen = ImageDataGenerator(
    rescale=1./255,        # multiply every pixel value by 1/255
    # --- geometric augmentations ---
    rotation_range=40,     # random rotation range, in degrees
    shear_range=20,        # shear intensity, in degrees
    zoom_range=0.2,        # random zoom range
    horizontal_flip=True,  # random left/right flips
    vertical_flip=True,    # random up/down flips
    fill_mode='nearest',   # how pixels created by a transform are filled
)

# Stream (image batch, label batch) pairs described by the rows of train_df.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,    # one row per image
    directory=data_path,   # folder the 'Filename' entries are relative to
    x_col='Filename',      # column holding the image file paths
    y_col='Label',         # column holding the class labels
    class_mode='sparse',   # determines how labels are returned
    target_size=(64, 64),  # images are resized to these dimensions
    batch_size=32,         # images per batch
    shuffle=True,          # shuffle the sample order
)

Found 19317 validated image filenames belonging to 10 classes.


### Validation Data Generator

In [6]:
# Validation images are only rescaled (no augmentation) so that metrics
# reflect the unmodified data.
validation_datagen = ImageDataGenerator(rescale=1./255)

validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_df,  # one row per validation image
    directory=data_path,      # folder the 'Filename' entries are relative to
    x_col='Filename',
    y_col='Label',
    # Must be spelled exactly `target_size`: flow_from_dataframe accepts
    # **kwargs, so a misspelled keyword is silently ignored and the default
    # (256, 256) is used, which does not match the 64x64 training images.
    target_size=(64, 64),
    batch_size=32,
    class_mode='sparse'
)

Found 5519 validated image filenames belonging to 10 classes.


### Test Data Generator

In [7]:
# Test images are only rescaled (no augmentation).
test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,     # one row per test image
    directory=data_path,   # folder the 'Filename' entries are relative to
    x_col='Filename',
    y_col='Label',
    # Must be spelled exactly `target_size`: flow_from_dataframe accepts
    # **kwargs, so a misspelled keyword is silently ignored and the default
    # (256, 256) is used, which does not match the 64x64 training images.
    target_size=(64, 64),
    batch_size=32,
    class_mode='sparse',
    # Keep a fixed sample order so that predictions line up with
    # test_generator.classes / test_generator.filenames during evaluation.
    shuffle=False
)

Found 2759 validated image filenames belonging to 10 classes.
