# Cloudmasking
*by Felix*

Notebook to test how the cloudmasking work. For actually cloudmasking the data please use the feature_engineering.py .
We first need to import all the needed modules.

In [2]:
# import the needed modules
import numpy as np
import pandas as pd
import os

Set up Working directory

In [10]:
#path = os.getcwd()
path = '/Users/felixbehrendt/neuefische/Radiant-Earth-Spot-Crop/'
# Set Workign directory and print
os.chdir(path)
print(f'Current Working directory: {path}')

Current Working directory: /Users/felixbehrendt/neuefische/Radiant-Earth-Spot-Crop/


Load data from preprocessing module

In [12]:
df = pd.read_csv('data/mean_band_perField_perDate.csv')
df.head()

Unnamed: 0,field_id,date,label,B02,B03,B04,B08,B11,B12,CLM
0,1,2017-04-01,4,21.934084,29.180065,35.55466,62.490353,68.3971,46.04019,255.0
1,1,2017-04-11,4,14.844051,23.114147,30.607718,58.736336,73.43569,48.863342,0.0
2,1,2017-04-21,4,13.385852,21.596462,29.223473,57.065918,73.66881,49.313503,0.0
3,1,2017-05-01,4,15.408361,22.471062,29.371382,56.434082,71.05788,46.557877,0.0
4,1,2017-05-11,4,54.829582,65.73955,72.90675,95.67203,66.14791,58.643085,255.0


The data has only two values for the Cloudmask(CLM) - 0 : unclouded and 255 : No Information See EDA_cloud_mask.ipynb. </br>

Currently its possible to choose between two options:
1. Keep all values              --> delete CLM column
2. Keep rows with CLM == 0      --> delete CLM column

In [35]:
def drop_unknown_fun(df:pd.DataFrame, verbose:bool=False) -> pd.DataFrame:
    """Takes the Data and removes all rows with unknown Cloudinformation

    Args:
        df (pd.DataFrame): Full Dataset
        verbose (Boolean): Print information about loose of information (rows), Default to False

    Returns:
        pd.DataFrame: Dataset without clouds or dropped Cloud column
    """

    # create subset with only the data that have no cloud and drop CLM Column
    df_wo_cloud = df[df.CLM == 255]

    # Print Loose of information
    if verbose:
        print(f'Rows without unknown:             {df_wo_cloud.shape[0]}')
        print(f'Rows with unknown:                {df.shape[0]}')
        print(f'Precentage of remaining Data:     {round((df_wo_cloud.shape[0] / df.shape[0]) * 100, 3)} %')

    return df_wo_cloud

def delete_CLM_column(df:pd.DataFrame) -> pd.DataFrame:
    """Deletes the CLM column

    Args:
        df (pd.DataFrame): Data with CLM column

    Returns:
        pd.DataFrame: Data without CLM column
    """
    return df.drop('CLM', axis=1)


def cloud_mask(df:pd.DataFrame, drop_unknown:bool = False, verbose:bool = False) -> pd.DataFrame:
    """ Handle cloudy data in the dataset

    Args:
        df (pd.DataFrame): Dataset (independently from dataset)
        drop_unknown (bool): Decide whether to drop unknown data or not. Default to False
        verbose (bool, optional): Print information about loose of information (rows). Defaults to False.

    Returns:
        pd.DataFrame: returns data as df without cloudinformation
    """
    if drop_unknown:
        df = drop_unknown_fun(df, verbose)
    
    return delete_CLM_column(df)


First a short look on the data by keeping the unknown data:

In [15]:
df_woCLM = cloud_mask(df)
df_woCLM.head()

print(f'Rows of Original Data: {df.shape[0]}')
print(f'Rows of Without Cloud Data: {df_woCLM.shape[0]}')

Rows of Original Data: 4301227
Rows of Without Cloud Data: 4301227


First a short look on the data by deleting the unknown data:

In [36]:
df_woCLM_2 = cloud_mask(df, drop_unknown=True, verbose=True)
df_woCLM_2.head()

print(f'Rows of Original Data: {df.shape[0]}')
print(f'Rows of Without Cloud Data: {df_woCLM_2.shape[0]}')

Rows without unknown:             1473025
Rows with unknown:                4301227
Precentage of remaining Data:     34.247 %
Rows of Original Data: 4301227
Rows of Without Cloud Data: 1473025
