In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from tensorflow.keras.callbacks import ModelCheckpoint # type: ignore
from tensorflow.keras.layers import LSTM, Dense, Input, Reshape, Dropout # type: ignore
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

In [None]:
# Locations of the train / test CSV files (relative paths, resolved against the working directory)
path_train = '1-Resources/Technical Assesments/Task 1/train_land_cover_assignment.csv'
# NOTE(review): path_test is not read by any cell visible in this notebook - confirm it is used later
path_test = '1-Resources/Technical Assesments/Task 1/test_land_cover_assignment.csv'
# Load the training data; the exploration and preparation cells below derive their frames from data_in
data_in = pd.read_csv(path_train)

## 1. EXPLORE DATASET

### 1a. Categorical Columns

In [None]:
# Extract Categorical Columns (the columns stored with the `object` dtype)
cats = data_in.select_dtypes(include='object')
# Preview the first 5 rows
cats.head(5)

In [None]:
# Check for Null values: number of missing entries in each categorical column
cats.isna().sum()

In [None]:
# ------------------------------------------------------------------
# COUNT PLOTS FOR VALUE DISTRIBUTION
# ------------------------------------------------------------------


def dist_list(df):
    """Draw one count plot per column of ``df`` on a 3-column grid of subplots.

    The number of grid rows is derived from the number of columns, so the
    figure is only as tall as needed and frames with many columns do not
    overflow the grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    grid_cols = 3
    columns = list(df.columns)
    # Ceiling division; at least one row so that the figure height is never zero
    grid_rows = max(1, -(-len(columns) // grid_cols))

    # 3.75 in per row keeps the panel height of the former fixed 16-row, 60 in figure
    plt.figure(figsize = (15, 3.75 * grid_rows))

    # Count plot for each column by iteration
    for i, col in enumerate(columns, 1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.countplot(data = df, x = col)
        plt.title(f'{col}')

    plt.tight_layout()
    plt.show()

dist_list(cats)

### Insights

1. The woody vegetation cover (`wcover`) column contains the classes >60% as well as >30% and <30%. Later on the data will be subset to only >60%.
    The area is predominantly >60% wood cover.

2. Building and Cropland each have 2 classifications: No and Yes.
    The area has little building and cropland coverage.

3. There are no NULL or EMPTY Rows in this Categorical data


### =======================================================================================================================================


### 1b. Numerical Columns

In [None]:
# Extract Numerical Columns (every column with a numeric dtype)
numerics = data_in.select_dtypes(include='number')
# Preview the first 5 rows
numerics.head(5)

In [None]:
# ------------------------------------------------------------------
# HISTOGRAM PLOTS FOR NUMERICAL DISTRIBUTIONS
# ------------------------------------------------------------------


def dist_list(df):
    """Draw one histogram per column of ``df`` on a 3-column grid of subplots.

    The number of grid rows is derived from the number of columns, so the
    figure is only as tall as needed and frames with many columns do not
    overflow the grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    grid_cols = 3
    columns = list(df.columns)
    # Ceiling division; at least one row so that the figure height is never zero
    grid_rows = max(1, -(-len(columns) // grid_cols))

    # 2.5 in per row keeps the panel height of the former fixed 16-row, 40 in figure
    plt.figure(figsize = (20, 2.5 * grid_rows))

    # Hist Plots for each column by iteration
    for i, col in enumerate(columns, 1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.histplot(data = df, x = col)
        plt.title(f'{col}')

    plt.tight_layout()
    plt.show()

dist_list(numerics)

In [None]:
# ------------------------------------------------------------------
# CORRELOGRAM PLOT
# ------------------------------------------------------------------

# Pairwise correlation of the numeric columns, drawn as a heatmap whose cells are
# annotated with the coefficient formatted to one decimal place
fig, ax = plt.subplots(figsize = (20, 20))
sns.heatmap(data = numerics.corr(), fmt = '.1f', annot = True, ax = ax)
plt.show()

## 2. PREPARE DATA FOR ML

### 2a. Handle Null Rows

In [None]:
# Count the missing values of every numeric column in a one-column frame named 'Nulls'
nulls = numerics.isna().sum().rename('Nulls').to_frame()
# Keep only the columns that actually have missing values
nulls_0 = nulls.loc[nulls['Nulls'].gt(0)]
# Transposed for a compact, single-row display
nulls_0.T

In [None]:
# EXPLORE THE DISTRIBUTION OF VALUES IN COLUMNS WITH NULLS

# Extract Null column names: the row labels of nulls_0 are the names of the columns with nulls
null_cols = nulls_0.index.to_numpy()

# Reuse the histogram helper `dist_list` from section 1b instead of defining
# an identical copy of it a third time
dist_list(data_in[null_cols])

The columns with nulls hold continuous values, so we can fill these null values by interpolation instead of dropping the rows.

In [None]:
# Interpolate the null values using the nearest value
# Only the numeric columns are interpolated: the categorical (object) columns have no
# nulls, and pandas deprecates calling interpolate on object-dtype columns.
data_in_2 = data_in.copy()
data_in_2[numerics.columns] = (
    data_in[numerics.columns]
    .interpolate(method='nearest')
    # 'nearest' cannot fill nulls located before the first or after the last valid
    # value of a column; take those from the closest valid neighbour instead
    .ffill()
    .bfill()
)
# Total number of nulls left in the prepared frame (expected: 0)
data_in_2.isna().sum().sum()

### 2b. Handle Duplicated Rows

In [None]:
# Count the rows of the interpolated frame that are exact duplicates of an earlier row
data_in_2.duplicated().sum()
# There are no duplicates

### 2c. Categorical Columns Encoding

In [None]:
# Subset the data to WoodCover > 60%
# drop=True discards the old row labels instead of inserting them as an extra
# 'index' column; this also keeps the cell safe to re-run (inserting 'index' a
# second time raises "cannot insert index, already exists").
data_in_2 = data_in_2[data_in_2['wcover'] == '>60%'].reset_index(drop=True)
data_in_2.head(5)

In [None]:
# Encode Categorical Columns
# Each categorical column is mapped to ordinal codes. The encoded frame reuses the
# row labels of data_in_2 so the concat below always aligns row by row, and its column
# names are derived from the source columns rather than paired by position
# (presumably encode_building, encode_cropland, encode_wcover - verify against cats.columns).
encoder = OrdinalEncoder()
codes = pd.DataFrame(
    encoder.fit_transform(data_in_2[cats.columns]),
    columns = [f'encode_{col}' for col in cats.columns],
    index = data_in_2.index,
)

# Merge the 2 dfs on index
data_in_3 = pd.concat([codes, data_in_2], axis = 1)

# Confirm Shapes after Join
print('Codes', codes.shape)
print('Data 2', data_in_2.shape)
print('Data 3', data_in_3.shape)

In [None]:
# Preview the first 7 rows of the merged frame (encoded columns followed by the original columns)
data_in_3.head(7)

### 2d. Numerical Columns Scaling

Use a MinMaxScaler to scale data to range 0 - 1

In [149]:
# Select Numeric Columns and drop the subid column
numeric_features = data_in_3[numerics.columns].drop(columns = 'subid')

# Create names for Scaled Columns
scaled_cols = [f'scaled_{col}' for col in numeric_features.columns]

# Scale Numeric Columns with a MinMaxScaler; the scaled frame keeps the row
# labels of its source so the concat below aligns row by row
scaler = MinMaxScaler()
data_in_4 = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns = scaled_cols,
    index = numeric_features.index,
)

# Merge the encoded and the scaled columns on index
data_in_5 = pd.concat([codes, data_in_4], axis = 1)

# Confirm Shapes after Join (labels name the frames that are actually printed)
print('Codes', codes.shape)
print('Data 4', data_in_4.shape)
print('Data 5', data_in_5.shape)

Codes (7062, 3)
Data 2 (7062, 45)
Data 3 (7062, 48)


In [150]:
# Preview the first 7 rows of the model-ready frame (encoded + scaled columns)
data_in_5.head(7)

Unnamed: 0,encode_building,encode_cropland,encode_wcover,scaled_lat,scaled_lon,scaled_bcount,scaled_x,scaled_y,scaled_bd20,scaled_bio1,...,scaled_mlon,scaled_nppm,scaled_npps,scaled_ph20,scaled_sirm,scaled_sirs,scaled_slope,scaled_snd20,scaled_soc20,scaled_tim
0,0.0,0.0,0.0,0.476351,0.633374,0.0,0.630476,0.46265,0.535604,0.810127,...,0.633374,0.35772,0.239044,0.821918,0.97822,0.743898,0.031763,0.801205,0.128342,0.184195
1,0.0,1.0,0.0,0.449686,0.871674,0.0,0.864381,0.426709,0.513932,0.822785,...,0.871676,0.289508,0.241364,0.657534,0.982638,0.644541,0.093402,0.668675,0.176471,0.191906
2,0.0,1.0,0.0,0.723253,0.906103,0.0,0.913524,0.699436,0.321981,0.860759,...,0.906102,0.23208,0.381851,0.780822,0.977256,0.652925,0.065564,0.644578,0.101604,0.266803
3,0.0,0.0,0.0,0.531858,0.357902,0.0,0.362667,0.529951,0.371517,0.746835,...,0.357901,0.389908,0.263494,0.547945,0.980492,0.62602,0.072418,0.680723,0.256684,0.302724
4,0.0,1.0,0.0,0.684215,0.43385,0.0,0.446857,0.680056,0.325077,0.734177,...,0.433847,0.370338,0.332989,0.452055,0.9806,0.686772,0.083805,0.283133,0.256684,0.406026
5,0.0,0.0,0.0,0.276752,0.773859,0.0,0.759238,0.256871,0.702786,0.759494,...,0.773861,0.798366,0.265504,0.438356,0.985573,0.650259,0.053663,0.518072,0.326203,0.194304
6,0.0,1.0,0.0,0.816221,0.2304,0.0,0.257143,0.82241,0.213622,0.746835,...,0.230396,0.2969,0.367112,0.60274,0.984314,0.674381,0.045365,0.271084,0.219251,0.532458


## 3. ML MODELLING

In [None]:
# MODEL ARCHITECTURES
# ! rm *.keras*

epochs_n = 350 # Number of Training Epochs
l_rate = 0.001 # Learning Rate

# Single-LSTM network: a (seq_length, 5) input feeds a 76-unit LSTM that returns only
# its last output, a Dense layer sized from close_y_train.shape[1], and a Reshape that
# adds a trailing axis of length 1.
# NOTE(review): seq_length and close_y_train are not defined in any cell shown above,
# and the frames prepared in section 2 are not referenced here - confirm where these
# inputs come from.
model_lstm_close = Sequential([
    Input(shape=(seq_length, 5)),
    LSTM(76, activation='relu', return_sequences=False),
    Dense(close_y_train.shape[1]),
    Reshape((close_y_train.shape[1], 1)),
])
model_lstm_close.compile(
    loss='mse',
    optimizer=Adam(learning_rate=l_rate),
    metrics = ['mae'],
)

In [None]:
# Train for epochs_n epochs in batches of 32, validating on (close_x_test, close_y_test);
# the checkpoint callback writes to 'best_close_model2.keras' with save_best_only=True
model_lstm_close.fit(
    x=close_x_train,
    y=close_y_train,
    batch_size=32,
    epochs=epochs_n,
    validation_data=(close_x_test, close_y_test),
    callbacks=[ModelCheckpoint('best_close_model2.keras', save_best_only=True)],
)