# Model 3B: Flood Classification Algorithm

This file classifies flood differences with food crises data

**Tasks**
* Prepare data for ML modeling (no missing values, normalise & split into train/test sets)
* Tune Random forest hyper-parameters with a grid search
* Create final model & evaluate
* Runtime: < 1 second

In [46]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import timeit
# Query the current working directory; the return value is not stored here.
# The relative 'Data/...' and 'Images/...' paths used further down resolve against it.
os.getcwd()

# Record the start time so the total running time can be printed at the end of the file
start = timeit.default_timer()

## 1. Read Data

In [47]:
# Load the flood-differenced input table into FE.
# The path is relative, so it is resolved against the current working directory.
flood_csv_path = 'Data/FC_flood_differenced_data.csv'
FE = pd.read_csv(flood_csv_path)


FileNotFoundError: [Errno 2] No such file or directory: 'Data/FC_flood_differenced_data.csv'

## 2. Model Preprocessing

In [None]:
# Remove the identifier columns (space/time are no longer needed because the lags account for them)
identifier_columns = ['country', 'admin_name', 'flood_date']
data = FE.drop(columns=identifier_columns)


**Addressing missing values**

 - If more than 25% of a column's data is NA, that column is removed
 - For all other missing values, the column mean is used - BUT need a better option!

In [None]:
# Establish how much data is missing: count the NaNs once per column and reuse the counts
na_counts = data.isnull().sum()
print(na_counts.sort_values(ascending=False).head())

# Columns with more than 25% of their values missing are dropped
percent = len(data) * 0.25  # absolute row-count threshold equivalent to 25% of the rows
variables_na = na_counts[na_counts > percent].index.tolist()
data = data.drop(columns=variables_na)

In [None]:
# Replace every remaining NaN with the mean of its column (SimpleImputer from sklearn.impute)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imp.fit_transform(data)

# The imputer returns a plain array, so rebuild the DataFrame with the original column names
data = pd.DataFrame(data=imputed_values, columns=data.columns)

# Confirm that no missing values are left
data.isnull().sum().sort_values(ascending=False).head()

**Normalize the data set**
- Scales each column to values between 0-1 to be better interpreted by the models

In [None]:
# Set the target column aside so that only the features are rescaled
predictor = data['flooding_diff']
data = data.drop(columns=['flooding_diff'])

# Scale every feature column to the 0-1 range
scaler = MinMaxScaler()
# The scaler returns a plain array. Rebuild an ordinary DataFrame (the table has no
# geometry, so a GeoDataFrame is not needed) and reuse the index of the frame that
# was scaled, so the target re-attached below aligns row by row.
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

# Re-attach the unscaled target under the name used by the classification steps
data['class'] = predictor

**Binary Classification**
* Differences between classifications do not have enough variance to apply regression.
The classes will be rounded so that classification can be performed.
* Binary classification to determine if there is a difference or not

In [None]:
# Round the differences up to whole numbers so they can be treated as classes
# NOTE(review): np.ceil maps values in (-1, 0] to 0, so small negative differences end up
# as "no difference" while small positive ones count as a difference - confirm this is intended.
rounded_diff = np.ceil(data['class'])

# Binary Classification: 0 = no difference, 1 = any non-zero difference
data['class'] = [int(value != 0) for value in rounded_diff]


## 3. Random Forest Classification

**Randomly split data into training and testing sets**

In [None]:
# A. SPLIT TRAIN/TEST SETS
# Separate the features (x) from the binary target (y)
x = data.drop('class', axis=1)
y = data['class']

# Random 70% train / 30% test split with a fixed seed
# (a random split is used because the time series is accounted for by lags)
split_parts = train_test_split(x, y, test_size=0.30, random_state=42)
x_train, x_test, y_train, y_test = split_parts

**Tune hyper-parameters**
* n_estimators = Number of trees in random forest
* max_features = Number of features to consider at every split
* max_depth = Maximum number of levels in tree
* min_samples_leaf = Minimum number of samples required at each leaf node
* min_samples_split = Minimum number of samples required to split a node
* bootstrap = Method of selecting samples for training each tree

Found: {'bootstrap': True,
 'max_depth': 71,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 15,
 'n_estimators': 9}

In [None]:
# param_grid = {'n_estimators': [int(x) for x in np.linspace(start = 1, stop = 80, num = 10)],
#                'max_depth': [int(x) for x in np.linspace(start = 1, stop = 80, num = 10)],
#                'max_features': ['auto', 'sqrt'],
#                'min_samples_leaf': [2, 5, 10],
#                'min_samples_split': [2, 5, 10, 15],
#                'bootstrap': [True, False]}
#
# # Create a based model
# rf = RandomForestClassifier()
#
# # Instantiate the grid search model
# rf_Grid = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)
# rf_Grid.fit(x_train,y_train)
#
# # Get best parameters
# rf_Grid.best_params_


**Fit Optimised Model**

In [None]:
# C. RESULTS
# Fit optimised model with the hyper-parameters listed under "Found" above
clf = RandomForestClassifier(
    n_estimators=9,
    min_samples_leaf=2,
    min_samples_split=15,
    max_features='sqrt',
    max_depth=71,
    bootstrap=True,
    random_state=42,  # same seed as the train/test split, so reruns give the same forest
)
clf.fit(x_train, y_train)


**Evaluate Model**
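* The cell below reports the train/test accuracy and a confusion matrix for each split.
* Previously recorded result (R square is not computed anywhere in this notebook): the R square of our RF model is 0.08355, which means that the percentage of explained variance is approx 8.355%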

In [None]:
# For plotting confusion matrix
def plot_confusion_matrix(train_pred, train, dom):
    """Show a labelled confusion-matrix heatmap for the binary flood classifier.

    Args:
        train_pred: Predicted class labels (0 = no difference, 1 = difference).
        train: True class labels, in the same order as ``train_pred``.
        dom: Name of the data split, used in the plot title (e.g. 'Training').

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Pin the label order so the tick labels always match the matrix rows and columns,
    # even when one of the classes is absent from the inputs.
    class_values = [0, 1]
    class_names = ['No Difference', 'Difference']

    # scikit-learn expects (y_true, y_pred): rows are the true classes, columns the predictions
    cf = confusion_matrix(train, train_pred, labels=class_values)

    # Open the figure BEFORE drawing. Opening it afterwards leaves the heatmap on the
    # previous figure and makes the new, empty figure the one that is shown or saved.
    plt.figure(dpi=80)
    sns.heatmap(cf, annot=True, yticklabels=class_names, xticklabels=class_names,
                cmap='Blues', fmt='g')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Flood {dom} Confusion matrix', fontweight ='bold', fontsize= 14)
    plt.tight_layout()
    # plt.savefig(f'Images/conmat_flood_{dom}.png')
    plt.show()

# Evaluate performance
# Predict on both splits with the fitted classifier
y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)

# Accuracy as a percentage for each split
train_accuracy_pct = metrics.accuracy_score(y_train_pred, y_train) * 100
test_accuracy_pct = metrics.accuracy_score(y_test_pred, y_test) * 100
print(f'Ensemble Train Model Accuracy(in %): {train_accuracy_pct}')
print(f'Ensemble Test Model Accuracy(in %): {test_accuracy_pct}')

# One confusion-matrix plot per split; train/test keep whatever the plotting helper returns
train = plot_confusion_matrix(y_train_pred, y_train, dom='Training')
test = plot_confusion_matrix(y_test_pred, y_test, dom='Testing')

In [None]:
# Evaluate variable importance

# for name, importance in zip(x_test.columns, clf.feature_importances_):
#     print(name, "=", importance)

features = x_test.columns
importances = clf.feature_importances_
# argsort is ascending, so the most important feature gets the highest bar position
indices = np.argsort(importances)

# Draw on a fresh figure rather than on whichever figure happens to be active
plt.figure()
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center') # color
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.tight_layout()  # keep long feature names inside the saved image

# savefig does not create missing directories, so make sure the output folder exists
os.makedirs('Images', exist_ok=True)
# NOTE(review): the file name says 'noflood' although this file is the flood model -
# confirm it does not overwrite the plot of another model.
plt.savefig('Images/importance_noflood.png')


# Report the wall-clock time elapsed since `start` was recorded at the top of the file
stop = timeit.default_timer()
elapsed_seconds = stop - start
print('Running Time: ', elapsed_seconds, 'seconds')