## Model 2: Machine learning algorithm

In [68]:
# import libraries: standard env, +pysal, + jupyter, + matplotlib
#### import libraries ####
import os
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import timeit
# Record the notebook start time so the final cell can report the total runtime.
# The bare `os.getcwd()` call that preceded this line was removed: it was not the
# last expression of the cell, so its return value was silently discarded.
start = timeit.default_timer()

## Read Data

In [69]:
# Load the FS table from its CSV file (path is relative to the working directory).
# The preprocessing cells below read from `FS`.
FS = pd.read_csv(filepath_or_buffer='Data/FS_data.csv')


In [70]:
# **Combine processed FS shapefile w/ sars data**

## Model Preprocessing

In [71]:
# Keep the administrative codes aside before they are removed from the feature table.
admin_code = FS['admin_code']

# Identifier, time and geometry columns are not used as model inputs
# (space/time is no longer needed because the lag features account for it).
non_feature_columns = [
    'country', 'admin_code', 'admin_name',
    'year_month', 'year', 'month',
    'geometry',
]
data = FS.drop(columns=non_feature_columns)

**Addressing missing values**

 - If more than 25% of a column's values are NA, that column is removed
 - All other missing values are replaced with the column mean (TODO: find a better imputation strategy)

In [72]:
# Count the missing values per column once and show the five worst columns.
na_counts = data.isnull().sum()
print(na_counts.sort_values(ascending=False).head())

# Collect and drop every column with more than 554 missing entries.
# NOTE(review): 554 is a hard-coded row count, while the accompanying text describes
# a 25% threshold -- presumably 554 is a quarter of the rows; confirm, and consider
# deriving the cut-off from len(data) instead.
variables_na = [column for column in data if na_counts[column] > 554]
data = data.drop(columns=variables_na)

timelag2     540
timelag1     270
fews_ipc       0
ndvi_mean      0
rain_mean      0
dtype: int64


In [73]:
# Replace every remaining NaN with the mean of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imp.fit_transform(data)

# Wrap the transformed values back into a DataFrame with the original column names.
# No index is passed, so the result carries a fresh default index.
data = pd.DataFrame(data=imputed_values, columns=data.columns)

# Sanity check: the largest per-column NaN counts should now all be zero.
remaining_na = data.isnull().sum()
remaining_na.sort_values(ascending=False).head()

fews_ipc       0
ndvi_mean      0
rain_mean      0
et_mean        0
acled_count    0
dtype: int64

**Normalize the data set**
- Scales each column to values between 0 and 1 so that all features are on a comparable scale for the models
- CHECK: how normalization impacts model accuracy

In [74]:
# Separate the target from the features so that only the features are rescaled.
fews_ipc = data['fews_ipc']
data = data.drop(columns=['fews_ipc'])

# Min-max scale every feature column to the [0, 1] range.
scaler = MinMaxScaler()

# Build a plain pandas DataFrame rather than a GeoDataFrame: the geometry column was
# dropped earlier, so the scaled table holds no spatial data and needs none of the
# geopandas machinery. The un-displayed mid-cell `data.head()` / `data.describe()`
# calls were removed because their results were never shown or used.
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=FS.index)

# Re-attach the unscaled target under the name 'class'.
# NOTE(review): this assignment aligns on index labels, so it relies on FS.index
# matching the index of `fews_ipc` (both appear to be default indexes here -- confirm).
data['class'] = fews_ipc

**Add Binary Classification**
- Using column quartiles as a suitable threshold base
- CHECK: how classification complexity impacts models
- NOTE: Using 3+ classes would be more informative in our model explorations

In [75]:
# Collapse the target into two classes stored as floats:
# 0.0 where the value equals 1, and 1.0 for every other value.
# NOTE(review): the accompanying text mentions quartiles, but the rule applied here
# is a fixed comparison against 1 -- confirm which one is intended.
data['class'] = (data['class'] != 1).astype(float)




# Report how much time has passed since `start` was recorded in the first cell,
# as measured by timeit.default_timer().
stop = timeit.default_timer()
elapsed = stop - start
print('Time: ', elapsed)