# ML

## Get Data from DB

In [290]:
import os
from urllib.parse import quote

import couchdb
import numpy as np
import pandas as pd

In [271]:
# Connection settings for the local CouchDB instance.
# The credentials can be overridden with the COUCHDB_USER / COUCHDB_PASSWORD
# environment variables so that real secrets never need to be saved in the
# notebook; the fallbacks are the original local-development values.
username = os.environ.get('COUCHDB_USER', 'admin')
password = os.environ.get('COUCHDB_PASSWORD', 'password')
db_url = 'http://127.0.0.1:5984/'
# Percent-encode the credentials so characters such as '@', ':' or '/' inside
# a user name or password cannot break the structure of the URL.
db_auth_url = f"http://{quote(username, safe='')}:{quote(password, safe='')}@127.0.0.1:5984/"

In [272]:
# Create the CouchDB server handle from the credential-bearing URL.
# NOTE(review): couchdb is already imported in the first cell, so this import
# is redundant (harmless, but it could be dropped).
import couchdb
couch = couchdb.Server(db_auth_url)
# Last expression of the cell: show the handle as the cell output.
couch

<Server 'http://127.0.0.1:5984/'>

In [273]:
# Connect to the two databases used in this notebook:
# 'gdp' (GDP values per country and year) and 'natural_disasters' (disaster events).
# NOTE(review): presumably this fails if either database does not exist on the
# server - confirm against the couchdb client documentation.
db_gdp = couch['gdp']
db_nat = couch['natural_disasters']

### Natural Disasters Data

In [274]:
# Fetch every document of the natural disasters database (document bodies
# included) and load them into a DataFrame, one row per document.
rows = db_nat.view(
    '_all_docs',
    include_docs=True,
)
data = [entry['doc'] for entry in rows]
df_nat = pd.DataFrame(data)

In [275]:
# TEMPORARY (flagged as such by the original author):
# remove the '_id' and '_rev' columns, which are not used by the analysis.
df_nat = df_nat.drop(columns=['_id', '_rev'])
df_nat.head()

Unnamed: 0,country,iso_country,year,type,group,event_name,region,continent,total_deaths,total_affected,duration
0,Bangladesh,BGD,1961,Storm,Meteorological,,Southern Asia,Asia,11000.0,-1.0,0
1,Bangladesh,BGD,1961,Storm,Meteorological,,Southern Asia,Asia,-1.0,-1.0,0
2,Bangladesh,BGD,1961,Storm,Meteorological,,Southern Asia,Asia,266.0,-1.0,30
3,Belize,BLZ,1961,Storm,Meteorological,Hattie,Central America,Americas,275.0,-1.0,1
4,Canada,CAN,1961,Drought,Climatological,,Northern America,Americas,-1.0,-1.0,364


In [276]:
# Disaster attributes that are later used as model inputs.
ml_features = [
    'type',
    'group',
    'total_deaths',
    'total_affected',
]

Preparing Dataset

In [277]:
# Turn the -1 placeholders in the impact columns into NaN so that the
# following dropna() can remove them.
# NOTE(review): -1 appears to be the marker for "value unknown" - confirm.
#
# The result is assigned back to the frame instead of calling
# `df_nat[col].replace(..., inplace=True)`: that form operates on a temporary
# column selection, triggers a chained-assignment warning on recent pandas
# versions and leaves `df_nat` unchanged once Copy-on-Write is active.
impact_columns = ['total_deaths', 'total_affected']
df_nat[impact_columns] = df_nat[impact_columns].replace(-1, np.nan)

In [278]:
# Keep only the events for which every model feature is present,
# then show how many rows remain.
df_nat = df_nat.dropna(axis=0, subset=ml_features)
len(df_nat)

7455

### Country GDP Data

In [279]:
# Fetch every document of the GDP database (document bodies included)
# and load them into a DataFrame, one row per document.
rows = db_gdp.view(
    '_all_docs',
    include_docs=True,
)
data = [entry['doc'] for entry in rows]
df_gdp = pd.DataFrame(data)

In [280]:
# TEMPORARY (flagged as such by the original author):
# remove the '_id' and '_rev' columns, which are not used by the analysis.
df_gdp = df_gdp.drop(columns=['_id', '_rev'])
df_gdp.head()

Unnamed: 0,iso_country,gdp_value,year
0,ZAF,387934600000.0,2019
1,ZAF,404842100000.0,2018
2,ZAF,381448800000.0,2017
3,ZAF,323585500000.0,2016
4,ZAF,346709800000.0,2015


## Combine Data

In [281]:
# Attach to every disaster the GDP of the affected country for the year before
# the event, the event year and the three following years, plus a relative
# GDP change computed from those values.
#
# This is done with one vectorised lookup per year offset instead of a Python
# loop over all rows: the loop re-filtered the whole GDP table for every single
# disaster and wrote each cell individually.

# Output column for each year offset relative to the disaster year.
gdp_offset_columns = {-1: 'gdp-1', 0: 'gdp', 1: 'gdp+1', 2: 'gdp+2', 3: 'gdp+3'}

# GDP values addressable by (iso_country, year). Duplicate country/year
# records are collapsed to the first one so every key resolves to exactly one
# value (the lookup below requires unique keys).
gdp_lookup = (
    df_gdp
    .drop_duplicates(subset=['iso_country', 'year'])
    .set_index(['iso_country', 'year'])['gdp_value']
)

df = df_nat.copy()

for offset, column in gdp_offset_columns.items():
    keys = pd.MultiIndex.from_arrays([df['iso_country'], df['year'] + offset])
    # Country/year pairs without a GDP record come back as NaN.
    # to_numpy() assigns by position, so the index of df is left untouched.
    df[column] = gdp_lookup.reindex(keys).to_numpy()

# gdp_change: average GDP of the three years after the disaster divided by the
# average GDP of the year before and the year of the disaster.
# It is NaN whenever one of the five GDP values is missing.
gdp_after = (df['gdp+1'] + df['gdp+2'] + df['gdp+3']) / 3
gdp_before = (df['gdp-1'] + df['gdp']) / 2
df['gdp_change'] = gdp_after / gdp_before

df.head()

Unnamed: 0,country,iso_country,year,type,group,event_name,region,continent,total_deaths,total_affected,duration,gdp-1,gdp,gdp+1,gdp+2,gdp+3,gdp_change
8,Hong Kong,HKG,1961,Storm,Meteorological,Olga,Eastern Asia,Asia,7.0,1171.0,0,1320797000.0,1383682000.0,1612346000.0,1935298000.0,2206466000.0,1.418416
12,Albania,ALB,1962,Earthquake,Geophysical,,Southern Europe,Europe,15.0,154.0,0,,,,,,
14,Colombia,COL,1962,Earthquake,Geophysical,,South America,Americas,47.0,300.0,0,4540448000.0,4955544000.0,4836167000.0,5973367000.0,5760762000.0,1.163319
22,Bangladesh,BGD,1963,Storm,Meteorological,,Southern Asia,Asia,22000.0,1000000.0,0,5081413000.0,5319458000.0,5386055000.0,5906637000.0,6439688000.0,1.136596
26,Costa Rica,CRI,1963,Volcanic activity,Geophysical,,Central America,Americas,15.0,5200.0,0,479180800.0,511902100.0,542578400.0,592981200.0,647305600.0,1.199271


Dropping rows with NaN values in one or more GDP columns

In [282]:
# Discard every event that lacks at least one of the five GDP values,
# then show how many rows remain.
gdp_columns = ['gdp-1', 'gdp', 'gdp+1', 'gdp+2', 'gdp+3']
df = df.dropna(subset=gdp_columns, how='any')
len(df)

7149

Combining multiple disasters from the same year and country into one row

In [283]:
# TODO: combine multiple disasters from the same year and country (not implemented yet)

## Machine Learning

In [284]:
# Modelling table: the selected disaster features plus the target column.
ml_columns = [*ml_features, 'gdp_change']
df_ml = df[ml_columns]
df_ml.head()

Unnamed: 0,type,group,total_deaths,total_affected,gdp_change
8,Storm,Meteorological,7.0,1171.0,1.418416
14,Earthquake,Geophysical,47.0,300.0,1.163319
22,Storm,Meteorological,22000.0,1000000.0,1.136596
26,Volcanic activity,Geophysical,15.0,5200.0,1.199271
38,Storm,Meteorological,300.0,800.0,1.220549


### Categorical Data

In [285]:
# Distinct disaster types present in the modelling table.
# NOTE(review): the displayed values include 'Extreme temperature ' with a
# trailing space - consider stripping whitespace before encoding.
df_ml.loc[:, 'type'].unique()

array(['Storm', 'Earthquake', 'Volcanic activity', 'Flood', 'Landslide',
       'Epidemic', 'Wildfire', 'Mass movement (dry)',
       'Extreme temperature ', 'Drought', 'Animal accident'], dtype=object)

In [286]:
# Distinct disaster groups present in the modelling table.
df_ml.loc[:, 'group'].unique()

array(['Meteorological', 'Geophysical', 'Hydrological', 'Biological',
       'Climatological'], dtype=object)

In [287]:
# Expand the two categorical columns into one indicator column per category;
# the numeric columns are passed through unchanged.
categorical_columns = ['type', 'group']
one_hot_encoded_data = pd.get_dummies(data=df_ml, columns=categorical_columns)
one_hot_encoded_data

Unnamed: 0,total_deaths,total_affected,gdp_change,type_Animal accident,type_Drought,type_Earthquake,type_Epidemic,type_Extreme temperature,type_Flood,type_Landslide,type_Mass movement (dry),type_Storm,type_Volcanic activity,type_Wildfire,group_Biological,group_Climatological,group_Geophysical,group_Hydrological,group_Meteorological
8,7.0,1171.0,1.418416,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
14,47.0,300.0,1.163319,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
22,22000.0,1000000.0,1.136596,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
26,15.0,5200.0,1.199271,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
38,300.0,800.0,1.220549,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13693,10.0,50000.0,0.765751,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
13695,11.0,500052.0,0.765751,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
13699,12.0,11000.0,0.954276,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
13700,3.0,100000.0,1.150856,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


### Splitting Data

In [289]:
# Separate the inputs (every column except the target) from the target.
target_column = 'gdp_change'
X = one_hot_encoded_data.drop(columns=target_column)
y = one_hot_encoded_data[target_column]

In [291]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [293]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

In [302]:
from sklearn.metrics import mean_absolute_error

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

# The '.2%' format spec multiplies by 100 and rounds only for display.
# Rounding first and multiplying afterwards (`mae.round(4)*100`) can expose
# floating-point artifacts such as 18.869999999999997 in the printed text.
print(f'The models mean absolute error when predicting the gdp change after a natural disaster is {mae:.2%}')

The models mean absolute error when predicting the gdp change after a natural disaster is 18.87%


###