In [275]:
#%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np
import math

from matplotlib import pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import statsmodels.api as sm

from scipy import stats

from sklearn.ensemble import RandomForestRegressor

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

In [276]:
filepath = 'dataset'
X = pd.read_csv(filepath + '/dengue_features_train.csv')
Y = pd.read_csv(filepath + '/dengue_labels_train.csv')
T = pd.read_csv(filepath + '/dengue_features_test.csv')

# concating total cases to train data frame - beacuse of issue in removing outliers
X = pd.concat([Y['total_cases'], X], axis=1)

In [277]:
# plot heat map of dataset
def plotHeatMap(dataset):
    f,ax = plt.subplots(figsize=(18, 18))
    sn.heatmap(dataset.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

In [278]:
# remove correlated columns
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold:
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

In [279]:
# drop columns having more than 10% NaN values
NaNDic = (X.isnull().sum()*100/X.shape[0])>=10
for i in X.columns.values:
    if(NaNDic[i]):
        X.drop(i,axis=1,inplace=True)
        T.drop(i,axis=1,inplace=True)

In [280]:
# seperate into two cities
X_sj = X[X['city'] == "sj"]
X_iq = X[X['city'] == "iq"]
T_sj = T[T['city'] == "sj"]
T_iq = T[T['city'] == "iq"]
# drop columns
dropping_columns = ['city']
X_sj = X_sj.drop(dropping_columns, axis=1)
X_iq = X_iq.drop(dropping_columns, axis=1)
T_sj = T_sj.drop(dropping_columns, axis=1)
T_iq = T_iq.drop(dropping_columns, axis=1)
# fill NaN values
X_sj.interpolate(inplace=True)
X_iq.interpolate(inplace=True)
T_sj.interpolate(inplace=True)
T_iq.interpolate(inplace=True)

In [281]:
# remove outliers
X_sj = X_sj[(np.abs(stats.zscore(X_sj.drop(['year','weekofyear','week_start_date','total_cases'],axis=1))) < 5).all(axis=1)]
X_iq = X_iq[(np.abs(stats.zscore(X_iq.drop(['year','weekofyear','week_start_date','total_cases'],axis=1))) < 5).all(axis=1)]

In [282]:
# sperating total_cases label again
L_sj = pd.DataFrame(X_sj['total_cases'])
L_iq = pd.DataFrame(X_iq['total_cases'])

# drop total_cases and back X_sj,X_iq in dataset
X_sj = X_sj.drop(['total_cases'],axis=1)
X_iq = X_iq.drop(['total_cases'],axis=1)

In [283]:
# concating test and train
XandT_sj = pd.concat([X_sj, T_sj])
XandT_iq = pd.concat([X_iq, T_iq])

# stores droped columns from XandT_sj, XandT_iq
XandT_sj_rest = pd.DataFrame(XandT_sj[['year','week_start_date']])
XandT_iq_rest = pd.DataFrame(XandT_iq[['year','week_start_date']])

#
XandT_sj.drop(['year','week_start_date'], axis=1, inplace=True)
XandT_iq.drop(['year','week_start_date'], axis=1, inplace=True)

# scaling training set with test set together
XandT_sj[XandT_sj.columns] = MinMaxScaler().fit_transform(XandT_sj)
XandT_iq[XandT_iq.columns] = MinMaxScaler().fit_transform(XandT_iq)

XandT_sj = pd.concat([XandT_sj_rest, XandT_sj], axis=1)
XandT_iq = pd.concat([XandT_iq_rest, XandT_iq], axis=1)

# final scaled X
X_sj = XandT_sj[:926]
X_iq = XandT_iq[:515]

# final scaled T
T_sj = XandT_sj[926:]
T_iq = XandT_iq[515:]

In [284]:
# correlations
sj_correlations = pd.concat([X_sj, L_sj], axis=1).corr().total_cases.drop('total_cases')
iq_correlations = pd.concat([X_iq, L_iq], axis=1).corr().total_cases.drop('total_cases')

# low results
for i in X_sj.drop(['year','week_start_date'], axis=1).columns.values:
    X_sj[i] = X_sj[i] * np.absolute(sj_correlations[i]) * 100
    X_iq[i] = X_iq[i] * np.absolute(iq_correlations[i]) * 100
    
X_sj

Unnamed: 0,year,week_start_date,weekofyear,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,1990,1990-04-30,0.027126,0.000779,0.001119,0.000891,0.000317,0.008499,0.008132,0.014145,...,0.002327,0.008336,0.000317,0.012891,0.001817,0.013271,0.000618,0.010799,0.007622,0.000318
1,1990,1990-05-07,0.028722,0.000852,0.000930,0.000810,0.000582,0.011820,0.011627,0.021990,...,0.001304,0.012287,0.000582,0.020523,0.001449,0.019761,0.000504,0.019997,0.015244,0.000171
2,1990,1990-05-14,0.030317,0.000910,0.000903,0.000866,0.000881,0.014785,0.013802,0.029558,...,0.001898,0.016910,0.000881,0.028805,0.001347,0.019761,0.000528,0.021997,0.017322,0.000823
3,1990,1990-05-21,0.031913,0.001045,0.001271,0.001107,0.000392,0.015855,0.015550,0.028923,...,0.001011,0.015217,0.000392,0.027819,0.001531,0.023626,0.000590,0.026396,0.019055,0.000080
4,1990,1990-05-28,0.033509,0.001078,0.001394,0.001149,0.000192,0.018618,0.017726,0.031534,...,0.000887,0.015338,0.000192,0.030833,0.002368,0.031137,0.001152,0.033196,0.021133,0.000115
5,1990,1990-06-04,0.035104,0.000913,0.001410,0.000907,0.000244,0.019198,0.018225,0.031687,...,0.001926,0.014777,0.000244,0.030849,0.001062,0.026907,0.000627,0.030796,0.021133,0.000778
6,1990,1990-06-11,0.036700,0.000758,0.001153,0.001012,0.000089,0.016999,0.015514,0.031760,...,0.002806,0.016858,0.000089,0.030969,0.000980,0.023334,0.000590,0.021997,0.019055,0.000591
7,1990,1990-06-18,0.038296,0.000720,0.000873,0.000727,0.003853,0.018997,0.017048,0.035157,...,0.002181,0.018216,0.003853,0.035138,0.000306,0.028220,0.000788,0.028796,0.017322,0.000420
8,1990,1990-06-25,0.039891,0.000859,0.000738,0.000692,0.000493,0.018930,0.017191,0.034377,...,0.002727,0.017617,0.000493,0.034088,0.000755,0.028001,0.000723,0.028796,0.017322,0.000420
9,1990,1990-07-02,0.041487,0.000813,0.000921,0.000984,0.000367,0.021924,0.020793,0.035770,...,0.002065,0.016149,0.000367,0.035667,0.000939,0.028001,0.000534,0.028796,0.022866,0.000022


## Spliting test data set

In [285]:
# XTT = train_test_split(X, test_size=0.33, shuffle=False)
# X_train = XTT[0]
# X_test = XTT[1]

## Training Model

In [286]:
forest_model_sj = RandomForestRegressor(n_estimators=10000)
forest_model_sj.fit(X_sj.drop(['week_start_date','year'], axis=1), L_sj)

forest_model_iq = RandomForestRegressor(n_estimators=10000)
forest_model_iq.fit(X_iq.drop(['week_start_date','year'], axis=1), L_iq)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [287]:
forest_predict_sj = forest_model_sj.predict(T_sj.drop(['week_start_date','year'], axis=1))
forest_predict_iq = forest_model_iq.predict(T_iq.drop(['week_start_date','year'], axis=1))

In [288]:
predict_list = list((forest_predict_sj).astype(int)) + list((forest_predict_iq).astype(int))

## Write to CSV

In [289]:
# forest_predict_frame = pd.DataFrame((forest_predict).astype(int))
S = pd.read_csv(filepath + '/submission_format.csv')

S['total_cases'] = predict_list

S

S.to_csv(filepath + '/forest_test_new.csv', index=False)