In [4]:
%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np
import math

from matplotlib import pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import statsmodels.api as sm

from scipy import stats

from sklearn.ensemble import RandomForestRegressor

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

In [5]:
filepath = '../dataset'
X = pd.read_csv(filepath + '/dengue_features_train.csv')
Y = pd.read_csv(filepath + '/dengue_labels_train.csv')
T = pd.read_csv(filepath + '/dengue_features_test.csv')

# concating total cases to train data frame - beacuse of issue in removing outliers
X = pd.concat([Y['total_cases'], X], axis=1)

In [6]:
# plot heat map of dataset
def plotHeatMap(dataset):
    f,ax = plt.subplots(figsize=(18, 18))
    sn.heatmap(dataset.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

In [7]:
# remove correlated columns
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if np.absolute(corr_matrix.iloc[i, j]) >= threshold:
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

In [8]:
# drop columns having more than 10% NaN values
NaNDic = (X.isnull().sum()*100/X.shape[0])>=10
for i in X.columns.values:
    if(NaNDic[i]):
        X.drop(i,axis=1,inplace=True)
        T.drop(i,axis=1,inplace=True)

In [9]:
# seperate into two cities
X_sj = X[X['city'] == "sj"]
X_iq = X[X['city'] == "iq"]
T_sj = T[T['city'] == "sj"]
T_iq = T[T['city'] == "iq"]
# drop columns
dropping_columns = ['city']
X_sj = X_sj.drop(dropping_columns, axis=1)
X_iq = X_iq.drop(dropping_columns, axis=1)
T_sj = T_sj.drop(dropping_columns, axis=1)
T_iq = T_iq.drop(dropping_columns, axis=1)
# fill NaN values
X_sj.interpolate(inplace=True)
X_iq.interpolate(inplace=True)
T_sj.interpolate(inplace=True)
T_iq.interpolate(inplace=True)

In [10]:
# remove outliers
X_sj = X_sj[(np.abs(stats.zscore(X_sj.drop(['year','weekofyear','week_start_date','total_cases'],axis=1))) < 5).all(axis=1)]
X_iq = X_iq[(np.abs(stats.zscore(X_iq.drop(['year','weekofyear','week_start_date','total_cases'],axis=1))) < 5).all(axis=1)]

In [11]:
# sperating total_cases label again
L_sj = pd.DataFrame(X_sj['total_cases'])
L_iq = pd.DataFrame(X_iq['total_cases'])

# drop total_cases and back X_sj,X_iq in dataset
X_sj = X_sj.drop(['total_cases'],axis=1)
X_iq = X_iq.drop(['total_cases'],axis=1)

In [12]:
# concating test and train
XandT_sj = pd.concat([X_sj, T_sj])
XandT_iq = pd.concat([X_iq, T_iq])

# stores droped columns from XandT_sj, XandT_iq
XandT_sj_rest = pd.DataFrame(XandT_sj[['year','week_start_date']])
XandT_iq_rest = pd.DataFrame(XandT_iq[['year','week_start_date']])

#
XandT_sj.drop(['year','week_start_date'], axis=1, inplace=True)
XandT_iq.drop(['year','week_start_date'], axis=1, inplace=True)

# scaling training set with test set together
XandT_sj[XandT_sj.columns] = MinMaxScaler().fit_transform(XandT_sj)
XandT_iq[XandT_iq.columns] = MinMaxScaler().fit_transform(XandT_iq)

XandT_sj = pd.concat([XandT_sj_rest, XandT_sj], axis=1)
XandT_iq = pd.concat([XandT_iq_rest, XandT_iq], axis=1)

# final scaled X
X_sj = XandT_sj[:926]
X_iq = XandT_iq[:515]

# final scaled T
T_sj = XandT_sj[926:]
T_iq = XandT_iq[515:]

In [16]:
# correlations
sj_correlations = pd.concat([X_sj, L_sj], axis=1).corr().total_cases.drop('total_cases')
iq_correlations = pd.concat([X_iq, L_iq], axis=1).corr().total_cases.drop('total_cases')


# low results
for i in X_sj.drop(['year','week_start_date'], axis=1).columns.values:
    X_sj[i] = X_sj[i] * np.absolute(sj_correlations[i]) * 100
    X_iq[i] = X_iq[i] * np.absolute(iq_correlations[i]) * 100
    
X_sj

year                                    -0.212506
weekofyear                               0.288052
ndvi_nw                                  0.042507
ndvi_se                                 -0.046217
ndvi_sw                                  0.040561
precipitation_amt_mm                     0.075728
reanalysis_air_temp_k                    0.180454
reanalysis_avg_temp_k                    0.173806
reanalysis_dew_point_temp_k              0.203985
reanalysis_max_air_temp_k                0.193488
reanalysis_min_air_temp_k                0.186487
reanalysis_precip_amt_kg_per_m2          0.148030
reanalysis_relative_humidity_percent     0.148630
reanalysis_sat_precip_amt_mm             0.075728
reanalysis_specific_humidity_g_per_kg    0.208200
reanalysis_tdtr_k                       -0.066252
station_avg_temp_c                       0.194726
station_diur_temp_rng_c                  0.035631
station_max_temp_c                       0.188667
station_min_temp_c                       0.175596


## Spliting test data set

In [285]:
# XTT = train_test_split(X, test_size=0.33, shuffle=False)
# X_train = XTT[0]
# X_test = XTT[1]

## Training Model

In [286]:
forest_model_sj = RandomForestRegressor(n_estimators=10000)
forest_model_sj.fit(X_sj.drop(['week_start_date','year'], axis=1), L_sj)

forest_model_iq = RandomForestRegressor(n_estimators=10000)
forest_model_iq.fit(X_iq.drop(['week_start_date','year'], axis=1), L_iq)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [287]:
forest_predict_sj = forest_model_sj.predict(T_sj.drop(['week_start_date','year'], axis=1))
forest_predict_iq = forest_model_iq.predict(T_iq.drop(['week_start_date','year'], axis=1))

In [288]:
predict_list = list((forest_predict_sj).astype(int)) + list((forest_predict_iq).astype(int))

## Write to CSV

In [289]:
# forest_predict_frame = pd.DataFrame((forest_predict).astype(int))
S = pd.read_csv(filepath + '/submission_format.csv')

S['total_cases'] = predict_list

S

S.to_csv(filepath + '/forest_test_new.csv', index=False)