In [1]:
%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np
import math

from matplotlib import pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import statsmodels.api as sm

from scipy import stats

from sklearn.ensemble import RandomForestRegressor

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

  from pandas.core import datetools


### Reading all dataset - train and test, Concating total_cases

In [2]:
filepath = 'dataset'
X = pd.read_csv(filepath + '/dengue_features_train.csv')
Y = pd.read_csv(filepath + '/dengue_labels_train.csv')
T = pd.read_csv(filepath + '/dengue_features_test.csv')

# concating total cases to train data frame - beacuse of issue in removing outliers
X = pd.concat([Y['total_cases'], X], axis=1)

### Plots HeatMap of a given dataset

In [3]:
# plot heat map of dataset
def plotHeatMap(dataset):
    f,ax = plt.subplots(figsize=(18, 18))
    sn.heatmap(dataset.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

### Drops features which have correlation (between two features) greater than or equal given threshold

In [4]:
# remove correlated columns
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if np.absolute(corr_matrix.iloc[i, j]) >= threshold:
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

### Drops columns having more than 10% NaN values

In [5]:
NaNDic = (X.isnull().sum()*100/X.shape[0])>=10
for i in X.columns.values:
    if(NaNDic[i]):
        X.drop(i,axis=1,inplace=True)
        T.drop(i,axis=1,inplace=True)

### Seperating into two cities, Filling null values - interpolation

In [6]:
# seperate into two cities
X_sj = X[X['city'] == "sj"]
X_iq = X[X['city'] == "iq"]
T_sj = T[T['city'] == "sj"]
T_iq = T[T['city'] == "iq"]
# drop columns
dropping_columns = ['city']
X_sj = X_sj.drop(dropping_columns, axis=1)
X_iq = X_iq.drop(dropping_columns, axis=1)
T_sj = T_sj.drop(dropping_columns, axis=1)
T_iq = T_iq.drop(dropping_columns, axis=1)
# fill NaN values
X_sj.interpolate(inplace=True)
X_iq.interpolate(inplace=True)
T_sj.interpolate(inplace=True)
T_iq.interpolate(inplace=True)

### Removing outliers

In [7]:
# remove outliers
X_sj = X_sj[(np.abs(stats.zscore(X_sj.drop(['year','weekofyear','week_start_date','total_cases'],axis=1))) < 5).all(axis=1)]
X_iq = X_iq[(np.abs(stats.zscore(X_iq.drop(['year','weekofyear','week_start_date','total_cases'],axis=1))) < 5).all(axis=1)]

In [8]:
# sperating total_cases label again
L_sj = pd.DataFrame(X_sj['total_cases'])
L_iq = pd.DataFrame(X_iq['total_cases'])

# drop total_cases and back X_sj,X_iq in dataset
X_sj = X_sj.drop(['total_cases'],axis=1)
X_iq = X_iq.drop(['total_cases'],axis=1)

### Scaling combined dataset of test and train - MinMax Scaler

In [9]:
# concating test and train
XandT_sj = pd.concat([X_sj, T_sj])
XandT_iq = pd.concat([X_iq, T_iq])

# stores droped columns from XandT_sj, XandT_iq
XandT_sj_rest = pd.DataFrame(XandT_sj[['year','week_start_date']])
XandT_iq_rest = pd.DataFrame(XandT_iq[['year','week_start_date']])

# drop to normalize
XandT_sj.drop(['year','week_start_date'], axis=1, inplace=True)
XandT_iq.drop(['year','week_start_date'], axis=1, inplace=True)

# scaling training set with test set together
XandT_sj[XandT_sj.columns] = MinMaxScaler().fit_transform(XandT_sj)
XandT_iq[XandT_iq.columns] = MinMaxScaler().fit_transform(XandT_iq)

XandT_sj = pd.concat([XandT_sj_rest, XandT_sj], axis=1)
XandT_iq = pd.concat([XandT_iq_rest, XandT_iq], axis=1)

# final scaled X
X_sj = XandT_sj[:926]
X_iq = XandT_iq[:515]

# final scaled T
T_sj = XandT_sj[926:]
T_iq = XandT_iq[515:]

### Assigning weights based on correlation

In [10]:
# correlations
sj_correlations = pd.concat([X_sj, L_sj], axis=1).corr().total_cases.drop('total_cases')
iq_correlations = pd.concat([X_iq, L_iq], axis=1).corr().total_cases.drop('total_cases')


# low results
for i in X_sj.drop(['year','week_start_date'], axis=1).columns.values:
    X_sj[i] = X_sj[i] * np.absolute(sj_correlations[i]) * 100
    X_iq[i] = X_iq[i] * np.absolute(iq_correlations[i]) * 100
    
X_sj

Unnamed: 0,year,week_start_date,weekofyear,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,1990,1990-04-30,9.417087,1.832800,2.420392,2.197934,0.418204,4.709989,4.678590,6.934130,...,1.571654,5.608605,0.418204,6.191710,2.742512,6.815411,1.733868,5.723606,4.340572,0.495131
1,1990,1990-05-07,9.971033,2.003298,2.011827,1.996160,0.768392,6.550343,6.689563,10.780070,...,0.881109,8.266810,0.768392,9.857449,2.187846,10.148222,1.413102,10.599271,8.681144,0.266133
2,1990,1990-05-14,10.524980,2.139837,1.953504,2.136176,1.163026,8.193075,7.941291,14.490187,...,1.281880,11.377536,1.163026,13.835084,2.033773,10.148222,1.482457,11.659198,9.864936,1.281151
3,1990,1990-05-21,11.078926,2.459549,2.749198,2.729188,0.517200,8.785940,8.946778,14.179223,...,0.682687,10.238170,0.517200,13.361464,2.311105,12.132930,1.655844,13.991037,10.851430,0.123783
4,1990,1990-05-28,11.632872,2.535523,3.016584,2.833620,0.253212,10.317510,10.198506,15.458820,...,0.599193,10.319756,0.253212,14.809277,3.574510,15.990003,3.233664,17.594789,12.035222,0.179485
5,1990,1990-06-04,12.186818,2.148188,3.051805,2.235554,0.322576,10.638646,10.485788,15.533880,...,1.301035,9.942181,0.322576,14.816978,1.602366,13.818059,1.759876,16.322877,12.035222,1.209976
6,1990,1990-06-11,12.740765,1.784355,2.494899,2.495656,0.117178,9.419977,8.926258,15.569623,...,1.895808,11.342435,0.117178,14.874737,1.479107,11.983140,1.655844,11.659198,10.851430,0.919086
7,1990,1990-06-18,13.294711,1.694339,1.888717,1.791413,5.088489,10.527483,9.808623,17.235244,...,1.473426,12.256015,5.088489,16.877031,0.462221,14.492110,2.210682,15.262950,9.864936,0.652954
8,1990,1990-06-25,13.848657,2.021035,1.595805,1.705450,0.650540,10.490429,9.890704,16.852795,...,1.842273,11.852826,0.650540,16.372607,1.140145,14.379768,2.028626,15.262950,9.864936,0.652954
9,1990,1990-07-02,14.402604,1.911841,1.992898,2.425410,0.485211,12.149630,11.963237,17.535485,...,1.394843,10.865248,0.485211,17.131168,1.417478,14.379768,1.499796,15.262950,13.021716,0.034040


### Training Models

In [11]:
forest_model_sj = RandomForestRegressor(n_estimators=10000)
forest_model_sj.fit(X_sj.drop(['week_start_date','year'], axis=1), L_sj)

forest_model_iq = RandomForestRegressor(n_estimators=10000)
forest_model_iq.fit(X_iq.drop(['week_start_date','year'], axis=1), L_iq)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [12]:
forest_predict_sj = forest_model_sj.predict(T_sj.drop(['week_start_date','year'], axis=1))
forest_predict_iq = forest_model_iq.predict(T_iq.drop(['week_start_date','year'], axis=1))

In [13]:
predict_list = list((forest_predict_sj).astype(int)) + list((forest_predict_iq).astype(int))

### Write to CSV

In [14]:
# forest_predict_frame = pd.DataFrame((forest_predict).astype(int))
S = pd.read_csv(filepath + '/submission_format.csv')

S['total_cases'] = predict_list

S

S.to_csv('results/forest_normalized_weighted.csv', index=False)