## All Imports

In [5]:
%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import statsmodels.api as sm

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

  from pandas.core import datetools


## Reading Data and Test Sets

In [6]:
filepath = 'dataset'
X = pd.read_csv(filepath + '/dengue_features_train.csv')
Y = pd.read_csv(filepath + '/dengue_labels_train.csv')
T = pd.read_csv(filepath + '/dengue_features_test.csv')

## Function  Definitions

### Preprocess function 1

In [7]:
def preprocess1(data):
    # y['city']=y['city'].map({'iq':1,'sj':0})  Ordinal
    
    # X = X.interpolate()
    
    # X[X.columns] = StandardScaler().fit_transform(X)

    # encodes "city column"
    city_features = pd.get_dummies(data['city'], prefix='city_')
    # concats city_features to data
    data = pd.concat([city_features, data], axis=1)
    # drops columns
    dropping_columns = ['year','weekofyear','week_start_date','city']
    data = data.drop(dropping_columns, axis=1)
    # fills NaN values
    data.fillna(method='ffill', inplace=True) #bfill
    # normalize
    data[data.columns] = MinMaxScaler().fit_transform(data)
    
    return data

### Preprocess function 2

In [8]:
def preprocess_train(data):
    # fill NaN values
    data.interpolate(inplace=True)
    # seperate into two cities
    data_sj = data[data['city'] == "sj"]
    data_iq = data[data['city'] == "iq"]
    # drop columns
    dropping_columns = ['city']
    data_sj = data_sj.drop(dropping_columns, axis=1)
    data_iq = data_iq.drop(dropping_columns, axis=1)
    
    
    return X_sj, X_iq


### Drop Columns

In [None]:
def dropColumns(train,test):
    # drop columns having more than 10% NaN values
    NaNDic = (train.isnull().sum()*100/train.shape[0])>=10
    for i in train.columns.values:
        if(NaNDic[i]):
            train.drop(i,axis=1,inplace=True)
            test.drop(i,axis=1,inplace=True)

### Remove correlated columns

In [9]:
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold:
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset

### Plot HeatMap of features

In [10]:
def plotHeatMap(dataset):
    f,ax = plt.subplots(figsize=(18, 18))
    sn.heatmap(dataset.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

In [31]:
((T.isnull().sum()*100/X.shape[0]))

city                                     0.000000
year                                     0.000000
weekofyear                               0.000000
week_start_date                          0.000000
ndvi_ne                                  2.953297
ndvi_nw                                  0.755495
ndvi_se                                  0.068681
ndvi_sw                                  0.068681
precipitation_amt_mm                     0.137363
reanalysis_air_temp_k                    0.137363
reanalysis_avg_temp_k                    0.137363
reanalysis_dew_point_temp_k              0.137363
reanalysis_max_air_temp_k                0.137363
reanalysis_min_air_temp_k                0.137363
reanalysis_precip_amt_kg_per_m2          0.137363
reanalysis_relative_humidity_percent     0.137363
reanalysis_sat_precip_amt_mm             0.137363
reanalysis_specific_humidity_g_per_kg    0.137363
reanalysis_tdtr_k                        0.137363
station_avg_temp_c                       0.824176
