## All Imports

In [None]:
%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import statsmodels.api as sm

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

## Reading the Training and Test Data

In [None]:
# Directory (relative to the current working directory) that holds the CSV files.
filepath = 'dataset'
# Load the three CSV files into module-level DataFrames used by later cells.
# Going by the file names: X = training features, Y = training labels,
# T = test features (contents are not verified here).
X = pd.read_csv(filepath + '/dengue_features_train.csv')
Y = pd.read_csv(filepath + '/dengue_labels_train.csv')
T = pd.read_csv(filepath + '/dengue_features_test.csv')

## Function Definitions

### Preprocess function 1

In [None]:
def preprocess1(data):
    """Build a scaled feature table from a raw feature frame.

    Steps, in order:

    1. one-hot encode the ``city`` column (``prefix='city_'``, so with
       pandas' default ``_`` separator the new columns are named
       ``city__<value>``);
    2. drop ``year``, ``weekofyear``, ``week_start_date`` and ``city``;
    3. forward-fill missing values;
    4. rescale every column with a freshly fitted ``MinMaxScaler``
       (default settings).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``city``, ``year``, ``weekofyear`` and
        ``week_start_date``. All remaining columns are handed to
        ``MinMaxScaler`` and are therefore expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in by the caller is not modified.

    Notes
    -----
    * Forward fill has nothing to copy into missing values that sit in the
      first row(s) of a column, so those stay NaN and reach the scaler
      as NaN.
    * The scaler is fitted on whatever frame is passed in, so two separate
      calls (e.g. one per data set) are scaled independently of each other.
    """
    # One-hot encode the city and put the indicator columns first.
    city_features = pd.get_dummies(data['city'], prefix='city_')
    data = pd.concat([city_features, data], axis=1)

    # Remove the date bookkeeping columns and the now-encoded city column.
    dropping_columns = ['year', 'weekofyear', 'week_start_date', 'city']
    data = data.drop(dropping_columns, axis=1)

    # DataFrame.ffill() is the supported spelling of a forward fill;
    # fillna(method='ffill') is deprecated in recent pandas releases.
    data = data.ffill()

    # Normalize every column with a scaler fitted on this frame only.
    data[data.columns] = MinMaxScaler().fit_transform(data)

    return data

### Remove correlated columns

In [None]:
def correlation(dataset, threshold):
    """Remove, in place, columns that are highly correlated with a kept column.

    The correlation matrix of ``dataset`` is computed once. Columns are then
    visited in order; a column is removed when its correlation with any
    *earlier column that is still kept* is greater than or equal to
    ``threshold``. Columns that were already removed are not used as a
    reason to remove further columns, so a column is only dropped when
    something it correlates with actually remains in the frame.

    Only the signed coefficient is compared (no absolute value), so strongly
    negative correlations never trigger a removal.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float
        Correlation value at or above which the later column of a pair is
        removed.

    Returns
    -------
    set
        Names of the columns that were removed (empty if none were).
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    col_corr = set()  # names of the columns selected for removal

    for i in range(1, len(names)):
        for j in range(i):
            # A column that is itself being removed must not cause the
            # removal of another one.
            if names[j] in col_corr:
                continue
            # NaN coefficients compare False and are therefore ignored.
            if corr_matrix.iloc[i, j] >= threshold:
                col_corr.add(names[i])
                break  # one kept, correlated predecessor is enough

    to_drop = [name for name in dataset.columns if name in col_corr]
    if to_drop:
        dataset.drop(to_drop, axis=1, inplace=True)

    return col_corr

### Plot HeatMap of features

In [None]:
def plotHeatMap(dataset):
    """Draw an annotated heatmap of ``dataset.corr()`` on a new figure.

    A new 18x18 figure is created and the correlation matrix of ``dataset``
    is drawn onto its axes with seaborn. Each cell is annotated with its
    value formatted to one decimal place. Nothing is returned.
    """
    _, heat_axes = plt.subplots(figsize=(18, 18))
    sn.heatmap(
        dataset.corr(),
        annot=True,
        linewidths=.5,
        fmt='.1f',
        ax=heat_axes,
    )