In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
# Use an (8, 8) default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (8, 8)

# Read the three dengue CSV files from the local ./data directory.
dengue_train = pd.read_csv('./data/dengue_features_train.csv')
dengue_test = pd.read_csv('./data/dengue_features_test.csv')
dengue_labels_train = pd.read_csv('./data/dengue_labels_train.csv')

# Report each frame's (rows, columns) as a quick sanity check on the load.
# The generator keeps its loop variables out of the notebook namespace.
print(
    *(
        f"{caption} {frame.shape}"
        for caption, frame in (
            ("Dengue Features Train shape:", dengue_train),
            ("Dengue Features Test shape:", dengue_test),
            ("Dengue Labels Train shape:", dengue_labels_train),
        )
    ),
    sep="\n",
)

Dengue Features Train shape: (1456, 24)
Dengue Features Test shape: (416, 24)
Dengue Labels Train shape: (1456, 4)


## Dengue Features Dataset
This describes the factors of climate and vegetation during the training period.
- city: City abbreviations: sj for San Juan and iq for Iquitos (string)
- week_start_date: Date given in yyyy-mm-dd format (string)
- weekofyear: The week number within the year (integer)
- year: The year in yyyy format (integer)

- station_max_temp_c: Maximum temperature (float)
- station_min_temp_c: Minimum temperature (float)
- station_avg_temp_c: Average temperature (float)
- station_precip_mm: Total precipitation (float)
- station_diur_temp_rng_c: Diurnal temperature range (float)

- precipitation_amt_mm: Total precipitation (float)

- reanalysis_sat_precip_amt_mm: Total precipitation (float)
- reanalysis_dew_point_temp_k: Mean dew point temperature (float)
- reanalysis_air_temp_k: Mean air temperature (float)
- reanalysis_relative_humidity_percent: Mean relative humidity (float)
- reanalysis_specific_humidity_g_per_kg: Mean specific humidity (float)
- reanalysis_precip_amt_kg_per_m2: Total precipitation (float)
- reanalysis_max_air_temp_k: Maximum air temperature (float)
- reanalysis_min_air_temp_k: Minimum air temperature (float)
- reanalysis_avg_temp_k: Average air temperature (float)
- reanalysis_tdtr_k: Diurnal temperature range (float)

- ndvi_se: Pixel southeast of city centroid (float)
- ndvi_sw: Pixel southwest of city centroid (float)
- ndvi_ne: Pixel northeast of city centroid (float)
- ndvi_nw: Pixel northwest of city centroid (float)

## Dengue Labels Dataset
This gives the weekly number of dengue cases reported for each city during the training period.
- city: City abbreviations: sj for San Juan and iq for Iquitos
- year: The year in yyyy format
- weekofyear: The week number within the year
- total_cases: The total number of cases active in that city for the week and year.

In [5]:
# Build the modelling table in one pass:
#   1. right-join the features onto the labels, so every label row is kept,
#      matching on (city, year, weekofyear);
#   2. encode the city as a number: 1 for 'sj', 0 for anything else;
#   3. discard week_start_date, the remaining non-numeric column.
# merge() returns a new frame, so the source frames are left untouched.
train_set = (
    dengue_train
    .merge(dengue_labels_train, how='right', on=['city', 'year', 'weekofyear'])
    .assign(city=lambda merged: np.where(merged.city == 'sj', 1, 0))
    .drop(columns=['week_start_date'])
)

# Remove every ROW that has at least one missing value, printing the shape
# before and after so the number of discarded rows is visible.
print(train_set.shape)
train_set = train_set.dropna()
print(train_set.shape)

# Preview the first rows of the cleaned table.
train_set.head()

(1456, 24)
(1199, 24)


Unnamed: 0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,1,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,1,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,1,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
3,1,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,...,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
4,1,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,...,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned rows for evaluation; `total_cases` is the target
# and every other column is a feature.
# random_state is pinned so the shuffle -- and therefore every score computed
# from this split -- is identical after a kernel restart and full re-run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_set.drop(columns=['total_cases']),
    train_set.total_cases,
    test_size=0.2,
    random_state=42,
)

## Exploratory Analysis
Exploring the coefficients

## Prediction Analysis
Using L1-Lasso, L2-Ridge, Decision Tree, Random Forest and kNN analysis because it was literally this week.


In [7]:
# Imports for analysis
from sklearn import metrics

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")