# Ensembling, normalising columns

In [5]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
sns.set(style="white", color_codes=True)

import matplotlib.pyplot as plt

import tensorflow as tf
import csv

from glob import glob
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, VarianceThreshold

import sys;

# Import Data

In [6]:
features_test = pd.read_csv('./input/dengue_features_test.csv', parse_dates=True, delimiter=',') # , index_col=0
features_train = pd.read_csv('./input/dengue_features_train.csv', parse_dates=True, delimiter=',')
labels_train = pd.read_csv('./input/dengue_labels_train.csv', parse_dates=True, delimiter=',')

In [7]:
features_test.shape

(416, 24)

# Drop Certain Features

In [8]:
#features_train.drop('year', axis=1, inplace=True)
#features_train.drop('weekofyear', axis=1, inplace=True)
features_train.drop('week_start_date', axis=1, inplace=True)

labels_train.drop('year', axis=1, inplace=True)
labels_train.drop('weekofyear', axis=1, inplace=True)
labels_train.drop('city', axis=1, inplace=True)

#features_train.drop('year', axis=1, inplace=True)
#features_train.drop('weekofyear', axis=1, inplace=True)
features_test.drop('week_start_date', axis=1, inplace=True)

# Recode Cities

In [9]:
features_train = features_train.replace('sj',0)
features_train = features_train.replace('iq',1)

labels_train = labels_train.replace('sj',0)
labels_train = labels_train.replace('iq',1)

features_test = features_test.replace('sj',0)
features_test = features_test.replace('iq',1)

In [10]:
features_test.shape

(416, 23)

# Normalize

In [11]:
features_train = (features_train -  features_train.mean()) / features_train.std()
features_test = (features_test -  features_test.mean()) / features_test.std()

In [12]:
features_test.shape

(416, 23)

# Creating New Features

In [13]:
sum(features_test["station_min_temp_c"]>26)

0

In [14]:
features_test["station_precip_mm"].mean()

-2.0583696952904362e-16

# Fill in NAs

In [15]:
features_train.fillna(method='bfill', inplace=True)
features_test.fillna(method='bfill', inplace=True)
labels_train.fillna(method='bfill', inplace=True)

In [16]:
features_test.shape

(416, 23)

# Split Train/Test

In [17]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(features_train, 
                                                                    labels_train, test_size=0.3, random_state=101)

In [18]:
#train, test, target, y_validation = model_selection.train_test_split(features_train, labels_train, 
                                                                    #test_size=0.3, random_state=101)

In [19]:
x_train.shape

(1019, 23)

In [20]:
x_train.head(2)

Unnamed: 0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
880,-0.7451,1.103561,-0.832484,-0.878288,-0.810861,-0.226457,-0.458419,1.027543,-0.678726,-1.078922,...,-0.088681,-0.831389,1.027543,-1.279466,-0.798638,-1.227057,-0.96076,-1.251679,-0.255485,0.01841
267,-0.7451,-1.115245,-0.16668,-0.375316,-0.423775,-0.298795,-0.552066,-0.585613,0.826999,0.534528,...,-0.54293,-0.422821,-0.585613,0.356104,-0.730159,0.453164,-0.276195,0.17739,0.443342,0.087949


In [21]:
x_val.shape

(437, 23)

In [22]:
y_train.head(2)

Unnamed: 0,total_cases
880,3
267,7


In [23]:
y_val.head(2)

Unnamed: 0,total_cases
18,24
104,24


In [24]:
features_test.shape

(416, 23)

In [25]:
import xgboost as xgb

In [26]:
from sklearn.cross_validation import KFold

In [27]:
xgbc = xgb.XGBRegressor(n_estimators = 100, # number of boosted trees
                             learning_rate = 0.01, # step size shrinkage used in update to prevent overfitting
                             max_depth = 7, # maximum depth of a tree
                             subsample = 0.7, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.7,
                             silent = 0) # subsample features
    
xgbc.fit(x_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=0, subsample=0.7)

In [28]:
    #Print the MAE

    #Training error
print('Training Error:', metrics.mean_absolute_error(y_train, xgbc.predict(x_train)))
    
    #Val error
print('Validation Error:', metrics.mean_absolute_error(y_val, xgbc.predict(x_val)))

Training Error: 11.9530039634
Validation Error: 11.3997063708


# Final XGBoost Model

In [29]:
features_test.shape

xgbc.fit(features_train, labels_train)
final = xgbc.predict(features_test)
final.shape

final = np.hstack(final)
final[final < 0] = 0
final = final.astype(dtype='int64')

np.savetxt("final15.csv", final, delimiter=",")

final



array([13, 15, 13, 16, 13, 15, 19, 18, 16, 18, 18, 20, 22, 31, 31, 20, 17,
       41, 21, 19, 43, 45, 43, 52, 52, 45, 48, 47, 49, 60, 52, 47, 39, 41,
       39, 15, 19, 19, 22, 20, 18, 16, 15, 13, 13, 14, 13, 13, 13, 12, 12,
       13, 13, 13, 14, 14, 16, 14, 14, 16, 20, 32, 21, 24, 23, 29, 35, 29,
       35, 50, 42, 29, 32, 33, 32, 36, 37, 34, 51, 31, 30, 33, 26, 28, 28,
       23, 23, 21, 17, 26, 84, 28, 25, 19, 26, 24, 25, 20, 21, 24, 24, 20,
       20, 21, 25, 29, 26, 28, 29, 41, 45, 36, 38, 46, 57, 82, 58, 63, 69,
       80, 77, 90, 32, 28, 55, 42, 40, 42, 38, 41, 41, 38, 38, 33, 32, 30,
       24, 32, 27, 10, 10, 11, 11, 10, 10,  9,  9,  9,  7,  5,  5,  4,  6,
        5,  5,  5,  4,  5,  4,  6,  7,  8,  8, 14, 12, 15, 15, 14, 12, 18,
       12, 18, 14, 15, 18, 11, 13, 14, 16, 16, 13, 12, 13, 16, 14, 14, 23,
       14, 11, 18, 12, 12, 10,  9,  8,  9,  9,  8,  7,  8,  7,  6,  5,  5,
        5,  6,  5,  6,  5,  6,  5,  6,  5,  5,  7, 12, 14, 14, 15,  9, 10,
       14, 11, 14, 17, 16