In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.preprocessing import Imputer
import xgboost as xgb

In [13]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [3]:
X_train = pd.read_csv('dengue_features_train.csv', index_col=None)
y_train = pd.read_csv('dengue_labels_train.csv', index_col=None)
X_test = pd.read_csv('dengue_features_test.csv', index_col=None)

In [4]:
X_train['week_start_date'] = pd.to_datetime(X_train['week_start_date'])
X_test['week_start_date'] = pd.to_datetime(X_test['week_start_date'])

In [5]:
X_train['month'] = X_train['week_start_date'].dt.month
X_train['quarter'] = X_train['week_start_date'].dt.quarter
X_train = X_train.drop(['week_start_date'], axis=1)

In [6]:
X_test['month'] = X_test['week_start_date'].dt.month
X_test['quarter'] = X_test['week_start_date'].dt.quarter
X_test = X_test.drop(['week_start_date'], axis=1)

In [7]:
city_train = pd.get_dummies(X_train['city'])
X_train = pd.concat([X_train, city_train], axis=1)
X_train = X_train.drop(['city'], axis=1)
city_test = pd.get_dummies(X_test['city'])
X_test = pd.concat([X_test, city_test], axis=1)
X_test = X_test.drop(['city'], axis=1)

In [8]:
X_train.head()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,month,quarter,iq,sj
0,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4,2,0,1
1,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5,2,0,1
2,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,5,2,0,1
3,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,5,2,0,1
4,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,297.5,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,5,2,0,1


In [9]:
y_train = pd.read_csv('dengue_labels_train.csv', index_col=None)
y_train = y_train['total_cases']

In [10]:
model = xgb.XGBRegressor(colsample_bytree=0.4, gamma=0, reg_alpha=0.75, reg_lambda=0.45,
subsample=0.6, seed=42) 

In [11]:
parameters = {'learning_rate': [0.03,0.05,0.07,0.1], #so called `eta` value
              'max_depth': [3,4,5,6],
              'min_child_weight': [1.5,3,7,11],
              'subsample': [0.2,0.6,0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5,500,1000,2500],
              'reg_lambda': [0.2,0.45,0.6]}

In [17]:
clf = GridSearchCV(model, parameters, n_jobs=5, 
                   scoring='neg_mean_absolute_error',
                   verbose=2, refit=True)

In [18]:
clf.fit(X_train,y_train)

Fitting 3 folds for each of 2304 candidates, totalling 6912 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   55.5s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  3.4min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  7.2min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed: 12.5min
[Parallel(n_jobs=5)]: Done 1003 tasks      | elapsed: 20.0min
[Parallel(n_jobs=5)]: Done 1448 tasks      | elapsed: 31.1min
[Parallel(n_jobs=5)]: Done 1975 tasks      | elapsed: 43.5min
[Parallel(n_jobs=5)]: Done 2582 tasks      | elapsed: 54.5min


KeyboardInterrupt: 

In [None]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw MAE score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

In [19]:
result = model.predict(X_test)

In [22]:
X_test = pd.read_csv('dengue_features_test.csv', index_col=None)
result_columns = X_test[['city','year','weekofyear']]
result_cases = pd.DataFrame(result)
result_cases.columns = ['total_cases']
result_df = pd.concat([result_columns, result_cases], axis=1)
result_df['total_cases'] = result_df['total_cases'].round().astype(int)

In [24]:
result_df['total_cases'] = np.where(result_df['total_cases']<0, 0, result_df['total_cases'])

In [25]:
result_df.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,6
1,sj,2008,19,0
2,sj,2008,20,3
3,sj,2008,21,20
4,sj,2008,22,17


In [26]:
result_df.to_csv('xgboost.csv', index=False)