In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.preprocessing import Imputer

In [2]:
df_X_train = pd.read_csv('dengue_features_train.csv', index_col=None)

In [3]:
df_y_train = pd.read_csv('dengue_labels_train.csv', index_col=None)

In [4]:
X_train = df_X_train
X_train['week_start_date'] = pd.to_datetime(X_train['week_start_date'])

In [23]:
y_train = df_y_train[['city','total_cases']]

In [25]:
y_train_sj = df_y_train[df_y_train['city']=='sj']['total_cases']
y_train_iq = df_y_train[df_y_train['city']=='iq']['total_cases']

In [26]:
y_train_iq.head()

936    0
937    0
938    0
939    0
940    0
Name: total_cases, dtype: int64

In [5]:
X_train['week_start_date'] = X_train['week_start_date'] - pd.to_datetime('1980-01-01')

In [6]:
X_train['week_start_date'] = pd.to_numeric(X_train['week_start_date'])/100000000000

In [7]:
df_city = pd.get_dummies(X_train['city'])

In [8]:
X_train_new = pd.concat([X_train, df_city], axis=1)

In [9]:
X_train_new = X_train_new.drop(['city'], axis=1)

In [10]:
X_train_new = X_train_new.fillna(X_train_new.mean())

In [11]:
df_X_test = pd.read_csv('dengue_features_test.csv', index_col=None)

In [12]:
X_test = df_X_test

In [13]:
X_test['week_start_date'] = pd.to_datetime(X_test['week_start_date'])
X_test['week_start_date'] = X_test['week_start_date'] - pd.to_datetime('1980-01-01')
X_test['week_start_date'] = pd.to_numeric(X_test['week_start_date'])/100000000000
df_city_test = pd.get_dummies(X_test['city'])
X_test_new = pd.concat([X_test, df_city_test], axis=1)
X_test_new = X_test_new.drop(['city'], axis=1)
X_test_new = X_test_new.fillna(X_test_new.mean())

In [15]:
X_train_sj = X_train_new[X_train_new['sj']==1]

In [17]:
X_train_iq = X_train_new[X_train_new['iq']==1]

In [19]:
X_train_sj = X_train_sj.drop(['sj','iq'], axis=1)
X_train_iq = X_train_iq.drop(['sj','iq'], axis=1)

In [21]:
X_test_sj = X_test_new[X_test_new['sj']==1]
X_test_iq = X_test_new[X_test_new['iq']==1]
X_test_sj = X_test_sj.drop(['sj','iq'], axis=1)
X_test_iq = X_test_iq.drop(['sj','iq'], axis=1)

In [22]:
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["mse"]}

In [27]:
tscv = TimeSeriesSplit(n_splits=3)
my_cv = TimeSeriesSplit(n_splits=3).split(X_train_sj)
regr = RandomForestRegressor(max_depth=30, random_state=0)
grid_search_sj = GridSearchCV(regr, cv=my_cv, param_grid=param_grid)
grid_search_sj.fit(X_train_sj, y_train_sj)

GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x00000289465BDFC0>,
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, None], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [True, False], 'criterion': ['mse']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
grid_search_sj.best_params_

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 3,
 'max_features': 3,
 'min_samples_leaf': 10,
 'min_samples_split': 2}

In [30]:
best_regr_sj = RandomForestRegressor(max_depth=3, bootstrap=True, max_features=3, min_samples_leaf=10, min_samples_split=2, n_estimators=30, random_state=0)

In [31]:
best_regr_sj.fit(X_train_sj, y_train_sj)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [33]:
result = best_regr_sj.predict(X_test_sj)

In [34]:
result_columns = df_X_test[df_X_test['city']=='sj'][['city','year','weekofyear']]
result_cases = pd.DataFrame(result)
result_cases.columns = ['total_cases']
result_df = pd.concat([result_columns, result_cases], axis=1)
result_df['total_cases'] = result_df['total_cases'].round().astype(int)
result_df.to_csv('rnd_forest_grid_time_sj.csv', index=False)

In [35]:
tscv = TimeSeriesSplit(n_splits=3)
my_cv = TimeSeriesSplit(n_splits=3).split(X_train_iq)
regr = RandomForestRegressor(max_depth=30, random_state=0)
grid_search_iq = GridSearchCV(regr, cv=my_cv, param_grid=param_grid)
grid_search_iq.fit(X_train_iq, y_train_iq)

GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x00000289465CEAF0>,
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, None], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [True, False], 'criterion': ['mse']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
grid_search_iq.best_params_

{'bootstrap': False,
 'criterion': 'mse',
 'max_depth': 3,
 'max_features': 1,
 'min_samples_leaf': 10,
 'min_samples_split': 2}

In [39]:
best_regr_iq = RandomForestRegressor(max_depth=3, bootstrap=False, max_features=1, min_samples_leaf=10, min_samples_split=2, n_estimators=30, random_state=0)

In [40]:
best_regr_iq.fit(X_train_iq, y_train_iq)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=3,
           max_features=1, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [41]:
result = best_regr_iq.predict(X_test_iq)

In [49]:
result_columns = df_X_test[df_X_test['city']=='iq'][['city','year','weekofyear']]
result_columns.head()

Unnamed: 0,city,year,weekofyear
260,iq,2010,26
261,iq,2010,27
262,iq,2010,28
263,iq,2010,29
264,iq,2010,30


In [52]:
result_columns = df_X_test[df_X_test['city']=='iq'][['city','year','weekofyear']]

In [53]:
result_columns.head()

Unnamed: 0,city,year,weekofyear
260,iq,2010,26
261,iq,2010,27
262,iq,2010,28
263,iq,2010,29
264,iq,2010,30


In [60]:
result_columns.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 260 to 415
Data columns (total 3 columns):
city          156 non-null object
year          156 non-null int64
weekofyear    156 non-null int64
dtypes: int64(2), object(1)
memory usage: 9.9+ KB


In [56]:
result_cases = pd.DataFrame(result)
result_cases.columns = ['total_cases']

In [57]:
result_cases.head()

Unnamed: 0,total_cases
0,8.72618
1,5.432023
2,7.880132
3,5.292099
4,5.504174


In [61]:
result_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 1 columns):
total_cases    156 non-null float64
dtypes: float64(1)
memory usage: 1.3 KB


In [68]:
result_columns = result_columns.reset_index()

In [69]:
result_df = pd.concat([result_columns, result_cases], axis=1, ignore_index=True)

In [75]:
result_df.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,iq,2010,26,8.72618
1,iq,2010,27,5.432023
2,iq,2010,28,7.880132
3,iq,2010,29,5.292099
4,iq,2010,30,5.504174


In [72]:
result_df.columns = [['drop', 'city','year','weekofyear','total_cases']]

In [74]:
result_df = result_df.drop(['drop'], axis=1)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [76]:
result_df['total_cases'] = result_df['total_cases'].round().astype(int)
result_df.to_csv('rnd_forest_grid_time_iq.csv', index=False)

In [46]:
result_cases = result_cases.round().astype(int)

In [47]:
result_columns = df_X_test[df_X_test['city']=='iq'][['city','year','weekofyear']]
result_df = pd.concat([result_columns, result_cases], axis=1)
result_df.to_csv('rnd_forest_grid_time_iq.csv', index=False)