<a href="https://colab.research.google.com/github/Camouflage10/disease-spread-model/blob/Camouflage10-bagging/disease_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix, recall_score, precision_score
import seaborn as sn
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold


In [18]:
kfoldruns=10

In [19]:
#function for retrieving data
def getData():
  x = pd.read_csv('train_x.csv')
  y = pd.read_csv('dengue_labels_train.csv')
  y=y.drop(columns=['city','year','weekofyear'])
  test =pd.read_csv('test.csv')

  x['year']=x['year'].astype(int)
  x['city'].replace(['sj', 'iq'],[0, 1], inplace=True)
  x[['year2', 'month', 'day']] = x['week_start_date'].str.split('-', expand=True)
  x['month']=x['month'].astype(int)
  x['day']=x['day'].astype(int)
  x=x.drop(columns=['year2','week_start_date'])
  
  test['year']=test['year'].astype(int)
  test['city'].replace(['sj', 'iq'],[0, 1], inplace=True)
  test[['year2', 'month', 'day']] = test['week_start_date'].str.split('-', expand=True)
  test['month']=test['month'].astype(int)
  test['day']=test['day'].astype(int)
  test=test.drop(columns=['year2','week_start_date'])

  return x, y, test

In [20]:
#tremove features with a corrilation of less than abs(mincorr)
def removeFeatures(x,y,test,minCorr):
  x['total_cases']=y['total_cases']
  corrMatrix = x.corr()
  lastCol=corrMatrix['total_cases']
  x=x.drop(columns=['total_cases'])
  print(lastCol)
  features=list(lastCol.loc[abs(lastCol) > minCorr].index)
  features.remove('total_cases')
  x=x[features]
  test=test[features]
  corrMatrix = x.corr()
  print(len(x.columns))
  return x,test

In [21]:
def getParams(x_train, y_train, model, hyperbag):
  model = model()
  #Applying GridSearchCV to get the best value for hyperparameters
  gridbag = GridSearchCV(model, hyperbag, scoring='neg_mean_absolute_error', cv = 2, verbose = 1, n_jobs = -1)
  gridbag.fit(x_train, y_train)
  print(gridbag.best_params_)
  return gridbag.best_params_, pd.DataFrame(gridbag.cv_results_)

In [22]:

def getPreds(x_train, x_test, y_train, y_test, model):
  temp=sklearn.base.clone(model)
  temp.fit(x_train,y_train)
  preds = temp.predict(x_test)
  preds=[round(num) for num in preds]
  rmse = np.sqrt(mean_squared_error(y_test, preds))

  #MSE and RMSE
  mse = mean_squared_error(y_test, preds)
  print("MSE: %.2f" % mse)
  rmse = np.sqrt(mean_squared_error(y_test, preds))
  mae= mean_absolute_error(y_test,preds)
  print("RMSE: %f" % (rmse))
  print("MaE: %f" % (mae))
  bag2=sklearn.base.clone(bag)
  #accuracy score round preds
  round_preds=[round(num) for num in preds]
  accuracy = accuracy_score(y_test, round_preds)
  print("Accuracy: %.2f%%" % (accuracy * 100.0))
  return temp,preds

In [23]:
#K-Fold cross-val
from sklearn.model_selection import RepeatedKFold
def kFoldValues(x,y,model,n):
  rkf = RepeatedKFold(n_splits=2, n_repeats=n)
  bagMae=[]
  for train_index, test_index in rkf.split(x,y):
    x_traint, x_testt= x.iloc[train_index], x.iloc[test_index]
    y_traint, y_testt =y.iloc[train_index], y.iloc[test_index]
    temp=sklearn.base.clone(model)
    temp.fit(x_traint,y_traint)
    preds = temp.predict(x_testt)
    mae= mean_absolute_error(y_testt,preds)
    bagMae.append(mae)

  bagMae=np.array(bagMae)
  print(bagMae)
  #std and mean of mae
  return np.std(bagMae), np.mean(bagMae)


In [25]:
from pyparsing import results
testmincorr=[.1,.15,.2,.24,.26,.28,.3,.32,.35]
col=["mincorr","#features","bag_mae_std", "bag_mae_mean", "forest_mae_std", "forest_mae_mean"]
df=pd.DataFrame(columns=col)
bags=[]
forests=[]
for n in testmincorr:
  x,y,test1=getData()
  x, test1=removeFeatures(x,y,test1,n)
  features=int(len(x.columns))
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
  hyperbag={'n_estimators': [75,85,90],'max_features':[3,4,5]}
  bagparam,results=getParams(x_train, y_train, BaggingRegressor, hyperbag)
  hyperbag={'n_estimators': [75,85,175],'max_features':[3,4,5,6], 'max_depth': [5,10,20]}
  forestparam, results=getParams(x_train, y_train, RandomForestRegressor, hyperbag)
  bag = BaggingRegressor(n_estimators=bagparam['n_estimators'],
                       max_features= bagparam['max_features'],)
  forest=RandomForestRegressor(max_features=forestparam['max_features'],
                             n_estimators=forestparam['n_estimators'],
                             max_depth=forestparam['max_depth'])
  bags.append(bag)
  forests.append(forest)
  bagstd, bagmean=kFoldValues(x,y,bag, kfoldruns)
  foreststd, forestmean=kFoldValues(x,y,forest, kfoldruns)
  df.loc[len(df)]=[n,features,bagstd,bagmean,foreststd,forestmean]
df


Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
station_diur_temp_rng_c                 -0.235323
station_min_temp_c                       0.259204
city                                    -0.292624
year                                    -0.306806
month                                    0.215737


  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 85}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 20, 'max_features': 6, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[15.1240627  13.75898432 15.24429237 15.68133484 15.28672888 14.50647836
 14.69329347 13.55279977 15.49277057 14.24031459 14.46983926 13.57257595
 14.56877559 14.71383426 14.6466225  14.97372318 15.58180874 14.62329239
 14.12161239 15.20136932]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[14.90165428 12.76113614 13.3519418  15.80619343 13.4711063  13.32234697
 14.41467731 13.06288652 12.49722774 14.1860291  13.15724011 14.27881475
 13.25464324 13.69117743 13.1985767  14.42159724 14.54285914 13.2225457
 14.35299107 12.91169954]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
statio

  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 85}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 10, 'max_features': 4, 'n_estimators': 85}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[14.42487341 14.50007945 13.6720125  14.65746606 14.28096181 12.47747845
 14.12007111 13.12924531 15.05193627 12.2663016  13.55971693 13.78894635
 14.44109432 13.70882434 12.98910795 13.21899375 14.73004453 13.63587589
 13.35444374 14.68584357]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[14.04867818 12.53172426 12.56178361 14.39139821 14.06684357 13.14025274
 12.70151496 15.24304442 12.0368042  14.37220893 13.55819419 14.01634237
 15.08287241 12.01440937 13.41100142 13.45130197 12.80877297 14.34977619
 13.80404367 13.85597246]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 90}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 10, 'max_features': 6, 'n_estimators': 85}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[13.64713395 12.86222527 11.42064764 13.15149217 12.84689459 12.41880088
 12.82520753 12.77620574 13.17738786 12.97614469 13.31473113 12.99474359
 12.70806827 12.41356511 12.91739164 12.33092058 13.5989825  14.20176536
 12.62036706 12.48584605]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[12.15422436 12.89074017 13.19596911 12.22457793 14.26859548 12.00045593
 12.86386101 12.21748772 12.52150533 12.82589768 12.48284611 13.57120946
 11.68642066 14.08707862 11.81564603 13.51339327 12.8873821  13.75866889
 13.87634656 12.6964064 ]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 75}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 10, 'max_features': 6, 'n_estimators': 85}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[11.08781624 14.06263736 11.67790293 13.23815476 11.97031746 14.71410256
 11.46660256 13.44269994 12.13418071 12.34135531 12.38939194 12.59918803
 13.53702752 12.20539499 11.9952793  11.69386905 13.01421245 10.95210012
 12.55403846 13.09324176]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[12.2240733  13.33639552 11.98469148 12.4391318  12.39274471 12.4446585
 13.22308306 12.99243026 13.0950383  12.10100558 12.23783728 12.98671358
 12.88770893 12.8952801  12.40721767 12.42916295 13.64898539 11.96336699
 12.09329113 12.97976012]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
statio

  return column_or_1d(y, warn=True)


{'max_features': 3, 'n_estimators': 75}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


18 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'max_depth': 20, 'max_features': 3, 'n_estimators': 85}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[10.04895548 10.62167464 10.02181974  9.27452603  9.95405335 10.45453661
 10.62459333 11.74849241 10.49316484 11.45489959 10.28748993 12.49051281
  9.96320523 11.16065507  9.74101483 11.322617   11.09111288 10.12359521
  9.80610951 10.81382958]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[10.16383738 10.31188171  9.93776459 10.5639405  10.58784358 10.51023527
 10.98815387  9.49647454 10.78885309 10.73364085 10.5290307  10.27169808
 10.4526231   9.4081603  11.16073371  8.7449515   9.75928328  9.64881189
 10.40169527  9.67373389]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

6 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 342, in _fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

 -11.79307781          nan          nan 

{'max_features': 3, 'n_estimators': 75}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


36 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'max_depth': 20, 'max_features': 3, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[ 8.88851301  9.81576563  7.87337943  8.12366734  7.85698646  8.37876129
 10.26678825  7.44623215  8.1163532   9.15709942  7.19691074  9.03824565
  8.2297152   9.59712693  8.32480615  8.35185542  9.14109136  8.47494751
  9.19909395  8.75299128]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[7.59636364 8.82101202 8.02794655 7.86205317 7.64503629 7.85461036
 7.93559673 7.35178753 8.37391985 7.82937432 7.56778738 8.04494277
 8.34548113 7.72357945 7.37853508 7.8005608  7.99022647 7.76226324
 7.28549588 8.28393574]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
station_diur_temp_rng_c  

12 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 342, in _fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

         nan         nan         nan]


{'max_features': 3, 'n_estimators': 90}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


54 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'max_depth': 10, 'max_features': 3, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[7.34230769 8.43194444 7.73461538 7.70677656 7.1581044  8.02530525
 7.61826923 9.06517094 6.81121795 8.25531136 7.2092033  8.52330586
 7.65053419 7.74551282 8.10421245 7.08980464 8.12454212 7.76085165
 8.83460012 7.41584249]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[8.92214661 7.61575786 7.90870026 8.40959895 8.14598548 8.79280128
 8.3719782  7.44153094 7.74964522 7.80580972 8.75483552 7.38623877
 6.63209499 8.97555656 8.19119418 6.74946158 8.33639432 8.17050239
 7.59898932 7.63145069]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
station_diur_temp_rng_c  

18 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 342, in _fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

  return column_or_1d(y, warn=True)


ValueError: ignored

In [None]:
bag_index=df[['bag_mae_mean']].idxmin()
forest_index=df[['forest_mae_mean']].idxmin() 
bag_model=sklearn.base.clone(bags[int(bag_index)])
forest_model=sklearn.base.clone(forests[int(forest_index)])
b_minCorr=float(df.iloc[bag_index]['mincorr'])
f_minCorr=float(df.iloc[forest_index]['mincorr'])
if float(df.iloc[bag_index]['bag_mae_mean'])< float(df.iloc[forest_index]['forest_mae_mean']):
  #use min corr and model of index
  best_index=bag_index
  best_minCorr=b_minCorr
  model=sklearn.base.clone(bag_model)
else:
   best_index=forest_index
   best_minCorr=f_minCorr
   model=sklearn.base.clone(forest_model)

In [None]:
#create metrics.json (optional metrics.txt also)
row=df.iloc[best_index]
value = {
        "bag_mae_std": float(row["bag_mae_std"]),
        "bag_mae_mean": float(row["bag_mae_mean"]),
        "forest_mae_std": float(row["forest_mae_std"]),
        "forest_mae_mean": float(row["forest_mae_mean"])
    }
with open('metrics.json', 'w') as outfile:
  outfile.write(json.dumps(value))

In [None]:
b_x, test1=removeFeatures(x,y,test1,b_minCorr)
f_x, test1=removeFeatures(x,y,test1,f_minCorr)
x_train1, x_test1, y_train1, y_test1 = train_test_split(b_x, y, test_size=0.33)
x_train2, x_test2, y_train2, y_test2 = train_test_split(f_x, y, test_size=0.33)
beg,bagpreds=getPreds(x_train1, x_test1, y_train1, y_test1, bag_model)
forest,forestpreds=getPreds(x_train2, x_test2, y_train2, y_test2, forest_model)
bagpreds=np.array(bagpreds)
forestpreds=np.array(forestpreds)
y_test=np.array(y_test1)
y_test=y_test.reshape(bagpreds.shape)

bagerror=y_test-bagpreds
foresterror=y_test-forestpreds

In [None]:
#visualization to see if it looks like it matches

#error=error.reshape(error.shape[0]*error.shape[1],1)
x_ax = range(len(y_test1))
plt.tight_layout()
fig1 = plt.gcf()
plt.hist(bagerror, bins='auto', alpha=.5, label='bag')
plt.hist(foresterror, bins='auto', alpha=.5, label='forest')
plt.title('error dist')
plt.xlabel('error')
plt.ylabel('count')
plt.legend()
plt.show()

#create a graph called stats.png
fig1.savefig('stats.png', dpi=100)

In [None]:
x,y,test=getData()
x, test=removeFeatures(x,y,test,best_minCorr)
model.fit(x,y)



In [None]:
#submit predictions fo test
#first convert float array to int
pred = model.predict(test)
pred=[int(round(num)) for num in pred]
test_original =pd.read_csv('test.csv')
#test_original =pd.read_csv('dengue_features_test.csv')
#then make data frame for predicted data
submission=pd.DataFrame()
submission[['city', 'year', 'weekofyear']]=test_original[['city', 'year', 'weekofyear']]
submission['total_cases']=pred
#submission.sort_values(["city","year","weekofyear"],axis=0, ascending=True, inplace=True, na_position='first')
submission.to_csv('submission.csv', index=False)

In [None]:
stds, means = kFoldValues(x,y,model,kfoldruns)
print(stds)
print(means)