<a href="https://colab.research.google.com/github/Camouflage10/disease-spread-model/blob/Camouflage10-bagging/disease_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix, recall_score, precision_score
import seaborn as sn
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold


In [2]:
kfoldruns=10

In [3]:
def getData():
  x = pd.read_csv('train_x.csv')
  y = pd.read_csv('dengue_labels_train.csv')
  y=y.drop(columns=['city','year','weekofyear'])
  test =pd.read_csv('test.csv')


  x['year']=x['year'].astype(int)
  x['city'].replace(['sj', 'iq'],[0, 1], inplace=True)
  x[['year2', 'month', 'day']] = x['week_start_date'].str.split('-', expand=True)
  x['month']=x['month'].astype(int)
  x['day']=x['day'].astype(int)
  x=x.drop(columns=['year2','week_start_date'])
  
  test['year']=test['year'].astype(int)
  test['city'].replace(['sj', 'iq'],[0, 1], inplace=True)
  test[['year2', 'month', 'day']] = test['week_start_date'].str.split('-', expand=True)
  test['month']=test['month'].astype(int)
  test['day']=test['day'].astype(int)
  test=test.drop(columns=['year2','week_start_date'])

  return x, y, test

In [4]:
#tremove features with a corrilation of less than abs(mincorr)
def removeFeatures(x,y,test,minCorr):
  x['total_cases']=y['total_cases']
  corrMatrix = x.corr()
  lastCol=corrMatrix['total_cases']
  x=x.drop(columns=['total_cases'])
  print(lastCol)
  features=list(lastCol.loc[abs(lastCol) > minCorr].index)
  features.remove('total_cases')
  x=x[features]
  test=test[features]
  corrMatrix = x.corr()
  print(len(x.columns))
  return x,test

In [5]:
def getParams(x_train, y_train, model, hyperbag):
  model = model()
  #Applying GridSearchCV to get the best value for hyperparameters
  gridbag = GridSearchCV(model, hyperbag, scoring='neg_mean_absolute_error', cv = 2, verbose = 1, n_jobs = -1)
  gridbag.fit(x_train, y_train)
  print(gridbag.best_params_)
  return gridbag.best_params_, pd.DataFrame(gridbag.cv_results_)

In [6]:

def getPreds(x_train, x_test, y_train, y_test, model):
  temp=sklearn.base.clone(model)
  temp.fit(x_train,y_train)
  preds = temp.predict(x_test)
  preds=[round(num) for num in preds]
  rmse = np.sqrt(mean_squared_error(y_test, preds))

  #MSE and RMSE
  mse = mean_squared_error(y_test, preds)
  print("MSE: %.2f" % mse)
  rmse = np.sqrt(mean_squared_error(y_test, preds))
  mae= mean_absolute_error(y_test,preds)
  print("RMSE: %f" % (rmse))
  print("MaE: %f" % (mae))
  bag2=sklearn.base.clone(bag)
  #accuracy score round preds
  round_preds=[round(num) for num in preds]
  accuracy = accuracy_score(y_test, round_preds)
  print("Accuracy: %.2f%%" % (accuracy * 100.0))
  return temp,preds

In [7]:
#K-Fold cross-val
from sklearn.model_selection import RepeatedKFold
def kFoldValues(x,y,model,n):
  rkf = RepeatedKFold(n_splits=2, n_repeats=n)
  bagMae=[]
  for train_index, test_index in rkf.split(x,y):
    x_traint, x_testt= x.iloc[train_index], x.iloc[test_index]
    y_traint, y_testt =y.iloc[train_index], y.iloc[test_index]
    temp=sklearn.base.clone(model)
    temp.fit(x_traint,y_traint)
    preds = temp.predict(x_testt)
    mae= mean_absolute_error(y_testt,preds)
    bagMae.append(mae)

  bagMae=np.array(bagMae)
  print(bagMae)
  #sd and mean of mae
  return np.std(bagMae), np.mean(bagMae)


In [8]:
from pyparsing import results
testmincorr=[.1,.15,.2,.24,.26,.28,.3]
col=["mincorr","#features","bag_mae_std", "bag_mae_mean", "forest_mae_std", "forest_mae_mean"]
df=pd.DataFrame(columns=col)
bags=[]
forests=[]
for n in testmincorr:
  x,y,test1=getData()
  x, test1=removeFeatures(x,y,test1,n)
  features=int(len(x.columns))
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
  hyperbag={'n_estimators': [75,85,90],'max_features':[3,4,5]}
  bagparam,results=getParams(x_train, y_train, BaggingRegressor, hyperbag)
  hyperbag={'n_estimators': [75,85,175],'max_features':[3,4,5,6], 'max_depth': [5,10,20]}
  forestparam, results=getParams(x_train, y_train, RandomForestRegressor, hyperbag)
  bag = BaggingRegressor(n_estimators=bagparam['n_estimators'],
                       max_features= bagparam['max_features'],)
  forest=RandomForestRegressor(max_features=forestparam['max_features'],
                             n_estimators=forestparam['n_estimators'],
                             max_depth=forestparam['max_depth'])
  bags.append(bag)
  forests.append(forest)
  bagstd, bagmean=kFoldValues(x,y,bag, kfoldruns)
  foreststd, forestmean=kFoldValues(x,y,forest, kfoldruns)
  df.loc[len(df)]=[n,features,bagstd,bagmean,foreststd,forestmean]
df


Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
station_diur_temp_rng_c                 -0.235323
station_min_temp_c                       0.259204
city                                    -0.292624
year                                    -0.306806
month                                    0.215737


  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 90}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 20, 'max_features': 6, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[14.5195966  14.85866148 15.88669872 13.44629918 14.52854099 14.78499186
 15.03646215 16.1571107  14.96008932 12.3448199  13.70576923 14.35828733
 16.29696276 14.44733059 13.1208486  17.11473189 14.54886065 14.65556967
 15.10711233 13.29633518]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[15.214639   12.77675494 13.55647733 13.75926311 12.94360032 14.44679712
 14.31424718 13.04709947 14.41029973 13.00663291 13.3047126  13.39559873
 14.10234829 13.06262855 12.13331829 14.69809415 14.50240011 12.6297811
 13.12464737 14.23820234]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
statio

  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 85}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 10, 'max_features': 6, 'n_estimators': 75}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[13.15850032 13.46934139 13.77895981 12.25468056 14.29260935 12.72502963
 15.64196078 11.8625404  13.71911765 13.55600356 13.616532   12.88167349
 13.17340013 13.43082048 13.12097985 13.54575711 13.38341952 13.35654897
 13.62184066 13.77626104]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[12.61365024 14.69281131 13.51560214 12.25207113 13.66634467 12.17376947
 12.28514905 13.39706331 13.63636773 13.41110405 12.99731646 14.02507003
 13.8965049  11.81372121 13.92254805 12.25505462 13.26126001 13.01164893
 12.34444956 13.58606591]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 85}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 10, 'max_features': 4, 'n_estimators': 85}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[13.16186167 12.9806892  12.91917825 11.79477833 12.91994182 14.0802293
 13.87607755 13.98615061 12.5357214  13.22814156 13.30967549 12.89931319
 13.19562729 12.29916451 13.4450808  12.63637093 13.20463943 14.29896305
 14.48775664 11.62228668]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[12.86935636 14.18851056 12.24794592 13.74493664 12.71483139 12.97652755
 12.13698592 14.04465215 13.55013557 12.41139104 14.64095162 12.16711138
 13.15044347 13.10961294 12.87048633 13.55482324 11.95771909 13.81244375
 12.65873382 13.69081784]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

  return column_or_1d(y, warn=True)


{'max_features': 5, 'n_estimators': 85}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 10, 'max_features': 6, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[12.88579509 11.61760396 12.34478022 12.99903601 11.19984217 13.32257595
 11.71848739 14.02860375 12.25623519 11.73787977 11.77836512 12.0636999
 13.2281351  11.90608786 13.42467679 12.50501778 11.4260418  14.01047188
 12.50937529 12.42201034]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[13.2967447  13.05764084 11.62351579 13.46328003 13.0569165  12.23431466
 12.70118709 11.12509624 12.11710363 12.3416226  12.18324724 11.35521671
 11.71378721 13.34577847 13.93086946 11.19520654 11.27176273 13.73192338
 11.84422116 11.78450529]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

  return column_or_1d(y, warn=True)


{'max_features': 3, 'n_estimators': 90}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


18 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'max_depth': 20, 'max_features': 3, 'n_estimators': 75}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[11.30156805 11.6136248  10.78100213  9.34205641 10.56380734 11.05285244
 10.95497546 10.64797621  9.65614188 10.21447779 10.52676402 10.04111043
 11.33380117 10.922208   11.67537452 10.52619447 12.09625442 10.11626733
  9.96776117 11.71704684]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[ 9.9322788  10.61110911 11.11213205  9.8379203  10.40947433  9.51032336
  9.77245997 11.38097963 11.16085332  9.88294545  9.92519491 11.76357613
  9.36877843 12.11811386 10.11421159  9.29035974 11.83514589  9.27216989
  9.71255246 10.20525221]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
stati

6 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 342, in _fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

  -9.97185229          nan          nan 

{'max_features': 4, 'n_estimators': 75}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


36 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'max_depth': 20, 'max_features': 4, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[ 8.50684982  8.25049451  7.4678022   7.92974359  6.95326007  7.52967033
  8.68826007  8.82835165  9.17472527  7.29600733  8.86304029  6.95642857
  8.31029304  6.49701465  8.31294872  8.77712454  8.32302198  7.39584249
  6.61217949 10.09998168]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[8.397062   6.48857049 9.18622093 7.04415205 8.08385153 7.62530992
 7.80015176 7.13991209 7.77946761 7.33223089 7.96689633 7.63435138
 7.80162938 7.78259926 8.33322827 6.95069229 7.37139146 9.31655034
 7.56536462 8.33149125]
Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
station_diur_temp_rng_c  

12 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_bagging.py", line 342, in _fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

         nan         nan         nan]


{'max_features': 3, 'n_estimators': 90}
Fitting 2 folds for each of 36 candidates, totalling 72 fits


54 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'max_depth': 20, 'max_features': 3, 'n_estimators': 175}


  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


[8.03002137 7.66895604 7.7204823  8.12931929 7.28952991 7.63101343
 7.77528999 7.38626374 8.32617521 8.03788156 9.75700549 7.16289683
 6.54600122 9.02626679 7.86836081 8.92248168 7.49024725 7.73856838
 7.58893468 8.76135531]


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.

[7.5621753  7.80528595 7.56397918 7.39982829 9.01630265 6.96618556
 8.00775899 9.27360637 7.62567362 7.48457868 8.48812998 7.3240522
 8.8238385  7.47633022 8.31250354 7.3453235  7.46364636 8.61134247
 8.00993765 7.36830849]


Unnamed: 0,mincorr,#features,bag_mae_std,bag_mae_mean,forest_mae_std,forest_mae_mean
0,0.1,20.0,1.120181,14.658754,0.796184,13.633377
1,0.15,15.0,0.739607,13.418299,0.752948,13.137879
2,0.2,13.0,0.743568,13.144082,0.737933,13.124921
3,0.24,8.0,0.807047,12.469236,0.873005,12.368697
4,0.26,5.0,0.717136,10.752563,0.874782,10.360792
5,0.28,4.0,0.911772,8.038652,0.678876,7.796556
6,0.3,3.0,0.71471,7.942853,0.629679,7.896439


In [9]:
bag_index=df[['bag_mae_mean']].idxmin()
forest_index=df[['forest_mae_mean']].idxmin() 
bag_model=sklearn.base.clone(bags[int(bag_index)])
forest_model=sklearn.base.clone(forests[int(forest_index)])
b_minCorr=float(df.iloc[bag_index]['mincorr'])
f_minCorr=float(df.iloc[forest_index]['mincorr'])
if float(df.iloc[bag_index]['bag_mae_mean'])< float(df.iloc[forest_index]['forest_mae_mean']):
  #use min corr and model of index
  best_index=bag_index
  best_minCorr=b_minCorr
  model=sklearn.base.clone(bag_model)
else:
   best_index=forest_index
   best_minCorr=f_minCorr
   model=sklearn.base.clone(forest_model)

In [10]:
#create metrics.json (optional metrics.txt also)
row=df.iloc[best_index]
value = {
        "bag_mae_std": float(row["bag_mae_std"]),
        "bag_mae_mean": float(row["bag_mae_mean"]),
        "forest_mae_std": float(row["forest_mae_std"]),
        "forest_mae_mean": float(row["forest_mae_mean"])
    }
with open('metrics.json', 'w') as outfile:
  outfile.write(json.dumps(value))

In [11]:
b_x, test1=removeFeatures(x,y,test1,b_minCorr)
f_x, test1=removeFeatures(x,y,test1,f_minCorr)
x_train1, x_test1, y_train1, y_test1 = train_test_split(b_x, y, test_size=0.33)
x_train2, x_test2, y_train2, y_test2 = train_test_split(f_x, y, test_size=0.33)
beg,bagpreds=getPreds(x_train1, x_test1, y_train1, y_test1, bag_model)
forest,forestpreds=getPreds(x_train2, x_test2, y_train2, y_test2, forest_model)
bagpreds=np.array(bagpreds)
forestpreds=np.array(forestpreds)
y_test=np.array(y_test1)
y_test=y_test.reshape(bagpreds.shape)

bagerror=y_test-bagpreds
foresterror=y_test-forestpreds

Unnamed: 0                  -0.333340
reanalysis_min_air_temp_k    0.318676
year                        -0.306806
total_cases                  1.000000
Name: total_cases, dtype: float64
3
Unnamed: 0                  -0.333340
reanalysis_min_air_temp_k    0.318676
year                        -0.306806
total_cases                  1.000000
Name: total_cases, dtype: float64
3


  return column_or_1d(y, warn=True)


MSE: 116.76
RMSE: 10.805404
MaE: 6.332640
Accuracy: 10.40%


  This is separate from the ipykernel package so we can avoid doing imports until


ValueError: ignored

In [12]:
#visualization to see if it looks like it matches

#error=error.reshape(error.shape[0]*error.shape[1],1)
x_ax = range(len(y_test1))
plt.tight_layout()
fig1 = plt.gcf()
plt.hist(bagerror, bins='auto', alpha=.5, label='bag')
plt.hist(foresterror, bins='auto', alpha=.5, label='forest')
plt.title('error dist')
plt.xlabel('error')
plt.ylabel('count')
plt.legend()
plt.show()

#create a graph called stats.png
fig1.savefig('stats.png', dpi=100)

NameError: ignored

<Figure size 432x288 with 0 Axes>

In [13]:
x,y,test=getData()
x, test=removeFeatures(x,y,test,best_minCorr)
model.fit(x,y)



Unnamed: 0                              -0.333340
weekofyear                               0.216452
ndvi_ne                                 -0.210937
ndvi_nw                                 -0.184179
ndvi_se                                 -0.232385
ndvi_sw                                 -0.249576
reanalysis_air_temp_k                    0.258385
reanalysis_avg_temp_k                    0.146057
reanalysis_dew_point_temp_k              0.135880
reanalysis_max_air_temp_k               -0.190789
reanalysis_min_air_temp_k                0.318676
reanalysis_relative_humidity_percent    -0.132312
reanalysis_specific_humidity_g_per_kg    0.124578
reanalysis_tdtr_k                       -0.278156
station_avg_temp_c                       0.113804
station_diur_temp_rng_c                 -0.235323
station_min_temp_c                       0.259204
city                                    -0.292624
year                                    -0.306806
month                                    0.215737


  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(max_depth=20, max_features=4, n_estimators=175)

In [14]:
#submit predictions fo test
#first convert float array to int
pred = model.predict(test)
pred=[int(round(num)) for num in pred]
test_original =pd.read_csv('test.csv')
#test_original =pd.read_csv('dengue_features_test.csv')
#then make data frame for predicted data
submission=pd.DataFrame()
submission[['city', 'year', 'weekofyear']]=test_original[['city', 'year', 'weekofyear']]
submission['total_cases']=pred
#submission.sort_values(["city","year","weekofyear"],axis=0, ascending=True, inplace=True, na_position='first')
submission.to_csv('submission.csv', index=False)