In [130]:
#!pip install ruamel.yaml



In [131]:
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv2D, MaxPooling2D, Flatten
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix, recall_score, precision_score
import seaborn as sn
from ruamel.yaml import YAML
import json

In [132]:
def load_params():
    "Updates FULL_PARAMS with the values in params.yaml and returns all as a dictionary"
    yaml = YAML(typ="safe")
    with open("params.yaml") as f:
        params = yaml.load(f)
    return params

In [133]:
from matplotlib import text
x_train = pd.read_csv ('dengue_features_train.csv', na_values=' NAN')
y_train=pd.read_csv('dengue_labels_train.csv', na_values=' NAN')
test =pd.read_csv ('dengue_features_test.csv', na_values=' NAN')
test=test.drop(columns=['city', 'year', 'week_start_date'])
y_train=y_train.drop(columns=['city', 'year'])
x_train=x_train.drop(columns=['city', 'year', 'week_start_date'])

In [134]:
#data analysis
minCorr = .2
x_train['total_cases']=y_train['total_cases']
corrMatrix = x_train.corr()
lastCol=corrMatrix.iloc[:,-1]
print(lastCol)
features=list(lastCol.loc[abs(lastCol) > minCorr].index)
print(features)
x_train=x_train[features]
corrMatrix = x_train.corr()

weekofyear                               0.216452
ndvi_ne                                 -0.241376
ndvi_nw                                 -0.202235
ndvi_se                                 -0.168612
ndvi_sw                                 -0.196461
precipitation_amt_mm                    -0.038740
reanalysis_air_temp_k                    0.264952
reanalysis_avg_temp_k                    0.151637
reanalysis_dew_point_temp_k              0.142531
reanalysis_max_air_temp_k               -0.191345
reanalysis_min_air_temp_k                0.325252
reanalysis_precip_amt_kg_per_m2         -0.010031
reanalysis_relative_humidity_percent    -0.132452
reanalysis_sat_precip_amt_mm            -0.038740
reanalysis_specific_humidity_g_per_kg    0.129861
reanalysis_tdtr_k                       -0.278483
station_avg_temp_c                       0.116109
station_diur_temp_rng_c                 -0.237844
station_max_temp_c                      -0.039219
station_min_temp_c                       0.267109


In [135]:
#preprocessing
#remove NaN
x_train['total_cases']=y_train['total_cases']
x_train=x_train.dropna()
test=test.dropna()
y_train=x_train['total_cases']
x_train=x_train.drop(columns=['total_cases'])
#separate again

In [136]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, KFold

#X, y = x_train.iloc[:,:-1],y_train.iloc[:,-1]
X_train, X_test, Y_train, y_test = train_test_split(x_train, y_train, test_size=0.33)

#bagging model

dt_model = DecisionTreeRegressor()
X, y = make_regression(n_samples=1000, n_informative=15, noise=0.1, random_state=1)
bagModel = BaggingRegressor(dt_model, n_estimators=22, max_features=1.0)
bagModel.fit(X_train, Y_train)
preds = bagModel.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

#scoring the model
score = bagModel.score(X_train, Y_train)
print("Training Score: ", score)

#cross-val score
scores = cross_val_score(bagModel, X_train, Y_train, cv=10)
print("Mean cross-val score: %.2f" % scores.mean())

#K-Fold cross-val
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(bagModel, X_train, Y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

#MSE and RMSE
preds = bagModel.predict(X_test)
preds = [round(value) for value in preds]
mse = mean_squared_error(y_test, preds)
print("MSE: %.2f" % mse)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

#accuracy score
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

RMSE: 24.346916
Training Score:  0.877670896983071
Mean cross-val score: 0.12
K-fold CV average score: 0.09
MSE: 593.53
RMSE: 24.362420
Accuracy: 2.00%


In [137]:
bagModel.get_params()

{'base_estimator__ccp_alpha': 0.0,
 'base_estimator__criterion': 'squared_error',
 'base_estimator__max_depth': None,
 'base_estimator__max_features': None,
 'base_estimator__max_leaf_nodes': None,
 'base_estimator__min_impurity_decrease': 0.0,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_weight_fraction_leaf': 0.0,
 'base_estimator__random_state': None,
 'base_estimator__splitter': 'best',
 'base_estimator': DecisionTreeRegressor(),
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 22,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [119]:
#Fitting Bagging Classifier model with default hyper parameters
bagg = BaggingClassifier()
bagg.fit(x_train,y_train)
pred_bagg = bagg.predict(X_test)

#Checking different metrics for bagging model with default hyper parameters
print('Checking different metrics for bagging model with default hyper parameters:\n')
print("Training accuracy: ",bagg.score(X_train,Y_train))
acc_score = accuracy_score(y_test, pred_bagg)
print('Testing accuracy: ',acc_score)

#Setting values for the parameters
n_estimators = [100, 300, 500, 800, 1200]
#max_depth = [5, 10, 15, 25, 30]
max_samples = [5, 10, 25, 50, 100]
max_features = [1, 2, 5, 10, 13]

#Creating a dictionary for the hyper parameters
hyperbag = dict(n_estimators = n_estimators, max_samples = max_samples, 
              max_features = max_features)

#Applying GridSearchCV to get the best value for hyperparameters
gridbag = GridSearchCV(bagg, hyperbag, cv = 3, verbose = 1, n_jobs = -1)
bestbag = gridbag.fit(x_train, y_train)

#Printing the best hyperparameters
print('The best hyper parameters are:\n',gridbag.best_params_)

Checking different metrics for bagging model with default hyper parameters:

Training accuracy:  0.9950617283950617
Testing accuracy:  0.99
Fitting 3 folds for each of 125 candidates, totalling 375 fits


150 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/heathercornell/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/heathercornell/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 269, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/Users/heathercornell/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 342, in _fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: m

The best hyper parameters are:
 {'max_features': 2, 'max_samples': 25, 'n_estimators': 500}


In [120]:
#Fitting the bagging model with the best hyper parameters obtained through GridSearchCV
bagg1 = BaggingClassifier(max_features=10, max_samples=25,n_estimators= 100)
bagg1.fit(X_train,Y_train)
pred_bagg1 = bagg1.predict(x_test)

ValueError: max_features must be in (0, n_features]

In [None]:
#visualization to see if it looks like it matches
x_ax = range(len(y_test))
plt.plot(x_ax, y_test, label="original")
plt.plot(x_ax, preds, label="predicted")
plt.title("Predicted vs Original data")
plt.legend()
plt.show()

In [None]:
#create a graph called stats.png
plt.tight_layout()
fig1 = plt.gcf()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
fig1.savefig('stats.png', dpi=100)

In [None]:
#submit predictions fo test
def submit(model, test):
  pred=model.predict(x)