In [33]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# Add the parent directory to the import path so that the `src` package can be
# resolved (presumably the notebook is run from a sub-folder of the project).
sys.path.append("../")
# NOTE(review): the wildcard import hides where names come from. load_data,
# WellProcessor and run_model, used further down, are presumably defined in
# src.data — confirm, and consider importing them explicitly.
from src.data import *

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb

In [34]:
# Load the feature table (X) and the target labels (y) with the project's load_data helper.
X, y = load_data()

In [35]:
# Hold out 20% of the rows as a test split; random_state=42 makes the split repeatable.
# NOTE(review): the split is not stratified, and the class supports reported later are
# very uneven (851 vs 6457 test rows) — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## Preprocessing data for XGBoost and LightGBM

In [36]:
# Create the project's preprocessing object (WellProcessor presumably comes from src.data — confirm).
wp = WellProcessor()

In [37]:
# Fit the processor on the training split only, then replace X_train with its
# transformed version.
# NOTE(review): X_train is overwritten, so re-running this cell would feed
# already-transformed data back into fit/transform.
wp.fit(X_train)
X_train = wp.transform(X_train)

In [38]:
# Apply the processor that was fitted on the training split to the test split (no refit here).
X_test = wp.transform(X_test)

## XGBoost

Because XGBoost is one of the strongest models currently available, if not the strongest, I wanted to start with it to see what kind of results it can give as a benchmark.

In [39]:
# Baseline model: XGBoost with its default hyperparameters.
XGB = XGBClassifier()
XGB.fit(X_train, y_train)

# Class predictions for the training split and the held-out split, in that order.
training_preds_xgb, test_preds_xgb = map(XGB.predict, (X_train, X_test))

In [40]:
# Per-class precision, recall and F1 for the XGBoost predictions on the held-out test split.
print(classification_report(y_test, test_preds_xgb))

              precision    recall  f1-score   support

           0       0.84      0.72      0.78      4572
           1       0.64      0.25      0.36       851
           2       0.77      0.91      0.84      6457

    accuracy                           0.79     11880
   macro avg       0.75      0.63      0.66     11880
weighted avg       0.79      0.79      0.78     11880



While XGBoost gave great results with default hyperparameters, it took a minimum of 12 minutes to run. I wanted to compare its results to more efficient models.

## Decision Tree Classifier

My next model was a Decision Tree. I wanted something to compare to XGBoost that was a little simpler but could still deal with complex models. I also wanted a model that could execute faster than XGBoost.

In [4]:
# Decision-tree baseline, scored through the shared run_model helper.
rn_dtc = run_model(X_train, y_train, DecisionTreeClassifier())

0.7634231192054719


In [5]:
# Display the best_score_ attribute of the object returned by run_model
# (presumably a fitted scikit-learn search object — confirm in src.data).
rn_dtc.best_score_

0.7634231192054719

We had a nice score from our Decision Tree Classifier, but not as high as our XGBoost.

## Random Forest Classifier

While my Decision Tree Classifier gave good results, I wanted to try a more complex Random Forest Classifier to compare it to. I had great success with Random Forest in previous work, so it seemed like a natural choice.

In [56]:
# Random forest with default settings, scored through the shared run_model helper.
rn_rfc = run_model(X_train, y_train, RandomForestClassifier())

0.7831470454707473


In [57]:
# Display the best_score_ attribute of the random-forest run returned by run_model.
rn_rfc.best_score_

0.7831470454707473

My Random Forest gave a better score than my Decision Tree, which was great, but the improvement wasn't as large as I was hoping. It was also slower than my Decision Tree.

## Naive Bayes

While the Random Forest was a success, I wanted to see if we could improve our scores by a greater margin. One of the biggest challenges with our data is that the dataset is so large. Naive Bayes models are reported to work better with larger datasets, so I wanted to use one next.

In [9]:
# Gaussian Naive Bayes, scored through the shared run_model helper.
rn_gnb = run_model(X_train, y_train, GaussianNB())

0.1799257700085921


In [10]:
# Display the best_score_ attribute of the Naive Bayes run returned by run_model.
rn_gnb.best_score_

0.1799257700085921

Huge drop in score (about 0.18, versus about 0.78 for the Random Forest). The dream has ended; hope is gone.

## LightGBM

After a miserable attempt with a Naive Bayes model, I did some research into how to make our current best model even better. LightGBM is reported to be as competitive as XGBoost but with much faster training times.

In [43]:
# Instantiate LGBMClassifier with its default hyperparameters
lgbm = lgb.LGBMClassifier()

# Fit LGBMClassifier on the processed training split
lgbm.fit(X_train, y_train)

# Predict on training and test sets
training_preds_lgbm = lgbm.predict(X_train)
test_preds_lgbm = lgbm.predict(X_test)

In [44]:
# Per-class precision, recall and F1 for the LightGBM predictions on the held-out test split.
print(classification_report(y_test, test_preds_lgbm))

              precision    recall  f1-score   support

           0       0.84      0.71      0.77      4572
           1       0.65      0.23      0.34       851
           2       0.76      0.91      0.83      6457

    accuracy                           0.79     11880
   macro avg       0.75      0.62      0.65     11880
weighted avg       0.79      0.79      0.77     11880



## Hyperparameter Tuning of Our Best Model

In [52]:
# Candidate hyperparameter values for tuning the random forest.
# The `estimator__` prefix is scikit-learn's nested-parameter syntax; it presumably
# targets a component named `estimator` inside whatever run_model builds —
# confirm against run_model in src.data.
# NOTE(review): every candidate constrains the trees far more than the
# RandomForestClassifier defaults (max_depth=None, min_samples_split=2,
# min_samples_leaf=1, max_leaf_nodes=None), so the grid never includes a
# configuration close to the untuned model.
param_grid = {
    'estimator__max_depth': [5,6,7,8],
    'estimator__min_samples_split': [500,1000,1500],
    'estimator__max_leaf_nodes': [10,25,50],
    'estimator__min_samples_leaf': [100, 250]
}

In [49]:
# List the hyperparameter names a RandomForestClassifier accepts, as a reference
# when writing the tuning grid. A fresh estimator is inspected so the cell does
# not depend on a leftover kernel variable and survives Restart & Run All.
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [53]:
# Tune the random forest over param_grid using the same run_model helper.
rn_rfc_bp = run_model(
    X_train,
    y_train,
    RandomForestClassifier(),
    param_grid=param_grid,
)

0.5196142021514092


In [55]:
# Show the parameter combination selected by the tuning run.
rn_rfc_bp.best_params_

{'estimator__max_depth': 8,
 'estimator__max_leaf_nodes': 50,
 'estimator__min_samples_leaf': 100,
 'estimator__min_samples_split': 500}

## Saving Y Preds

In [66]:
# Predict on the held-out test split with rn_rfc, the random-forest run that used
# default settings (not the tuned rn_rfc_bp run).
y_pred_rfc = rn_rfc.predict(X_test)

In [67]:
# Wrap the predictions in a DataFrame that reuses y_test's column labels and row
# index, so each prediction lines up with its true label.
# Relies on y_test being a DataFrame, since it reads y_test.columns.
y_pred_df = pd.DataFrame(data=y_pred_rfc, columns=y_test.columns, index=y_test.index)

In [69]:
# Print the current working directory, to check what the relative output path resolves against.
!pwd

/Users/William/Documents/Flatiron/Project 3/Tanzania-Well-Repair-Predictor/data_exploration


In [70]:
# Write the predictions to the data folder one level above the working directory.
# The row index is written as well, because to_csv's default index=True is not overridden.
y_pred_df.to_csv('../data/best_y_preds.csv')