<b>Dependencies:</b> <br>
    import random <br>
    import pandas as pd <br><br>
    from sklearn.ensemble import GradientBoostingClassifier <br>
    from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit <br>
    from sklearn.metrics import accuracy_score <br>
    from sklearn.metrics import precision_score <br>
    from sklearn.metrics import recall_score <br>
    from sklearn.metrics import f1_score <br>


In [3]:
# GRADIENT BOOSTING FEATURE SELECTION 80-20

%matplotlib inline
import pandas as pd

# Models
from sklearn.ensemble import GradientBoostingClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# Seed handed to the classifier's random_state.
random_seed = 3

# Columns used as model inputs in this experiment.
selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Training and test partitions are loaded from two separate CSV files
# (the file names indicate an 80% / 20% split).
diabetes_training = pd.read_csv('../datasets/diabetes_train_data_80pc.csv')
diabetes_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

X_train, y_train = diabetes_training[selected_features], diabetes_training['Outcome']
X_test, y_test = diabetes_test[selected_features], diabetes_test['Outcome']

# Fit on the training partition, then predict the test partition.
model = GradientBoostingClassifier(random_state=random_seed)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Train/Test Split after Feature Selection")
# One-row summary table.
# NOTE: the 'Precision ' label ends with a space; it is kept as-is so the
# printed table does not change.
results_train_test = pd.DataFrame(
    data={
        'Model': ["GB"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")



Train/Test Split after Feature Selection
  Model  Accuracy  Precision     Recall        f1
0    GB  0.744828    0.653061  0.615385  0.633663 




In [1]:
# AVERAGE GRADIENT BOOSTING FEATURE SELECTION 80-20

%matplotlib inline
import pandas as pd
import random

# Models
from sklearn.ensemble import GradientBoostingClassifier

# Evaluation methods
from sklearn.model_selection import train_test_split

# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


def _mean(values):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


# Columns used as model inputs in this experiment.
selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
X, y = diabetes_cleaned[selected_features], diabetes_cleaned['Outcome']

# Each list collects one score per repetition.
accuracy, precision, recall, f1 = [], [], [], []

times_repeated = 100
for _ in range(times_repeated):
    # A new seed is drawn every repetition and shared by the split and the model.
    random_seed = random.randint(0, 1000)

    # 80/20 split, stratified on the Outcome labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_seed)

    model = GradientBoostingClassifier(random_state=random_seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Record this repetition's scores on its test split.
    for scores, metric in ((accuracy, accuracy_score),
                           (precision, precision_score),
                           (recall, recall_score),
                           (f1, f1_score)):
        scores.append(metric(y_test, y_pred))


print("Average Train/Test Split after Feature Selection")
# One-row summary table holding the mean of every metric over all repetitions.
# NOTE: the 'Precision ' label ends with a space; it is kept as-is so the
# printed table does not change.
results_train_test = pd.DataFrame(
    data={
        'Model': ["GB"],
        'Accuracy': [_mean(accuracy)],
        'Precision ': [_mean(precision)],
        'Recall': [_mean(recall)],
        'f1': [_mean(f1)],
    }
)
print(results_train_test, "\n\n")



Average Train/Test Split after Feature Selection
  Model  Accuracy  Precision   Recall        f1
0    GB  0.764138    0.679176  0.6064  0.638127 




In [3]:
# GRADIENT BOOSTING DATA AUGMENTATION 80-20

%matplotlib inline
import pandas as pd

# Models
from sklearn.ensemble import GradientBoostingClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# Seed handed to the classifier's random_state.
random_seed = 3

# All eight predictor columns are used in this experiment.
features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]

# Training data comes from the "100times_10" training CSV (presumably the
# augmented set, going by the cell title); the test CSV is the same 20% file
# used by the other train/test cells.
diabetes_da_training = pd.read_csv('../datasets/diabetes_train_data_80pc_100times_10.csv')
diabetes_da_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

X_train, y_train = diabetes_da_training[features], diabetes_da_training['Outcome']
X_test, y_test = diabetes_da_test[features], diabetes_da_test['Outcome']

# Fit on the training partition, then predict the test partition.
model = GradientBoostingClassifier(random_state=random_seed)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Train/Test Split after Data Augmentation")
# One-row summary table.
# NOTE: the 'Precision ' label ends with a space; it is kept as-is so the
# printed table does not change.
results_train_test = pd.DataFrame(
    data={
        'Model': ["GB"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")



Train/Test Split after Data Augmentation
  Model  Accuracy  Precision     Recall      f1
0    GB  0.793103        0.75  0.634615  0.6875 




In [7]:
# GRADIENT BOOSTING FEATURE SELECTION + DATA AUGMENTATION 80-20

%matplotlib inline
import pandas as pd

# Models
from sklearn.ensemble import GradientBoostingClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# Seed handed to the classifier's random_state.
random_seed = 3

# Columns used as model inputs in this experiment.
selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Training data comes from the "100times_10" training CSV (presumably the
# augmented set, going by the cell title); the test CSV is the same 20% file
# used by the other train/test cells.
diabetes_da_training = pd.read_csv('../datasets/diabetes_train_data_80pc_100times_10.csv')
diabetes_da_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

X_train, y_train = diabetes_da_training[selected_features], diabetes_da_training['Outcome']
X_test, y_test = diabetes_da_test[selected_features], diabetes_da_test['Outcome']

# Fit on the training partition, then predict the test partition.
model = GradientBoostingClassifier(random_state=random_seed)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Train/Test Split after Feature Selection and Data Augmentation")
# One-row summary table.
# NOTE: the 'Precision ' label ends with a space; it is kept as-is so the
# printed table does not change.
results_train_test = pd.DataFrame(
    data={
        'Model': ["GB"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")



Train/Test Split after Feature Selection and Data Augmentation
  Model  Accuracy  Precision     Recall        f1
0    GB  0.786207    0.733333  0.634615  0.680412 




In [4]:
# GRADIENT BOOSTING PARAMETER TUNING

%matplotlib inline
import pandas as pd

# Models
from sklearn.ensemble import GradientBoostingClassifier

# Parameter Tuning
from sklearn.model_selection import GridSearchCV, ShuffleSplit

# Using the 80% set for training (60%) and validation (20%), saving the remaining 20% for test.
# Only the training CSV is read here; the test CSV is not touched during tuning.
diabetes_cleaned = pd.read_csv('../datasets/diabetes_train_data_80pc.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = diabetes_cleaned[features]
y = diabetes_cleaned.Outcome

# Single seed shared by the validation split and the estimator so that the
# search returns the same best parameters on every run.
random_seed = 3


# Train test split equivalent, test_size refers to the validation set (0.80 * 0.25) == (1 * 0.20).
# random_state fixes which rows end up in the validation set.
shuffle_split = ShuffleSplit(n_splits=1, test_size=0.25, random_state=random_seed)

# Grid of candidate values; every combination is evaluated by GridSearchCV.
hyperparameters = {
    "n_estimators": [1, 2, 4, 8, 16, 32, 64, 100, 200, 300, 500],
    "max_depth": [1, 2, 4, 8, 16, 32],
    "learning_rate": [0.01, 0.05, 0.1, 0.3, 0.5, 1],
}

# Candidates are ranked by F1 on the single validation split.
# The estimator is seeded as well, so repeated fits of one candidate are identical.
grid = GridSearchCV(GradientBoostingClassifier(random_state=random_seed), hyperparameters,
                    cv=shuffle_split, scoring='f1')
grid.fit(X, y)

# NOTE: with the seed applied, the selected parameters may differ from those of
# an earlier, unseeded run.
print("Best parameters: ", grid.best_params_)



Best parameters:  {'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 32}


In [3]:
# GRADIENT BOOSTING FEATURE SELECTION + DATA AUGMENTATION + PARAMETER TUNING 80-20

%matplotlib inline
import pandas as pd

# Models
from sklearn.ensemble import GradientBoostingClassifier

# Evaluation methods
# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: False Negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


# Seed handed to the classifier's random_state.
random_seed = 3

# Columns used as model inputs in this experiment.
selected_features = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Training data comes from the "100times_10" training CSV (presumably the
# augmented set, going by the cell title); the test CSV is the same 20% file
# used by the other train/test cells.
diabetes_da_training = pd.read_csv('../datasets/diabetes_train_data_80pc_100times_10.csv')
diabetes_da_test = pd.read_csv('../datasets/diabetes_test_data_20pc.csv')

X_train, y_train = diabetes_da_training[selected_features], diabetes_da_training['Outcome']
X_test, y_test = diabetes_da_test[selected_features], diabetes_da_test['Outcome']

# NOTE(review): these hyperparameters (learning_rate=0.5, max_depth=4,
# n_estimators=16) are not the ones printed as "Best parameters" by the
# parameter-tuning cell in this file (learning_rate=0.3, max_depth=2,
# n_estimators=32) -- confirm which set is intended.
model = GradientBoostingClassifier(
    learning_rate=0.5,
    max_depth=4,
    n_estimators=16,
    random_state=random_seed,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Train/Test Split after Feature Selection, Data Augmentation and Parameter Tuning")
# One-row summary table.
# NOTE: the 'Precision ' label ends with a space; it is kept as-is so the
# printed table does not change.
results_train_test = pd.DataFrame(
    data={
        'Model': ["GB"],
        'Accuracy': [accuracy],
        'Precision ': [precision],
        'Recall': [recall],
        'f1': [f1],
    }
)
print(results_train_test, "\n\n")



Train/Test Split after Feature Selection, Data Augmentation and Parameter Tuning
  Model  Accuracy  Precision     Recall        f1
0    GB  0.793103     0.73913  0.653846  0.693878 


