In [237]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, matthews_corrcoef, recall_score, confusion_matrix
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from scipy.special import softmax
from sklearn.svm import SVC


In [238]:
# function for classification metrics + confusion matrix
def report (y_test, preds):
    stats = pd.DataFrame()  
    stats['Precision'] = [precision_score(y_test, preds)] 
    stats['Recall'] = [recall_score(y_test, preds)]
    stats['F1'] = [f1_score(y_test, preds)]
    stats['Matthews'] = [matthews_corrcoef(y_test, preds)]
    stats['Accuracy'] = [accuracy_score(y_test, preds)]
    display(stats)
    display(pd.DataFrame(confusion_matrix(y_test, preds), columns=['PP', 'PN'], index=['P', 'N']))

# 2 - Implement a Weighted Average Ensemble System in Python.

In [239]:
# The data
df = datasets.load_breast_cancer()
y = df.target
X = df.data

In [240]:
print("nº of 1's = " + str((df["target"] == 1).sum()))
print("nº of 0's = " + str((df["target"] == 0).sum()))

nº of 1's = 357
nº of 0's = 212


- set seems to be unabalanced


- The Procedure:
1) split and scale data:
    - split train / validation
    - split train once more to obtain test set
    - scale
2) train each expert on train set
    - predict with validation to get each accuracy
    apply soft max formula to each accuracy to  get the weights 
4) predict experts with test set
    - aply dot prod to this preds and weights
5) compare with the true labels to get the accuracy of the ensembled


- This process will be divided into 3 main phases for a better organizations

## 1st phase

- Spliting

In [241]:
#1st split of Data: To obtain last preds - last phase - (test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=222)
#2nd split of Data: For training and obtain 1st preds to get weights with softmax - first phase - (validation)
X_train_01, X_valid, y_train_01, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=22)

- Scaling

In [242]:
# Lets fit a scaler to the training set
scaler = StandardScaler()
scaler.fit(X_train_01)
# Lets transform the X's
X_train_01 = scaler.transform(X_train_01)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

- Training some experts

In [243]:
# Decision Trees
dt_gini = DecisionTreeClassifier(criterion="gini").fit(X_train_01, y_train_01) # critierion = Gini
dt_entropy = DecisionTreeClassifier(criterion="entropy").fit(X_train_01, y_train_01) # critierion = entropy
# Support Vector Machine
svc_rbf = SVC(kernel = "rbf").fit(X_train_01, y_train_01) # kernel = RBF
svc_poly = SVC(kernel = "poly").fit(X_train_01, y_train_01) # kernel = poly
# Logistic Regression
logreg_default = LogisticRegression(C = 1.0, max_iter = 10000).fit(X_train_01, y_train_01) # C = 1.0
logreg_C = LogisticRegression(C=.01,max_iter = 10000).fit(X_train_01, y_train_01) # C = .01


# 2nd phase

- With the experts trained we are going to obtain the predictions and accuracys using the validation set

In [244]:
models = [dt_gini, dt_entropy, svc_rbf, svc_poly, logreg_default, logreg_C]

def pred_acc(models, data, labels, pred=False, acc=False, mcc=False):
    accuracy_scores = []
    predictions = []
    mcc_scores = []
    
    # loop through various experts obtaining predictions and accuracies
    for model in models:
        preds_model = model.predict(data)
        accuracy_model = accuracy_score(labels, preds_model)
        mcc_model = matthews_corrcoef(labels, preds_model)
        predictions.append(preds_model)
        accuracy_scores.append(accuracy_model)
        mcc_scores.append(mcc_model)
    
    if pred:
        return predictions
    elif acc:
        return accuracy_scores
    elif mcc:
        return mcc_scores

In [245]:
# The list with the fited models
models = [dt_gini, dt_entropy, svc_rbf, svc_poly, logreg_default, logreg_C]

# Get Accuracys
acc = pred_acc(models, X_valid, y_valid, pred = False, acc = True, mcc = False)
acc

[0.9252336448598131,
 0.9158878504672897,
 0.9813084112149533,
 0.9158878504672897,
 0.9813084112149533,
 0.9719626168224299]

- The next step is to obtain the weights corresponding to each expert using softmax function: 
    - $ g_k = \frac{ exp(\mu_k) }{ \sum\limits_{j=1}^{K}exp(\mu_j) }, k = 1,2,...,K $


In [246]:
weights = (np.exp(acc)/np.sum(np.exp(acc)))
weights

array([0.16274487, 0.16123097, 0.17213147, 0.16123097, 0.17213147,
       0.17053025])

In [247]:
accuracys = acc
weights = weights
models 

df = pd.DataFrame({'Model': models, 'Accuracy of experts': accuracys, 'Weights': weights})
df_sorted_acc = df.sort_values(by='Accuracy of experts', ascending=False)
print(df_sorted_acc)

                                         Model  Accuracy of experts   Weights
2                                        SVC()             0.981308  0.172131
4           LogisticRegression(max_iter=10000)             0.981308  0.172131
5   LogisticRegression(C=0.01, max_iter=10000)             0.971963  0.170530
0                     DecisionTreeClassifier()             0.925234  0.162745
1  DecisionTreeClassifier(criterion='entropy')             0.915888  0.161231
3                           SVC(kernel='poly')             0.915888  0.161231


- we can observe that the highest weight is atributed to the expert with the highest accuracy value, in this case SVC with rbf kernel and and logistic regression with C = 1.0

## 3th phase

- obtain new preds from experts this time with the test set

In [248]:
test_preds = pred_acc(models, X_test, y_test, pred = True, mcc = False, acc= False)

- Now we need to make a dot product between the test set predictions and and weights obtained with softmax
    - Given a set of experts {f1, f2, . . . , fn}, with accuracy scores {a1, a2, . . . , an}, and input X, the
output of the system becomes:
        - round(softmax([a1, a2, . . . , an]) · [f1(X), f2(X), . . . , fn(X)])

In [249]:
# Get new preds with dot product, rounded values
final_preds = np.round(np.dot(weights, test_preds))
print("This are the final predicionts: " + str(final_preds))

This are the final predicionts: [1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1.
 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1.]


- The final step is to evaluate the quality of the ensembled model. To do this we will compare the "final_preds" with the true labels and get classification metrics

In [250]:
report(y_test, final_preds)

Unnamed: 0,Precision,Recall,F1,Matthews,Accuracy
0,0.967033,1.0,0.98324,0.956183,0.979021


Unnamed: 0,PP,PN
P,52,3
N,0,88


- as we saw in the begining the dataset can be considered unbalanced (nº of 1's = 357; nº of 0's = 212) so we will compare the matthews_corrcoef which is a more adequate measure for the model quality, given this type of dataset

In [251]:
# matthews_corrcoef for experts
mcc_experts = pred_acc(models, X_valid, y_valid, pred = False, acc = False, mcc = True)

experts_mcc = mcc_experts
models 

# Dataframes with MCC and ACC for Experts
df = pd.DataFrame({'Model': models, 'MCC of expert models': experts_mcc})
df_sorted_mcc = df.sort_values(by='MCC of expert models', ascending=False)
print("                                      Ensembled model MCC = 0.970681")
print(df_sorted_mcc)
print("")
print("                                Ensembled model Accuracy = 0.986014")
print(df_sorted_acc[["Model", "Accuracy of experts"]])

                                      Ensembled model MCC = 0.970681
                                         Model  MCC of expert models
4           LogisticRegression(max_iter=10000)              0.962130
2                                        SVC()              0.961119
5   LogisticRegression(C=0.01, max_iter=10000)              0.942645
0                     DecisionTreeClassifier()              0.846285
3                           SVC(kernel='poly')              0.832595
1  DecisionTreeClassifier(criterion='entropy')              0.825856

                                Ensembled model Accuracy = 0.986014
                                         Model  Accuracy of experts
2                                        SVC()             0.981308
4           LogisticRegression(max_iter=10000)             0.981308
5   LogisticRegression(C=0.01, max_iter=10000)             0.971963
0                     DecisionTreeClassifier()             0.925234
1  DecisionTreeClassifier(criterion='en

- the model with the highest matthews_corrcoef value is the Logistic Regression (Default) with 0.962130 which is samler than the obtained with the ensembled model, 0.970681.
- the model with the highest accuracy value is the Support vector machine (Default) with 0.981308 which is samler than the obtained with the ensembled model, 0.986014


## Conclusion
- It is possible to observe that by using a divide and conquer principle with just six experts, both metrics, accuracy and the matthews_corrcoef, which is a more adequate measure of the quality of this dataset given that is unbalanced, were higher when compared with the ones from each of the individuals models.
- Besides better metrics, another advantadge of using ensembled average models is that a model with lower variance is obtained, reducing the risk of overfitng when compared with a single model.
- One possible disavantage of this type of models is conected with the fact that the accuracy level of the emsembled model is derived from a weighted average which may leed to a  loss of  interpretation capability given that we cannot obtain the original values.
