In [1]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

# Lista 2 - Regressão logística, métodos estatísticos, KNN e árvores de decisão

In [2]:
def z_score_norm(data):
    """Standardise ``data`` column-wise to zero mean and unit variance.

    Parameters
    ----------
    data : np.ndarray
        Array whose statistics are computed along axis 0.

    Returns
    -------
    tuple
        ``(normalized_data, norm, inv_norm)`` where ``norm`` applies the
        statistics of ``data`` to any new array and ``inv_norm`` undoes it.
    """
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    # A constant feature has std == 0; dividing by it would fill the column
    # with NaN/inf. Using 1 instead maps such a column to all zeros and keeps
    # inv_norm an exact inverse.
    safe_std = np.where(std == 0, 1.0, std)

    def norm(x):
        return (x - mean) / safe_std

    def inv_norm(norm_x):
        return norm_x * safe_std + mean

    return norm(data), norm, inv_norm

In [3]:
# Error functions
def MSE(Y, Y_hat):
    """Mean of the squared element-wise differences between Y and Y_hat."""
    residual = Y - Y_hat
    return np.mean(np.square(residual))

def RMSE(Y, Y_hat):
    """Square root of the mean squared error between Y and Y_hat."""
    mean_squared = MSE(Y, Y_hat)
    return np.sqrt(mean_squared)

## Load dataset

In [4]:
# Parse the comma-separated file into a 2-D float array.
dataset = np.genfromtxt('./breastcancer.csv', delimiter=',')
# X takes every column except the last; Y takes the last column, indexed with
# a list so it stays 2-D with shape (n_samples, 1).
X, Y = dataset[:, :-1], dataset[:, [-1]]

## Model Evaluation

In [5]:
def model_evaluation_scores(y, y_predict):
    """Compute accuracy, recall, precision and F1 for binary labels in {0, 1}.

    Parameters
    ----------
    y : array-like
        Ground-truth labels; any shape that flattens to ``n`` values.
    y_predict : array-like
        Predicted labels with the same number of values as ``y``.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1_score)``. A ratio whose denominator
        is zero (no positive labels and/or no positive predictions) is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    # Bring both inputs to the same column layout so comparisons are
    # element-wise rather than broadcast into a matrix.
    y_predict = np.asarray(y_predict).reshape(-1, 1)
    y = np.asarray(y).reshape(-1, 1)

    predicted_positive = y_predict == 1
    actual_positive = y == 1

    n_true_positives = int(np.count_nonzero(actual_positive & predicted_positive))
    n_false_positives = int(np.count_nonzero((y == 0) & predicted_positive))
    n_false_negatives = int(np.count_nonzero(actual_positive & (y_predict == 0)))

    def safe_ratio(numerator, denominator):
        # Undefined ratios are scored as 0.0 so a degenerate fold does not
        # abort the whole evaluation.
        return numerator / denominator if denominator else 0.0

    # Accuracy: fraction of predictions equal to the true label.
    accuracy = np.sum(y == y_predict) / len(y_predict)

    # Recall: fraction of actual positives that were found.
    recall = safe_ratio(n_true_positives, n_true_positives + n_false_negatives)

    # Precision: fraction of predicted positives that are correct.
    precision = safe_ratio(n_true_positives, n_true_positives + n_false_positives)

    # F1: harmonic mean of recall and precision.
    f1_score = safe_ratio(2 * (recall * precision), recall + precision)

    return accuracy, recall, precision, f1_score

## Kfold Validation and Score calculation

In [6]:
def kfold_validation(model, x, y, verbose=False, n_folds=10, normalize_data=True):
    """Run shuffled K-fold cross-validation and report per-metric mean and std.

    For every fold the model is fitted on the training split and scored on the
    held-out split with ``model_evaluation_scores``. When ``normalize_data`` is
    true, z-score statistics are computed on the training split only and then
    applied to the test split. The summary is always printed; per-fold scores
    are printed only when ``verbose`` is true.

    Returns
    -------
    tuple
        ``(scores_mean, scores_std)``, each ordered as accuracy, recall,
        precision, F1.
    """
    scores_label = ['Accurracy', 'Recall', 'Precision', 'F1-Score']
    # Fixed seed so the fold assignment is the same on every run.
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_scores = []

    for fold_number, (train_index, test_index) in enumerate(splitter.split(x), start=1):
        x_train, y_train = x[train_index], y[train_index]
        x_test, y_test = x[test_index], y[test_index]

        if normalize_data:
            # Reuse the training-fold statistics on the test fold.
            x_train, apply_norm, _ = z_score_norm(x_train)
            x_test = apply_norm(x_test)

        model.fit(x_train, y_train)
        y_predict = model.predict(x_test)

        acc, recall, precision, f1_score = model_evaluation_scores(y_test, y_predict)
        if verbose:
            print(f'Fold {fold_number}')
            print(f'Accurracy: {acc}')
            print(f'Recall: {recall}')
            print(f'Precision: {precision}')
            print(f'F1-Score: {f1_score}\n')

        fold_scores.append([acc, recall, precision, f1_score])

    fold_scores = np.array(fold_scores)
    scores_mean = fold_scores.mean(axis=0)
    scores_std = fold_scores.std(axis=0)
    for label, mean_value, std_value in zip(scores_label, scores_mean, scores_std):
        print(f'{label}')
        print(f'Mean: {mean_value * 100:.2f}%')
        print(f'Std: {std_value * 100:.2f}%')
        print()
    return scores_mean, scores_std

## Regressão Logística

In [7]:
def sig(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    ``exp`` is only ever called on non-positive values, so large-magnitude
    inputs saturate to 0 or 1 instead of overflowing. Accepts scalars or
    arrays; the output has the same shape as the input.
    """
    x = np.asarray(x, dtype=float)
    # z = exp(-|x|) lies in (0, 1], so it cannot overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    result = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched.
    return result[()]

In [8]:
class LogisticRegression():
    """Binary logistic regression fitted with full-batch gradient descent.

    Parameters
    ----------
    alpha : float
        Step size applied to every gradient update.
    n_epochs : int
        Number of full-batch updates performed by ``fit``.
    add_bias : bool
        When true, a column of ones is prepended to the inputs so the first
        weight acts as the intercept.
    """

    def __init__(self, alpha=.1, n_epochs=1000, add_bias=True):
        self.alpha = alpha
        self.n_epochs = n_epochs
        self.add_bias = add_bias

    def _design_matrix(self, X):
        """Return ``X`` with a leading column of ones when ``add_bias`` is set."""
        if self.add_bias:
            return np.c_[np.ones(len(X)), X]
        return X

    def fit(self, X, Y):
        """Learn the weights ``self.w`` from features ``X`` and 0/1 labels ``Y``.

        ``self.learning_curve`` receives, for every epoch, the MSE between the
        labels and the probabilities predicted before that epoch's update.
        """
        self.x = self._design_matrix(X)
        # Force labels into a column. A 1-D Y would broadcast against the
        # (n, 1) prediction into an (n, n) error matrix and silently train on
        # garbage.
        self.y = np.asarray(Y).reshape(-1, 1)
        self.learning_curve = []
        n = len(self.y)

        # Gradient descent starting from all-zero weights.
        self.w = np.zeros((self.x.shape[1], 1))
        for _ in range(self.n_epochs):
            y_hat = sig(self.x @ self.w)

            # Residual between labels and predicted probabilities.
            e = self.y - y_hat

            # Averaged over the batch; added to w because it is built from
            # (y - y_hat), i.e. it already points in the improving direction.
            gradient = (self.x.T @ e) / n
            self.w += self.alpha * gradient

            # Track the error of the pre-update predictions.
            self.learning_curve.append(MSE(self.y, y_hat))

    def predict(self, X):
        """Return rounded sigmoid outputs (0. or 1.) as a column, one per row of ``X``."""
        return np.round(sig(self._design_matrix(X) @ self.w))

In [9]:
# Hyperparameters of the gradient-descent logistic regression.
alpha, n_epochs = .1, 1000
model = LogisticRegression(alpha=alpha, n_epochs=n_epochs)

# Report the configuration, then cross-validate on the full dataset.
print(f'Modelo: Regressão Logistica\n')
print(f'Hyperparametros')
print(f'  > Passo de Apredizagem: {alpha}')
print(f'  > Quantidade de Iterações: {n_epochs}')
for _ in range(2):
    print()
print('Scores\n')
kfold_validation(model, X, Y);

Modelo: Regressão Logistica

Hyperparametros
  > Passo de Apredizagem: 0.1
  > Quantidade de Iterações: 1000


Scores

Accurracy
Mean: 97.89%
Std: 1.89%

Recall
Mean: 98.84%
Std: 1.43%

Precision
Mean: 97.69%
Std: 2.92%

F1-Score
Mean: 98.24%
Std: 1.67%



## Análise do discriminante Gaussiano

In [10]:
class GaussianDiscriminant():
    """Gaussian discriminant analysis with one full covariance matrix per class.

    Class priors are uniform (``1 / n_classes``); labels are expected to be
    the integers ``0 .. n_classes - 1``.
    """

    def __init__(self, n_classes=2):
        # Honour the requested number of classes (it was hard-coded to 2).
        self.n_classes = n_classes
        self.prob_c = [1/self.n_classes for _ in range(self.n_classes)]

    def _create_prob_function(self, p_ck, det, mean, inv):
        """Build the log-discriminant of one class.

        The returned callable maps an ``(n_samples, n_features)`` array to
        ``log p_ck - 0.5 * log det - 0.5 * (x - mean) inv (x - mean)^T``
        evaluated per row.
        """
        # Both terms are independent of x, so compute them once.
        log_prior = np.log(p_ck)
        half_log_det = .5 * np.log(det)

        def log_score(x):
            centered = np.atleast_2d(x) - mean
            # Row-wise quadratic form. Avoids materialising the full
            # n_samples x n_samples product only to read its diagonal.
            mahalanobis = np.sum((centered @ inv) * centered, axis=1)
            return log_prior - half_log_det - .5 * mahalanobis

        return log_score

    def fit(self, x, y):
        """Estimate mean and covariance of every class from ``x`` and labels ``y``."""
        y = np.asarray(y)
        self.estimators = []
        for i in range(self.n_classes):
            # Rows belonging to class i.
            ck_index = np.nonzero(y == i)[0]
            ck = x[ck_index]

            mean = ck.mean(axis=0)
            # np.cov returns a 0-d array for a single feature; inv/det need 2-D.
            var = np.atleast_2d(np.cov(ck, rowvar=False))
            inv_var = np.linalg.inv(var)
            det = np.linalg.det(var)
            prob_fn = self._create_prob_function(self.prob_c[i], det, mean, inv_var)

            # One log-score function per class, in class order.
            self.estimators.append(prob_fn)

    def _predict_proba(self, x):
        """Return an ``(n_samples, n_classes)`` array of per-class log-scores."""
        return np.array([prob_fn(x) for prob_fn in self.estimators]).T

    def predict(self, x):
        """Return the highest-scoring class index per row, as a column vector."""
        return np.argmax(self._predict_proba(x), axis=1).reshape(-1, 1)

In [11]:
# Cross-validate the Gaussian discriminant classifier on the full dataset.
print(f'Modelo: Discriminante Gaussiano\n')
print('Scores\n')
model = GaussianDiscriminant(n_classes=2)
kfold_validation(model, X, Y);

Modelo: Discriminante Gaussiano

Scores

Accurracy
Mean: 95.77%
Std: 1.99%

Recall
Mean: 97.13%
Std: 2.66%

Precision
Mean: 96.26%
Std: 3.16%

F1-Score
Mean: 96.63%
Std: 1.49%



## Naive Bayes Gaussiano

In [12]:
class GaussianNaiveBayes():
    """Gaussian naive Bayes: independent per-feature Gaussians for each class.

    Class priors are uniform (``1 / n_classes``); labels are expected to be
    the integers ``0 .. n_classes - 1``.
    """

    def __init__(self, n_classes=2):
        # Honour the requested number of classes (it was hard-coded to 2).
        self.n_classes = n_classes
        self.p_ck = [1/self.n_classes for _ in range(self.n_classes)]

    def _create_prob_fn(self, p_ck, mean, var):
        """Build the log-score function of one class.

        ``mean`` and ``var`` are ``(1, n_features)`` arrays; the returned
        callable maps ``(n_samples, n_features)`` to one log-score per row.
        """
        # Terms that do not depend on x are computed once.
        log_prior = np.log(p_ck)
        half_log_norm = .5*np.sum(np.log(2*np.pi*var), axis=1)
        return lambda x: log_prior - half_log_norm - .5*np.sum(((x-mean)**2)/var, axis=1)

    def fit(self, x, y):
        """Estimate per-class feature means and variances from ``x`` and ``y``."""
        y = np.asarray(y)
        self.estimators = []
        for i in range(self.n_classes):
            ck_index = np.nonzero(y == i)[0]
            ck = x[ck_index]

            mean = ck.mean(axis=0, keepdims=True)
            var = ck.var(axis=0, ddof=1, keepdims=True)
            # A feature that is constant within the class has variance 0,
            # which yields log(0) and 0/0 -> NaN scores. Floor it at machine
            # epsilon; non-degenerate variances are unaffected.
            var = np.maximum(var, np.finfo(float).eps)
            prob_fn = self._create_prob_fn(self.p_ck[i], mean, var)

            # One log-score function per class, in class order.
            self.estimators.append(prob_fn)

    def _predict_proba(self, x):
        """Return an ``(n_samples, n_classes)`` array of per-class log-scores."""
        return np.array([prob(x) for prob in self.estimators]).T

    def predict(self, x):
        """Return the highest-scoring class index per row, as a column vector."""
        return np.argmax(self._predict_proba(x), axis=1).reshape(-1, 1)

In [13]:
# Cross-validate the Gaussian naive Bayes classifier on the full dataset.
print('Modelo: Nayve Bayes Gaussiano\n')
print('Scores\n')
model = GaussianNaiveBayes(n_classes=2)
kfold_validation(model, X, Y);

Modelo: Nayve Bayes Gaussiano

Scores

Accurracy
Mean: 93.14%
Std: 3.39%

Recall
Mean: 95.16%
Std: 3.96%

Precision
Mean: 94.13%
Std: 2.55%

F1-Score
Mean: 94.59%
Std: 2.46%



## KNN

In [14]:
class KNN():
    """k-nearest-neighbours classifier with majority voting.

    Neighbours are ranked by squared Euclidean distance. Labels are expected
    to be the integers ``0 .. n_classes - 1`` (float-typed labels such as
    ``0.`` / ``1.`` are accepted). Vote ties go to the lowest class index.
    """

    def __init__(self, n_classes=2, n_neighbors=3):
        if n_neighbors < 1:
            raise ValueError('n_neighbors must be at least 1')
        self.n_classes = n_classes
        self.n_neighbors = n_neighbors

    def fit(self, x, y):
        """Store the training set and precompute each sample's squared norm."""
        self.x = x
        self.y = y
        self.dist_const = (self.x**2).sum(axis=1)

    def predict(self, x):
        """Return the majority class among the nearest neighbours of each row.

        The result is a float column vector of shape ``(n_samples, 1)``.
        """
        # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, for all pairs at once.
        dist_matrix = -2 * x @ self.x.T + (x**2).sum(axis=1, keepdims=True) + self.dist_const
        # kth = k - 1 puts the k smallest distances in the first k columns and
        # stays in bounds when k equals the number of training samples.
        neighbors_index = np.argpartition(dist_matrix, self.n_neighbors - 1, axis=1)[:, :self.n_neighbors]
        labels = np.asarray(self.y).reshape(-1)
        neighbors_classes = labels[neighbors_index]
        # Count votes per class. Averaging and rounding the labels is only
        # meaningful for labels {0, 1} and ignored n_classes.
        votes = (neighbors_classes[:, :, np.newaxis] == np.arange(self.n_classes)).sum(axis=1)
        return votes.argmax(axis=1).astype(float).reshape(-1, 1)

In [15]:
# Hyperparameters of the k-nearest-neighbours classifier.
n_classes, n_neighbors = 2, 3
model = KNN(n_classes=n_classes, n_neighbors=n_neighbors)

# Report the configuration, then cross-validate on the full dataset.
print(f'Modelo: KNN\n')
print(f'Hyperparametros')
print(f'  > Quantidade de Vizinhos: {n_neighbors}')
for _ in range(2):
    print()
print('Scores\n')
kfold_validation(model, X, Y);

Modelo: KNN

Hyperparametros
  > Quantidade de Vizinhos: 3


Scores

Accurracy
Mean: 96.30%
Std: 2.16%

Recall
Mean: 99.17%
Std: 1.71%

Precision
Mean: 95.16%
Std: 2.51%

F1-Score
Mean: 97.10%
Std: 1.66%



## Árvore de decisão

In [16]:
# Impurity measure used by the scikit-learn decision tree; the seed is fixed
# so the fitted tree is reproducible.
criterion = 'gini'
model = DecisionTreeClassifier(random_state=42, criterion=criterion)

# Report the configuration, then cross-validate on the full dataset.
print('Modelo: Árvore de Decisão\n')
print(f'Hyperparametros')
print(f'  > Indice de Impureza: {criterion}')
for _ in range(2):
    print()
print('Scores\n')
kfold_validation(model, X, Y);

Modelo: Árvore de Decisão

Hyperparametros
  > Indice de Impureza: gini


Scores

Accurracy
Mean: 93.50%
Std: 3.59%

Recall
Mean: 94.62%
Std: 3.92%

Precision
Mean: 94.97%
Std: 2.81%

F1-Score
Mean: 94.78%
Std: 3.12%

