Implementación centralizada de un algoritmo de Machine Learning

In [15]:
import numpy as np

def readFile(filename):
    """Load a comma-delimited numeric text file into a NumPy array.

    Args:
        filename: Location of the file, forwarded unchanged to ``np.loadtxt``.

    Returns:
        The array built by ``np.loadtxt`` with ``,`` as the field delimiter.
    """
    loaded = np.loadtxt(fname=filename, delimiter=",")
    return loaded

def _calculate_mean(numpy_Xy):
    # Calcula la media de cada columna (axis=0) desde x0 a x10, 
    # Se excluye del cálculo la etiqueta (columna 11).
    return np.mean(numpy_Xy[:, :-1], axis=0)

def _calculate_desvest(numpy_Xy):
    return np.std(numpy_Xy[:, :-1], axis=0)

def normalize(numpy_Xy):
    """Standardize the feature columns to zero mean and unit deviation.

    Every column except the last one (the label) is replaced by
    ``(x - mean) / std``. The array is modified IN PLACE and the same
    object is returned, so existing callers that rely on either the
    mutation or the return value keep working.

    Args:
        numpy_Xy: 2-D array whose last column holds the labels.

    Returns:
        ``numpy_Xy`` itself, with its feature columns standardized.
    """
    mean = _calculate_mean(numpy_Xy)
    desvest = _calculate_desvest(numpy_Xy)
    # A constant column has a standard deviation of 0; dividing by it would
    # fill the column with NaN/inf. Dividing by 1 instead leaves the centred
    # (all-zero) values untouched.
    safe_desvest = np.where(desvest == 0, 1.0, desvest)
    # Only the feature columns are rewritten; the labels are not normalized.
    numpy_Xy[:, :-1] = (numpy_Xy[:, :-1] - mean) / safe_desvest
    return numpy_Xy

def _model(W,b,numpy_Xy):
    # Son los cálculos del modelo lineal, f(xi) = sumatorio X*W + b 
    return numpy_Xy[:, :-1] @ W + b

def _sigmoid(z):
    return 1 / (1 + np.exp(-z))

def _cost_function_J(label, sigmoid):
    m = label.shape[0]
    return -np.sum(label * np.log(sigmoid) + (1 - label) * np.log(1 - sigmoid)) / m

def _convert_to_y_pred(value, threshold):
    return np.where(value > threshold, 1, 0)

def predict(W, b, numpy_Xy):
    """Predict a 0/1 class for every row of ``numpy_Xy``.

    The linear model output is passed through the sigmoid and the result
    is binarized with a fixed cut-off of 0.5.

    Args:
        W: Weight vector of the model.
        b: Bias term of the model.
        numpy_Xy: 2-D array of samples; the last column is the label.

    Returns:
        Array of 0/1 predictions, one per row.
    """
    decision_threshold = 0.5
    probabilities = _sigmoid(_model(W, b, numpy_Xy))
    return _convert_to_y_pred(probabilities, decision_threshold)

def accuracy(W, b, numpy_Xy):
    """Return the fraction of rows whose prediction equals its label.

    Args:
        W: Weight vector of the model.
        b: Bias term of the model.
        numpy_Xy: 2-D array of samples; the last column holds the labels.

    Returns:
        Number of correct predictions divided by the number of rows.
    """
    predictions = predict(W, b, numpy_Xy)
    targets = numpy_Xy[:, -1]  # labels
    hits = np.sum(targets == predictions)
    total = numpy_Xy.shape[0]
    return hits / total

def train(numpy_Xy, iterations, learning_rate):
    """Fit a logistic-regression model with batch gradient descent.

    Weights and bias start as uniform random values in [-1, 1). Each
    iteration computes the predictions for the whole dataset, takes one
    gradient step, and prints the cost of the predictions made *before*
    that step.

    The weight vector is sized from the number of feature columns in the
    data, so datasets with any number of features are supported.

    Args:
        numpy_Xy: 2-D array of samples; the last column holds the labels.
        iterations: Number of gradient-descent steps to run.
        learning_rate: Step size applied to both gradients.

    Returns:
        Tuple ``(W, b)``: the weight vector (one entry per feature) and
        the bias as a 1-element array.
    """
    m = numpy_Xy.shape[0]
    data = numpy_Xy[:, :-1]  # features
    y_label = numpy_Xy[:, -1]  # labels

    # One weight per feature column, instead of a hard-coded 11.
    n_features = data.shape[1]
    W = 2 * np.random.rand(n_features) - 1
    b = 2 * np.random.rand(1) - 1

    for _ in range(iterations):
        z = _model(W, b, numpy_Xy)
        y_hat = _sigmoid(z)

        # The prediction error drives both gradients.
        error = y_hat - y_label
        dW = (data.T @ error) / m
        db = np.sum(error) / m

        W = W - learning_rate * dW
        b = b - learning_rate * db

        # Cost of the predictions computed before this update.
        cost = _cost_function_J(y_label, y_hat)
        print('Cost = ', cost)

    return W, b


# Dataset location and training hyper-parameters.
# Alternative dataset path, kept for reference:
# FILE_NAME = '../../0-SPAI/1-datos/botnet_tot_syn_l.csv'
FILE_NAME = 'botnet_sample.csv'
LEARNING_RATE = 1.5
N_ITER = 10

# Load the raw samples from disk.
data_raw = readFile(FILE_NAME)

# Standardize the feature columns.
normal_data = normalize(data_raw)

# Fit the model; `ws` keeps the (W, b) pair, which is also unpacked here.
W, b = ws = train(normal_data, iterations=N_ITER, learning_rate=LEARNING_RATE)

# Report the accuracy measured on the same data used for training.
acc = accuracy(W, b, normal_data)
print('Accuracy = ', acc)

Cost =  1.01963780925884
Cost =  0.6433273773578205
Cost =  0.46796209411428663
Cost =  0.37987923017913744
Cost =  0.32921535488171993
Cost =  0.29675923171757523
Cost =  0.2743834233609105
Cost =  0.2581379546125231
Cost =  0.24586574503587766
Cost =  0.23629467118118533
Accuracy =  0.908
