Implementación distribuida y paralelizada de un algoritmo de Machine Learning

In [37]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

import numpy as np
from pyspark import RDD
from typing import Tuple 


def readFile(filename):
    """Load a comma-separated text file as an RDD of NumPy float vectors.

    The file is read through the module-level ``contex_global``
    SparkContext, which must be defined before this function is called.

    Args:
        filename: Path handed to ``contex_global.textFile``.

    Returns:
        An RDD with one 1-D ``np.ndarray`` of floats per input line.
    """
    def _parse_line(line: str) -> np.ndarray:
        # Every comma-separated field is converted to float; the record is
        # returned as an ndarray so it can be sliced and used in arithmetic.
        return np.array([float(field) for field in line.split(',')])

    return contex_global.textFile(filename).map(_parse_line)


def _calculate_mean(rdd: RDD, total_rows: int, n_features: int = 11) -> np.ndarray:
    """Return the per-column mean of the first ``n_features`` columns.

    The feature slice is taken in a ``map`` step *before* reducing. Slicing
    only inside the reduce function is not enough: ``RDD.reduce`` never
    calls that function when the RDD holds a single record, so the raw row
    (label included) would come back and the result would have the wrong
    length.

    Args:
        rdd: RDD whose records are 1-D arrays with the feature columns first.
        total_rows: Divisor of the mean (the caller passes ``rdd.count()``).
        n_features: Number of leading columns to average. Defaults to 11,
            the value hard-coded in the rest of this file.

    Returns:
        Array of length ``n_features`` with the column sums divided by
        ``total_rows``.

    Raises:
        ValueError: Propagated from ``RDD.reduce`` when ``rdd`` is empty.
    """
    features = rdd.map(lambda row: row[:n_features])
    return features.reduce(lambda acc, row: acc + row) / total_rows

def _calculate_stdev(rdd:RDD, mean:np.ndarray, total_rows:int) -> np.ndarray:
    """Return the standard deviation of the first 11 columns of ``rdd``.

    The squared deviations from ``mean`` are summed with a reduce, divided
    by ``total_rows`` (no Bessel correction) and square-rooted.

    Args:
        rdd: RDD whose records are 1-D arrays with the features first.
        mean: Per-column mean of those 11 leading columns.
        total_rows: Divisor applied to the summed squared deviations.

    Returns:
        Array with one deviation per feature column.
    """
    def _squared_deviation(row):
        centered = row[:11] - mean
        return centered ** 2

    def _accumulate(acc, term):
        return acc[:11] + term[:11]

    squared_sum = rdd.map(_squared_deviation).reduce(_accumulate)
    variance = squared_sum / total_rows
    return variance ** (1/2)


def normalize(RDD_Xy:RDD) -> RDD:
    """Standardise the 11 feature columns of every record.

    Each of the first 11 entries of a record has its column mean subtracted
    and is divided by its column standard deviation; the last entry of the
    record (the label) is appended unchanged.

    Args:
        RDD_Xy: RDD of 1-D arrays with the features first and the label last.

    Returns:
        RDD of arrays holding the 11 standardised features followed by the
        original label.
    """
    def _standardize(row, mean, scale):
        scaled = (row[:11] - mean) / scale
        return np.concatenate((scaled, [row[-1]]))

    m = RDD_Xy.count()
    mean = _calculate_mean(RDD_Xy, m)
    desvest = _calculate_stdev(RDD_Xy, mean, m)
    # A constant column has zero deviation; dividing by it would produce
    # NaN (0/0) and poison every later computation. Using 1 as the divisor
    # maps such a column to 0 instead.
    scale = np.where(desvest == 0, 1.0, desvest)
    return RDD_Xy.map(lambda row: _standardize(row, mean, scale))


def _sigmoid(z):
    """Return the logistic function ``1 / (1 + e**-z)`` of ``z``.

    Works element-wise on NumPy arrays as well as on scalars.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def _map_label_y_hat(element, W:np.ndarray, b:np.ndarray):
    """Pair a record's label with its predicted probability.

    Args:
        element: 1-D array with the features first and the label last.
        W: Weight vector; its size sets how many leading entries of
            ``element`` are used as features.
        b: Bias, given as a scalar or as a one-element array.

    Returns:
        ``(label, y_hat)`` with ``y_hat`` as a Python float. The label is
        kept next to the probability because the cost computation needs
        them paired.
    """
    n = W.size
    # Vectorised replacement for the element-by-element accumulation.
    z = np.dot(element[:n], np.ravel(W))
    # Pull the bias out as a true scalar: calling float() on a one-element
    # array is deprecated since NumPy 1.25.
    bias = np.asarray(b).item()
    y_hat = float(_sigmoid(z + bias))
    return (element[-1], y_hat)

def _calculate_y_hat(RDD_Xy, W:np.ndarray, b:np.ndarray) -> RDD:
    """Map every record of ``RDD_Xy`` to its ``(label, y_hat)`` pair.

    Args:
        RDD_Xy: RDD of records accepted by ``_map_label_y_hat``.
        W: Weight vector passed to ``_map_label_y_hat``.
        b: Bias passed to ``_map_label_y_hat``.

    Returns:
        RDD with one ``(label, y_hat)`` tuple per input record.
    """
    def _score(record):
        return _map_label_y_hat(record, W, b)

    return RDD_Xy.map(_score)

def _calculate_cost_J(RDD_label_y_hat) -> float:
    """Return the mean binary cross-entropy of ``(label, y_hat)`` pairs.

    Args:
        RDD_label_y_hat: RDD of ``(label, y_hat)`` tuples.

    Returns:
        The negated sum of ``label*log(y_hat) + (1-label)*log(1-y_hat)``
        divided by the number of records.

    Raises:
        ValueError: Propagated from ``RDD.reduce`` when the RDD is empty.
    """
    # Margin kept between y_hat and the exact values 0 and 1.
    eps = np.finfo(float).eps

    def _map_cost_function_J(element):
        label, y_hat = element
        # A saturated sigmoid yields exactly 0.0 or 1.0; log(0) is -inf and
        # 0 * -inf is nan, which would turn the whole sum into nan.
        y_hat = min(max(y_hat, eps), 1 - eps)
        return label * np.log(y_hat) + (1 - label) * np.log(1 - y_hat)

    aux_costs = RDD_label_y_hat.map(_map_cost_function_J)
    m = RDD_label_y_hat.count()
    return -aux_costs.reduce(lambda x, y: x + y) / m


#--------------------------------------------------
def _map_label_y_pred(element, W:np.ndarray, b:np.ndarray):
    """Pair a record's label with its predicted class (0 or 1).

    The probability comes from ``_map_label_y_hat`` so that the forward
    pass lives in a single place; it is then thresholded here.

    Args:
        element: 1-D array with the features first and the label last.
        W: Weight vector.
        b: Bias.

    Returns:
        ``(label, y_pred)`` where ``y_pred`` is 1 when the probability is
        strictly greater than 0.5 and 0 otherwise. The label stays paired
        with the prediction so that hits can be counted for the accuracy.
    """
    THRESHOLD = 0.5
    label, y_hat = _map_label_y_hat(element, W, b)
    return (label, 1 if y_hat > THRESHOLD else 0)

def predict(W:np.ndarray, b:np.ndarray, RDD_Xy) -> RDD:
    """Map every record of ``RDD_Xy`` to its ``(label, y_pred)`` pair.

    Args:
        W: Weight vector passed to ``_map_label_y_pred``.
        b: Bias passed to ``_map_label_y_pred``.
        RDD_Xy: RDD of records accepted by ``_map_label_y_pred``.

    Returns:
        RDD with one ``(label, y_pred)`` tuple per input record.
    """
    def _classify(record):
        return _map_label_y_pred(record, W, b)

    return RDD_Xy.map(_classify)

def accuracy(W, b, RDD_Xy:RDD) -> float:
    """Return the fraction of records whose predicted class equals the label.

    Hits and record count are accumulated together in one reduce, so the
    prediction pipeline is evaluated by a single Spark action instead of
    separate ``count`` and ``reduce`` actions.

    Args:
        W: Weight vector passed to ``predict``.
        b: Bias passed to ``predict``.
        RDD_Xy: RDD of records with the features first and the label last.

    Returns:
        Number of correct predictions divided by the number of records.

    Raises:
        ValueError: Propagated from ``RDD.reduce`` when ``RDD_Xy`` is empty.
    """
    def _hit_and_one(pair):
        label, y_pred = pair
        return (1 if label == y_pred else 0, 1)

    def _add_pairs(left, right):
        return (left[0] + right[0], left[1] + right[1])

    hits, total = predict(W, b, RDD_Xy).map(_hit_and_one).reduce(_add_pairs)
    return hits / total
#--------------------------------------------------



def train(RDD_Xy:RDD, iterations, learning_rate):
    """Run gradient-descent updates on the bias of a logistic model.

    The weights are fixed at ``np.ones(11)`` and are never updated here;
    only the bias (initialised to ``np.ones(1)``) is adjusted. The cost of
    each iteration is printed, computed from the predictions made before
    that iteration's bias update.

    Args:
        RDD_Xy: RDD of records with the features first and the label last.
        iterations: Number of update steps to run.
        learning_rate: Step size applied to the bias gradient.

    Returns:
        The bias array after the last iteration.
    """
    def _residual(pair):
        label, y_hat = pair
        return y_hat - label

    m = RDD_Xy.count()
    # Deterministic starting point (a random initialisation in [-1, 1) was
    # tried before and is currently disabled).
    W = np.ones(11)
    b = np.ones(1)

    for _ in range(iterations):
        rdd_y_hat = _calculate_y_hat(RDD_Xy, W, b)

        # TODO: the weight update is not implemented yet. It needs the
        # features, the label and y_hat together in one RDD:
        #   dW = (X.T @ (y_hat - y_label)) / m
        #   W = W - learning_rate * dW

        residual_sum = rdd_y_hat.map(_residual).reduce(lambda x, y: x + y)
        db = residual_sum / m
        b = b - learning_rate * db

        cost = _calculate_cost_J(rdd_y_hat)
        print('[train]Cost = ', cost)

    return b




# Alternative dataset path, kept for reference:
#FILE_NAME = '../../0-SPAI/1-datos/botnet_tot_syn_l.csv'
FILE_NAME = 'botnet_sample.csv'
LEARNING_RATE = 1.5
N_ITER = 1

# Spark session in local mode. readFile() reaches the SparkContext through
# the global name `contex_global`, so it has to be bound before the call.
session_global = SparkSession.builder.master('local[*]').getOrCreate()
contex_global = session_global.sparkContext

# Read the data: one float ndarray per line of the file.
data_raw_rdd = readFile(FILE_NAME)

# Manual checks of the mean / standard deviation helpers (disabled):
#data_raw_rdd.take(5)
#m = data_raw_rdd.count()
#media = _calculate_mean(data_raw_rdd, m)
#print('Media', media)
#desvest = _calculate_stdev(data_raw_rdd, media, m)
#print('Desvest = ', desvest)

# Standardise the feature columns.
normal_data = normalize(data_raw_rdd)
#normal_data.take(2)

# Parameters used only by the disabled manual checks below; train() builds
# its own W and b.
W = np.ones(11)
b = np.ones(1)

#rdd_y_hat = _calculate_y_hat(normal_data, W, b)
#rdd_y_hat.take(10) # Computation validated.
#cost = _calculate_cost_J(rdd_y_hat) # Computation validated.
#print('Coste = ', cost)

#rdd_y_pred = predict(W, b, normal_data)
#rdd_y_pred.take(10) # Computation validated.

# Train. train() returns the bias as a NumPy array, so `take(0)` below is
# ndarray.take and yields its first element.
ws = train(normal_data, N_ITER, LEARNING_RATE)
ws.take(0)

#acc = accuracy(W, b, normal_data)
#print('Accuracy ', acc) # Computation validated.


[train]Cost =  1.9231490251626502


0.8830944953296314