Implementación distribuida y paralelizada de un algoritmo de Machine Learning

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

import numpy as np
from pyspark import RDD

import time
import json


def readFile(filename):
    """Load a comma-separated text file as an RDD of float vectors.

    The file is read through the module-level Spark context
    ``contex_global``, which must exist before this function is called.

    Args:
        filename: Path handed to ``SparkContext.textFile``.

    Returns:
        An RDD with one ``np.ndarray`` of floats per line of the file.
    """
    def parse_row(line:str) -> np.ndarray:
        # Each comma-separated field is converted with float(); the whole
        # line becomes a single NumPy vector (an array, not a tuple).
        return np.array(list(map(float, line.split(','))))

    lines = contex_global.textFile(filename)
    parsed = lines.map(parse_row)
    return parsed


def _calculate_mean(rdd:RDD, total_rows:int) -> np.ndarray:
    """Compute the per-feature mean of the first 11 columns of every row.

    Args:
        rdd: RDD of NumPy row vectors; only ``row[:11]`` is used.
        total_rows: Number of rows in ``rdd`` (the divisor of the sum).

    Returns:
        Array with the mean of each of the first 11 columns.

    Raises:
        ValueError: Propagated from ``RDD.reduce`` when ``rdd`` is empty.
    """
    # Slice in a map step rather than inside the reducer: RDD.reduce never
    # invokes the reducer when there is a single element, which would leave
    # the trailing label column in the result for a one-row dataset.
    features = rdd.map(lambda row: row[:11])
    column_sums = features.reduce(lambda left, right: left + right)
    return column_sums / total_rows

def _calculate_stdev(rdd:RDD, mean:np.ndarray, total_rows:int) -> np.ndarray:
    """Compute the per-feature population standard deviation.

    Args:
        rdd: RDD of NumPy row vectors; only ``row[:11]`` is used.
        mean: Per-feature mean that is subtracted from ``row[:11]``.
        total_rows: Number of rows in ``rdd`` (the divisor of the sum of
            squared deviations).

    Returns:
        Array with the square root of the mean squared deviation per column.
    """
    squared_deviations = rdd.map(lambda row: (row[:11] - mean)**2)
    sum_of_squares = squared_deviations.reduce(
        lambda acc, nxt: acc[:11] + nxt[:11]
    )
    variance = sum_of_squares / total_rows
    return variance**(1/2)


def normalize(RDD_Xy:RDD) -> RDD:
    """Standardize the first 11 columns of every row (zero mean, unit std).

    The last entry of each row is carried over unchanged and appended after
    the 11 standardized values.

    Args:
        RDD_Xy: RDD of NumPy row vectors.

    Returns:
        RDD of 12-element arrays: 11 standardized values followed by the
        original last entry of the row.

    Raises:
        ValueError: Propagated from ``RDD.reduce`` when ``RDD_Xy`` is empty.
    """
    def map_function(element, mean, desvest):
        x_values = (element[:11] - mean) / desvest
        return np.concatenate((x_values, [element[-1]]))

    m = RDD_Xy.count()
    mean = _calculate_mean(RDD_Xy, m)
    desvest = _calculate_stdev(RDD_Xy, mean, m)

    # A constant column has zero standard deviation; dividing by it would
    # yield NaN/inf and poison every later computation. Dividing by 1 keeps
    # such a column at exactly 0 after centering.
    safe_desvest = np.where(desvest == 0, 1.0, desvest)

    return RDD_Xy.map(lambda element: map_function(element, mean, safe_desvest))


def _sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of a scalar or array."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def _map_label_y_hat_diffs(element, W:np.ndarray, b:np.ndarray):
    """Append the prediction ŷ and the error ŷ - y to a row.

    The two values are appended after the label so that each row keeps its
    features, label, prediction and error together; the error is what the
    cost and the gradients of W and b are later computed from.

    Args:
        element: Row whose first ``W.size`` entries are the features and
            whose last entry is the label.
        W: Weight vector.
        b: Bias, either a scalar or a one-element array.

    Returns:
        ``element`` extended with ``[y_hat, y_hat - label]``.
    """
    n = W.size
    # Vectorized dot product over the first n entries (the features).
    z = np.dot(element[:n], np.ravel(W)) + b

    # `b` may be a one-element array, which makes the sigmoid output an
    # array too. Calling float() on an array with ndim > 0 is deprecated in
    # NumPy >= 1.25, so extract the single value explicitly.
    y_hat = np.asarray(_sigmoid(z)).item()
    diff = y_hat - element[-1]

    return np.append(element, [y_hat, diff])

def _calculate_y_hat_and_diffs(RDD_Xy, W:np.ndarray, b:np.ndarray) -> RDD:
    """Map every row through ``_map_label_y_hat_diffs`` with the given W and b.

    Returns:
        A new RDD whose rows are the results of that mapping.
    """
    def extend_row(row):
        return _map_label_y_hat_diffs(row, W, b)

    return RDD_Xy.map(extend_row)

def _calculate_cost_J(RDD_XY) -> float:
    """Compute the mean binary cross-entropy cost over an RDD.

    Args:
        RDD_XY: RDD of rows where ``row[-3]`` is the label and ``row[-2]``
            is the predicted probability ŷ.

    Returns:
        ``-(1/m) * sum(y * log(ŷ) + (1 - y) * log(1 - ŷ))`` using the
        natural logarithm, where m is the number of rows.

    Raises:
        ValueError: Propagated from ``RDD.reduce`` when the RDD is empty.
    """
    # Keep ŷ strictly inside (0, 1): a saturated sigmoid returns exactly
    # 0.0 or 1.0, and then 0 * log(0) evaluates to NaN even for a correct
    # prediction, turning the whole cost into NaN.
    EPSILON = 1e-15

    def _cost_and_count(element):
        # Mind the indexes: label and ŷ are read relative to the row's end.
        label = element[-3]
        y_hat = np.clip(element[-2], EPSILON, 1 - EPSILON)
        cost = label * np.log(y_hat) + (1 - label) * np.log(1 - y_hat)
        return (cost, 1)

    # Accumulate the cost sum and the row count in a single pass over the
    # data instead of running separate count() and reduce() actions.
    total_cost, m = RDD_XY.map(_cost_and_count).reduce(
        lambda left, right: (left[0] + right[0], left[1] + right[1])
    )
    return -total_cost / m

def _map_label_y_pred(element, W:np.ndarray, b:np.ndarray):
    """Return ``(label, predicted_class)`` for one row.

    The label is kept next to the prediction so that accuracy can later
    count how many times the two agree.

    Args:
        element: Row whose first ``W.size`` entries are the features and
            whose last entry is the label.
        W: Weight vector.
        b: Bias, either a scalar or a one-element array.

    Returns:
        Tuple ``(element[-1], y_pred)`` where ``y_pred`` is 1 when the
        predicted probability is strictly greater than 0.5, otherwise 0.
    """
    THRESHOLD = 0.5

    n = W.size
    # Vectorized dot product over the first n entries (the features).
    z = np.dot(element[:n], np.ravel(W)) + b

    # `b` may be a one-element array; float() on an array with ndim > 0 is
    # deprecated in NumPy >= 1.25, so extract the single value explicitly.
    y_hat = np.asarray(_sigmoid(z)).item()

    y_pred = 1 if y_hat > THRESHOLD else 0
    return (element[-1], y_pred)

def predict(W:np.ndarray, b:np.ndarray, RDD_Xy) -> RDD:
    """Map every row to the result of ``_map_label_y_pred`` for the given W and b.

    Returns:
        A new RDD with one mapped result per input row.
    """
    def classify(row):
        return _map_label_y_pred(row, W, b)

    return RDD_Xy.map(classify)

def accuracy(W, b, RDD_Xy:RDD) -> float:
    """Return the fraction of rows whose predicted class equals the label.

    Args:
        W: Weight vector passed on to ``predict``.
        b: Bias passed on to ``predict``.
        RDD_Xy: RDD of rows to evaluate.

    Returns:
        Number of correct predictions divided by the number of rows.
    """
    def is_hit(pair):
        label, y_pred = pair
        return 1 if label == y_pred else 0

    hits = predict(W, b, RDD_Xy).map(is_hit)
    total = hits.count()
    correct = hits.reduce(lambda left, right: left + right)
    return correct / total

def train(RDD_Xy:RDD, iterations, learning_rate):
    """Fit a logistic-regression model with batch gradient descent.

    Weights and bias are initialised uniformly at random in [-1, 1). On
    every iteration the cost is appended to the module-level list
    ``cost_list`` (which must exist before calling) and printed.

    Args:
        RDD_Xy: RDD of rows with 11 features followed by the label.
        iterations: Number of gradient-descent steps to run.
        learning_rate: Step size applied to both gradients.

    Returns:
        Tuple ``(W, b)``: the 11-element weight vector and the bias as a
        one-element array.
    """
    MODEL_PARAMS = 12 # 11 weights + 1 bias
    num_W = MODEL_PARAMS - 1

    def b_map(element):
        return element[-1] # Last entry is the error ŷ - y

    def W_map(element):
        # Error times each feature: one row's contribution to dJ/dW.
        return element[-1] * element[:num_W]

    m = RDD_Xy.count()

    W = 2 * np.random.rand(num_W) - 1
    b = 2 * np.random.rand(1) - 1

    for it in range(iterations):
        # This RDD feeds several actions below (two gradient reductions and
        # the cost). Caching it avoids recomputing the whole lineage, forward
        # pass included, once per action.
        rdd_xy_y_hat_diffs = _calculate_y_hat_and_diffs(RDD_Xy, W, b).cache()
        try:
            dW = rdd_xy_y_hat_diffs.map(W_map).reduce(lambda x,y: x + y) / m
            db = rdd_xy_y_hat_diffs.map(b_map).reduce(lambda x,y: x + y) / m
            # Cost of the parameters used for this iteration's forward pass.
            cost = _calculate_cost_J(rdd_xy_y_hat_diffs)
        finally:
            # Free the cached partitions before the next iteration.
            rdd_xy_y_hat_diffs.unpersist()

        W = W - learning_rate * dW
        b = b - learning_rate * db

        cost_list.append(cost)
        print(f'[train] Iteración {it}, Coste = {cost}')

    return W,b

import json
import numpy as np

def save_data(resultado, filename):
    """Append one result record to a JSON file holding a list of records.

    NumPy scalars and arrays anywhere inside ``resultado`` (including inside
    nested lists, tuples and dicts) are converted to plain Python values so
    that ``json.dump`` can serialise them. The conversion works on a copy;
    the caller's object is not modified.

    Args:
        resultado: Record to store, normally a dict.
        filename: Path of the JSON file. If it does not exist, it is created
            containing a one-element list.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
    """
    def _to_builtin(value):
        # Recursively replace NumPy types with their Python equivalents.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, dict):
            return {key: _to_builtin(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_to_builtin(item) for item in value]
        return value

    record = _to_builtin(resultado)

    try:
        with open(filename, 'r', encoding='utf-8') as infile:
            data = json.load(infile)
    except FileNotFoundError:
        data = []  # No file yet: start with an empty list

    data.append(record)

    with open(filename, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4)



# Experiment driver: trains the model once per learning rate and appends
# each run's metrics to OUTPUT_FILE.
#
# Alternative input files, kept for reference:
#FILE_NAME = '../../0-SPAI/1-datos/botnet_tot_syn_l.csv'
#FILE_NAME = 'botnet_sample.csv'
FILE_NAME = '../../0-SPAI/1-datos/botnet_100k.csv'

# Gradient-descent iterations per training run.
N_ITER = 20

# JSON file that accumulates one record per learning rate.
OUTPUT_FILE = 'learning_rate_100k.json'


# Sweep the integer learning rates 1 through 9.
for learning_rate in np.arange(1 ,10, 1):
    # Stop the session left over from the previous iteration (or from an
    # earlier run in the same interpreter) so each run starts on a fresh one.
    if 'session_global' in globals():
        session_global.stop()
    
    session_global = SparkSession.builder.master(f'local[*]').getOrCreate()
    # readFile() reads this module-level Spark context by name.
    contex_global = session_global.sparkContext

    # The measured runtime covers loading, normalisation, training and the
    # accuracy evaluation.
    start = time.time()
    print('Learning Rate ', learning_rate)

    # Read the data
    data_raw_rdd = readFile(FILE_NAME)

    # Standardize the features
    normal_data = normalize(data_raw_rdd)

    # Train; train() appends each iteration's cost to the global cost_list,
    # so the list is reset before every run.
    cost_list = []
    W, b = train(normal_data, N_ITER, learning_rate)

    # Accuracy is measured on the same data the model was trained on.
    acc = accuracy(W, b, normal_data)
    print('Accuracy ', acc)

    end = time.time()
    runtime = end - start
    print('Tiempo de ejecución (s) ', runtime)

    # Record for this run; save_data() converts NumPy values before dumping.
    results = {
        "learn_rate": learning_rate,
        "accuracy": acc,
        "runtime": runtime,
        "costs": cost_list
    }

    save_data(results, OUTPUT_FILE)


24/10/26 19:30:42 WARN Utils: Your hostname, daniel-fpga resolves to a loopback address: 127.0.1.1; using 192.168.0.201 instead (on interface enp2s0f2)
24/10/26 19:30:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/26 19:30:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Learning Rate  1


                                                                                

[train] Iteración 0, Coste = 0.9418597471572787


                                                                                

[train] Iteración 1, Coste = 0.5636649914572222


                                                                                

[train] Iteración 2, Coste = 0.40473652030380136


                                                                                

[train] Iteración 3, Coste = 0.33162225635784154


                                                                                

[train] Iteración 4, Coste = 0.2922188504841883


                                                                                

[train] Iteración 5, Coste = 0.268146712790378


                                                                                

[train] Iteración 6, Coste = 0.2520144481269713


                                                                                

[train] Iteración 7, Coste = 0.24043513657419044


                                                                                

[train] Iteración 8, Coste = 0.2316822424994792


                                                                                

[train] Iteración 9, Coste = 0.2247976804109715


                                                                                

[train] Iteración 10, Coste = 0.2192122499565941


                                                                                

[train] Iteración 11, Coste = 0.21456836070572513


                                                                                

[train] Iteración 12, Coste = 0.21063065555581256


                                                                                

[train] Iteración 13, Coste = 0.2072380042944906


                                                                                

[train] Iteración 14, Coste = 0.20427631084741044


                                                                                

[train] Iteración 15, Coste = 0.20166239712223094


                                                                                

[train] Iteración 16, Coste = 0.19933407083235172


                                                                                

[train] Iteración 17, Coste = 0.1972437916055126


                                                                                

[train] Iteración 18, Coste = 0.19535450790764233


                                                                                

[train] Iteración 19, Coste = 0.19363684609753595


                                                                                

Accuracy  0.93407
Tiempo de ejecución (s)  247.05896067619324




Learning Rate  2


                                                                                

[train] Iteración 0, Coste = 0.7485376899184328


                                                                                

[train] Iteración 1, Coste = 0.31246589656251933


                                                                                

[train] Iteración 2, Coste = 0.24878943546626775


                                                                                

[train] Iteración 3, Coste = 0.22505289951969115


                                                                                

[train] Iteración 4, Coste = 0.21246788369730038


                                                                                

[train] Iteración 5, Coste = 0.20449097938006897


                                                                                

[train] Iteración 6, Coste = 0.19885543214678028


                                                                                

[train] Iteración 7, Coste = 0.19458313235944188


                                                                                

[train] Iteración 8, Coste = 0.1911861914995496


                                                                                

[train] Iteración 9, Coste = 0.18839370862824878


                                                                                

[train] Iteración 10, Coste = 0.1860421060743102


                                                                                

[train] Iteración 11, Coste = 0.18402574466192576


                                                                                

[train] Iteración 12, Coste = 0.18227259732893997


                                                                                

[train] Iteración 13, Coste = 0.18073135014035632


                                                                                

[train] Iteración 14, Coste = 0.17936412107723015


                                                                                

[train] Iteración 15, Coste = 0.17814212153713022


                                                                                

[train] Iteración 16, Coste = 0.1770429466855402


                                                                                

[train] Iteración 17, Coste = 0.17604881239157788


                                                                                

[train] Iteración 18, Coste = 0.1751453665821909


                                                                                

[train] Iteración 19, Coste = 0.17432086284298667


                                                                                

Accuracy  0.93701
Tiempo de ejecución (s)  242.7846601009369




Learning Rate  3


                                                                                

[train] Iteración 0, Coste = 0.9700536127608774


                                                                                

[train] Iteración 1, Coste = 0.3852971347618637


                                                                                

[train] Iteración 2, Coste = 0.28886456308360264


                                                                                

[train] Iteración 3, Coste = 0.2483585327863896


                                                                                

[train] Iteración 4, Coste = 0.22648391369108525


                                                                                

[train] Iteración 5, Coste = 0.21280558217318235


                                                                                

[train] Iteración 6, Coste = 0.20345634607753404


                                                                                

[train] Iteración 7, Coste = 0.19667704083794132


                                                                                

[train] Iteración 8, Coste = 0.19154771758333808


                                                                                

[train] Iteración 9, Coste = 0.18753926954014227


                                                                                

[train] Iteración 10, Coste = 0.18432584942632216


                                                                                

[train] Iteración 11, Coste = 0.18169608562094638


                                                                                

[train] Iteración 12, Coste = 0.17950707862971393


                                                                                

[train] Iteración 13, Coste = 0.1776588771181767


                                                                                

[train] Iteración 14, Coste = 0.17607953676923505


                                                                                

[train] Iteración 15, Coste = 0.174715984922317


                                                                                

[train] Iteración 16, Coste = 0.17352822774133528


                                                                                

[train] Iteración 17, Coste = 0.1724855604473604


                                                                                

[train] Iteración 18, Coste = 0.1715640195994598


                                                                                

[train] Iteración 19, Coste = 0.17074462871799767


                                                                                

Accuracy  0.93609
Tiempo de ejecución (s)  260.15212965011597




Learning Rate  4


                                                                                

[train] Iteración 0, Coste = 0.926301035079842


                                                                                

[train] Iteración 1, Coste = 0.2821214747490549


                                                                                

[train] Iteración 2, Coste = 0.2298595486235777


                                                                                

[train] Iteración 3, Coste = 0.20803494069126952


                                                                                

[train] Iteración 4, Coste = 0.19629400361486335


                                                                                

[train] Iteración 5, Coste = 0.18899828146948391


                                                                                

[train] Iteración 6, Coste = 0.18402337278973072


                                                                                

[train] Iteración 7, Coste = 0.18040678621579107


                                                                                

[train] Iteración 8, Coste = 0.1776539736476526


                                                                                

[train] Iteración 9, Coste = 0.17548589186076635


                                                                                

[train] Iteración 10, Coste = 0.1737332287929841


                                                                                

[train] Iteración 11, Coste = 0.17228714885996843


                                                                                

[train] Iteración 12, Coste = 0.17107435105593932


                                                                                

[train] Iteración 13, Coste = 0.17004354161441945


                                                                                

[train] Iteración 14, Coste = 0.16915766646587865


                                                                                

[train] Iteración 15, Coste = 0.16838923157399804


                                                                                

[train] Iteración 16, Coste = 0.16771736608796364


                                                                                

[train] Iteración 17, Coste = 0.16712591347158087


                                                                                

[train] Iteración 18, Coste = 0.16660215250746208


                                                                                

[train] Iteración 19, Coste = 0.16613591726637394


                                                                                

Accuracy  0.93782
Tiempo de ejecución (s)  249.53757500648499




Learning Rate  5


                                                                                

[train] Iteración 0, Coste = 0.5803826828575923


                                                                                

[train] Iteración 1, Coste = 0.2677482748362452


                                                                                

[train] Iteración 2, Coste = 0.2212769962776411


                                                                                

[train] Iteración 3, Coste = 0.20232342856132643


                                                                                

[train] Iteración 4, Coste = 0.19224686834263718


                                                                                

[train] Iteración 5, Coste = 0.18591530605667864


                                                                                

[train] Iteración 6, Coste = 0.181506874467881


                                                                                

[train] Iteración 7, Coste = 0.1782340088995101


                                                                                

[train] Iteración 8, Coste = 0.17570009231140127


                                                                                

[train] Iteración 9, Coste = 0.17368048766912106


                                                                                

[train] Iteración 10, Coste = 0.17203620822951787


                                                                                

[train] Iteración 11, Coste = 0.17067543322408227


                                                                                

[train] Iteración 12, Coste = 0.16953446431909444


                                                                                

[train] Iteración 13, Coste = 0.16856744181496164


                                                                                

[train] Iteración 14, Coste = 0.16774037953480186


                                                                                

[train] Iteración 15, Coste = 0.16702749927529478


                                                                                

[train] Iteración 16, Coste = 0.16640887142089342


                                                                                

[train] Iteración 17, Coste = 0.16586883826628707


                                                                                

[train] Iteración 18, Coste = 0.1653949275486885


                                                                                

[train] Iteración 19, Coste = 0.16497708442494313


                                                                                

Accuracy  0.93753
Tiempo de ejecución (s)  250.5202910900116




Learning Rate  6


                                                                                

[train] Iteración 0, Coste = 0.6100866735610575


                                                                                

[train] Iteración 1, Coste = 0.21955279072733166


                                                                                

[train] Iteración 2, Coste = 0.19376697405305401


                                                                                

[train] Iteración 3, Coste = 0.18449734425928546


                                                                                

[train] Iteración 4, Coste = 0.17921067944628313


                                                                                

[train] Iteración 5, Coste = 0.17568216487564903


                                                                                

[train] Iteración 6, Coste = 0.17313567447351416


                                                                                

[train] Iteración 7, Coste = 0.17120691971673754


                                                                                

[train] Iteración 8, Coste = 0.16969654597217787


                                                                                

[train] Iteración 9, Coste = 0.1684844735308511


                                                                                

[train] Iteración 10, Coste = 0.16749335106492116


                                                                                

[train] Iteración 11, Coste = 0.16667077681556472


                                                                                

[train] Iteración 12, Coste = 0.16597982179461906


                                                                                

[train] Iteración 13, Coste = 0.16539362021520587


                                                                                

[train] Iteración 14, Coste = 0.1648921099332908


                                                                                

[train] Iteración 15, Coste = 0.16445997943309307


                                                                                

[train] Iteración 16, Coste = 0.16408532582601074


                                                                                

[train] Iteración 17, Coste = 0.16375874942487545


                                                                                

[train] Iteración 18, Coste = 0.16347272598458382


                                                                                

[train] Iteración 19, Coste = 0.1632211610159304


                                                                                

Accuracy  0.93786
Tiempo de ejecución (s)  251.82562589645386




Learning Rate  7


                                                                                

[train] Iteración 0, Coste = 1.8255530520673953


                                                                                

[train] Iteración 1, Coste = 0.3029637966931556


                                                                                

[train] Iteración 2, Coste = 0.2168300574351602


                                                                                

[train] Iteración 3, Coste = 0.1869572661692127


                                                                                

[train] Iteración 4, Coste = 0.17479749733973715


                                                                                

[train] Iteración 5, Coste = 0.1690957547590704


                                                                                

[train] Iteración 6, Coste = 0.16618867126746525


                                                                                

[train] Iteración 7, Coste = 0.16459138434784334


                                                                                

[train] Iteración 8, Coste = 0.16364140559512935


                                                                                

[train] Iteración 9, Coste = 0.16303311541426388


                                                                                

[train] Iteración 10, Coste = 0.16261937411832833


                                                                                

[train] Iteración 11, Coste = 0.16232436599449912


                                                                                

[train] Iteración 12, Coste = 0.16210590016745902


                                                                                

[train] Iteración 13, Coste = 0.16193886565902513


                                                                                

[train] Iteración 14, Coste = 0.1618075462774578


                                                                                

[train] Iteración 15, Coste = 0.16170174633831394


                                                                                

[train] Iteración 16, Coste = 0.16161467134547544


                                                                                

[train] Iteración 17, Coste = 0.16154169090590195


                                                                                

[train] Iteración 18, Coste = 0.16147958325850936


                                                                                

[train] Iteración 19, Coste = 0.16142605992412096


                                                                                

Accuracy  0.93789
Tiempo de ejecución (s)  247.5750913619995




Learning Rate  8


                                                                                

[train] Iteración 0, Coste = 0.5136207401404069


                                                                                

[train] Iteración 1, Coste = 0.23385865704557707


                                                                                

[train] Iteración 2, Coste = 0.18476186828293548


                                                                                

[train] Iteración 3, Coste = 0.1769646260692778


                                                                                

[train] Iteración 4, Coste = 0.17313571924118354


                                                                                

[train] Iteración 5, Coste = 0.17061260175286194


                                                                                

[train] Iteración 6, Coste = 0.1687922277110751


                                                                                

[train] Iteración 7, Coste = 0.16741936340751556


                                                                                

[train] Iteración 8, Coste = 0.16635339043904826


                                                                                

[train] Iteración 9, Coste = 0.1655076812807045


                                                                                

[train] Iteración 10, Coste = 0.16482535738964638


                                                                                

[train] Iteración 11, Coste = 0.16426736532080738


                                                                                

[train] Iteración 12, Coste = 0.1638059495199166


                                                                                

[train] Iteración 13, Coste = 0.16342082743293687


                                                                                

[train] Iteración 14, Coste = 0.16309683257335877


                                                                                

[train] Iteración 15, Coste = 0.16282240264133135


                                                                                

[train] Iteración 16, Coste = 0.16258857674087776


                                                                                

[train] Iteración 17, Coste = 0.1623883110528026


                                                                                

[train] Iteración 18, Coste = 0.16221600019795976


                                                                                

[train] Iteración 19, Coste = 0.1620671352023283


                                                                                

Accuracy  0.93785
Tiempo de ejecución (s)  257.8212454319




Learning Rate  9


                                                                                

[train] Iteración 0, Coste = 1.2463755029375003


                                                                                

[train] Iteración 1, Coste = 0.4070245119354466


                                                                                

[train] Iteración 2, Coste = 0.26080155395297683


                                                                                

[train] Iteración 3, Coste = 0.20873583407674837


                                                                                

[train] Iteración 4, Coste = 0.1863895555318697


                                                                                

[train] Iteración 5, Coste = 0.17524969532197152


                                                                                

[train] Iteración 6, Coste = 0.16931932682866346


                                                                                

[train] Iteración 7, Coste = 0.16602680477267417


                                                                                

[train] Iteración 8, Coste = 0.1641197991658796


                                                                                

[train] Iteración 9, Coste = 0.16297077264348517


                                                                                

[train] Iteración 10, Coste = 0.16225764653846728


                                                                                

[train] Iteración 11, Coste = 0.16180668293408845


                                                                                

[train] Iteración 12, Coste = 0.16151827150232217


                                                                                

[train] Iteración 13, Coste = 0.1613324012981967


                                                                                

[train] Iteración 14, Coste = 0.16121180842400432


                                                                                

[train] Iteración 15, Coste = 0.1611329991675359


                                                                                

[train] Iteración 16, Coste = 0.16108105805834774


                                                                                

[train] Iteración 17, Coste = 0.16104648198001364


                                                                                

[train] Iteración 18, Coste = 0.16102319924368222


                                                                                

[train] Iteración 19, Coste = 0.16100731775227786


[Stage 85:>                                                         (0 + 2) / 2]

Accuracy  0.93821
Tiempo de ejecución (s)  256.2562675476074


                                                                                