# Intro ML SUP en BD

Creación de la sesión Spark:

In [90]:
#import SparkSession

from pyspark.sql import SparkSession

In [91]:
# Crear el spark session object, llamarle "supervised_ml"

supervised_ml=SparkSession.builder.appName('clase_6').getOrCreate()

## Regression 

Carga de datos, archivo *Linear_regression_dataset.csv*:

In [92]:
# Carga de datos

df = supervised_ml.read.format('csv').options(header='true', inferSchema='true').load('Linear_regression_dataset.csv')

Se invocan las librerias correcpondientes a **LinearRegression**, asi como las de OneHotEncoder, StringIndexer, VectorAssembler:

In [93]:
# Importacion de libs y operaciones

from pyspark.ml.regression  import LinearRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

Se visualizan algunos datos:

In [94]:
print((df.count(), len(df.columns))) 

(1232, 6)


Se muestran los primeros 10 datos:

In [95]:
# primeros 10 datos

df.toPandas().head(10)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,label
0,734,688,81,0.328,0.259,0.418
1,700,600,94,0.32,0.247,0.389
2,712,705,93,0.311,0.247,0.417
3,734,806,69,0.315,0.26,0.415
4,613,759,61,0.302,0.24,0.378
5,748,676,85,0.318,0.255,0.422
6,669,588,97,0.315,0.251,0.411
7,667,845,68,0.324,0.251,0.381
8,758,890,64,0.33,0.274,0.436
9,726,670,88,0.335,0.268,0.422


## Feature Engineering

Creamos un solo vector con todos los features i.e 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', a este le llamaremos "features" y como salida colocamos a 'label':

In [96]:
# Vector Ensamblador

df_assembler = VectorAssembler(inputCols=['var_1','var_2','var_3','var_4','var_5'], outputCol="features")
df = df_assembler.transform(df) 

In [97]:
# visulizacion de vector ensamblado compuesto por features y label

df.printSchema()
df.select(['features','label']).toPandas().head(10)

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



Unnamed: 0,features,label
0,"[734.0, 688.0, 81.0, 0.328, 0.259]",0.418
1,"[700.0, 600.0, 94.0, 0.32, 0.247]",0.389
2,"[712.0, 705.0, 93.0, 0.311, 0.247]",0.417
3,"[734.0, 806.0, 69.0, 0.315, 0.26]",0.415
4,"[613.0, 759.0, 61.0, 0.302, 0.24]",0.378
5,"[748.0, 676.0, 85.0, 0.318, 0.255]",0.422
6,"[669.0, 588.0, 97.0, 0.315, 0.251]",0.411
7,"[667.0, 845.0, 68.0, 0.324, 0.251]",0.381
8,"[758.0, 890.0, 64.0, 0.33, 0.274]",0.436
9,"[726.0, 670.0, 88.0, 0.335, 0.268]",0.422


Partimos a continuación el set de datos en 75% training y 25% testing:

In [98]:
# Particion del data set

model=df.select(['features','label'])
train,test = model.randomSplit([0.75,0.25])

print(f"Size of train Dataset : {train.count()}" )
print(f"Size of test Dataset : {test.count()}" )

Size of train Dataset : 923
Size of test Dataset : 309


Creamos el Regresor Lineal: 

In [99]:
lr = LinearRegression()

Entrenamos el modelo de regresión lineal:

In [100]:
# Fit the model, le llamamos lr_model
lr_model=lr.fit(train)

Creamos el dataframe de prediciones (*predictions_df*) a partir del modelo de entrenamiento y el conjunto de datos test: 

In [101]:
predictions_df = lr_model.transform(test)

Visualizamos el contenido de *predictions_df*:

In [102]:
# visulizacion de predictions_df

predictions_df.toPandas().head(10)

Unnamed: 0,features,label,prediction
0,"[463.0, 527.0, 67.0, 0.284, 0.228]",0.311,0.312438
1,"[464.0, 640.0, 66.0, 0.283, 0.22]",0.301,0.313789
2,"[470.0, 509.0, 76.0, 0.289, 0.23]",0.319,0.313237
3,"[519.0, 595.0, 73.0, 0.301, 0.236]",0.332,0.328683
4,"[531.0, 491.0, 89.0, 0.291, 0.225]",0.32,0.331208
5,"[534.0, 609.0, 69.0, 0.304, 0.229]",0.329,0.328326
6,"[536.0, 531.0, 83.0, 0.292, 0.214]",0.318,0.327422
7,"[543.0, 615.0, 76.0, 0.294, 0.233]",0.333,0.340894
8,"[550.0, 631.0, 76.0, 0.306, 0.235]",0.318,0.3375
9,"[550.0, 637.0, 76.0, 0.288, 0.223]",0.326,0.342746


Ahora, evaluamos el modelo de Regresión Lineal, con los datos de TEST:

In [103]:
# evaluacion del modelo, le llamaremos model_predictions

model_predictions = lr_model.evaluate(test)

Imprimimos el valor de R2:

In [104]:
# valor de R2

print(f"R2 : {model_predictions.r2}" )

R2 : 0.8792703448789344


Imprimimos el valor del meanSquaredError:

In [105]:
# valor del meanSquaredError

print(f"Mean Squared Error : {model_predictions.rootMeanSquaredError}" )

Mean Squared Error : 0.012004896556408599


## Regresión con Árboles de Decisión

Importamos la librería *DecisionTreeRegressor*: 

In [106]:
# import lib

from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor

Creamos el Regresor DT, le llamaremos *dec_tree*:

In [107]:
# dec_tree

dec_tree = DecisionTreeRegressor(featuresCol="features")

Entrenamos el modelo:

In [108]:
# Train model, le llamaremos dec_tree_model

dec_tree_model = dec_tree.fit(train)

Cuánto es la profundidad máxima por defecto, de este algoritmo?

R/ El Max Depth no puede exceder la cantidad de 30 en una maquina de 32-bits.

Desplegamos las *featureImportances*:

In [109]:
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9703, 1: 0.0165, 2: 0.003, 3: 0.0034, 4: 0.0068})

Evaluamos el modelo con los datos de entrenamiento:

In [110]:
# Make predictions, le llamaremos model_predictions 

model_predictions = dec_tree_model.transform(test)


In [111]:
# visualizamos

model_predictions.show()

+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[463.0,527.0,67.0...|0.311|0.31962500000000005|
|[464.0,640.0,66.0...|0.301|0.31962500000000005|
|[470.0,509.0,76.0...|0.319|0.31962500000000005|
|[519.0,595.0,73.0...|0.332|0.33675000000000005|
|[531.0,491.0,89.0...| 0.32|0.31962500000000005|
|[534.0,609.0,69.0...|0.329|0.31962500000000005|
|[536.0,531.0,83.0...|0.318|0.31962500000000005|
|[543.0,615.0,76.0...|0.333|0.31962500000000005|
|[550.0,631.0,76.0...|0.318|0.31962500000000005|
|[550.0,637.0,76.0...|0.326|0.31962500000000005|
|[550.0,789.0,54.0...|0.359|0.33675000000000005|
|[555.0,741.0,54.0...|0.348|             0.3435|
|[556.0,674.0,62.0...|0.348|0.35039285714285723|
|[559.0,613.0,75.0...|0.359|0.35039285714285723|
|[567.0,587.0,84.0...|0.349|0.35039285714285723|
|[569.0,620.0,77.0...|0.349|0.35039285714285723|
|[570.0,786.0,57.0...|0.366|0.35039285714285723|
|[571.0,577.0,83.0..

Importamos el **RegressionEvaluator**

In [112]:
# import Evaluator

from pyspark.ml.evaluation import RegressionEvaluator

Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [113]:
# R2 value of the model on test data 

dt_evaluator_r2 = RegressionEvaluator(metricName='r2')

dt_r2 = dt_evaluator_r2.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE value of the model on test data 

dt_evaluator_rmse = RegressionEvaluator(metricName='rmse')
dt_rmse = dt_evaluator_rmse.evaluate(model_predictions)
print(f'Root Mean Squared Error (RMSE) is = {dt_rmse}')

The r-square value of DecisionTreeRegressor is 0.8505959987615451
Root Mean Squared Error (RMSE) is = 0.013354651414976012


## RandomForestRegressor

Importamos a *RandomForestRegressor*

In [114]:
# import lib

from pyspark.ml.regression import RandomForestRegressor


Creamos el Regresor RF:

In [115]:
# Regresor 

rf = RandomForestRegressor(featuresCol="features")

Entrenamos el modelo:

In [116]:
# Train model, le llamaremos rf_model

rf_model = rf.fit(train)

Desplegamos las *featureImportances*:

In [117]:
# importances 

rf_model.featureImportances

SparseVector(5, {0: 0.5521, 1: 0.0251, 2: 0.0109, 3: 0.198, 4: 0.214})

Evaluamos el modelo con los datos de entrenamiento, le llamaremos model_predictions:


In [118]:
# model_predictions

model_predictions = rf_model.transform(test)


Desplegamos los valores del *model_predictions*

In [119]:
model_predictions.show(10)

+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[463.0,527.0,67.0...|0.311|0.32700038143068705|
|[464.0,640.0,66.0...|0.301| 0.3272666693094749|
|[470.0,509.0,76.0...|0.319|0.32809296079576644|
|[519.0,595.0,73.0...|0.332| 0.3440041794465851|
|[531.0,491.0,89.0...| 0.32|0.32921871837152394|
|[534.0,609.0,69.0...|0.329| 0.3317527587942838|
|[536.0,531.0,83.0...|0.318|0.32748462746243306|
|[543.0,615.0,76.0...|0.333|0.32809296079576644|
|[550.0,631.0,76.0...|0.318| 0.3331797254533557|
|[550.0,637.0,76.0...|0.326|0.32835924867455435|
+--------------------+-----+-------------------+
only showing top 10 rows



Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [120]:
# R2 value of the model on test data 

dt_evaluator_r2 = RegressionEvaluator(metricName='r2')

dt_r2 = dt_evaluator_r2.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE value of the model on test data 

dt_evaluator_rmse = RegressionEvaluator(metricName='rmse')
dt_rmse = dt_evaluator_rmse.evaluate(model_predictions)
print(f'Root Mean Squared Error (RMSE) is = {dt_rmse}')

The r-square value of DecisionTreeRegressor is 0.8443875102996787
Root Mean Squared Error (RMSE) is = 0.013629303662658293


## Gradient-Boosted Tree Regressor

Importamos a GBTRegressor


In [121]:
# import

from pyspark.ml.regression import GBTRegressor

Creamos el Regresor GBTR, le llamaremos gbt:


In [122]:
# regresor

gbt = GBTRegressor(featuresCol = 'features', labelCol = 'label', maxIter=10)

Entrenamos el modelo:

In [123]:
# Train model, le llamaremos gbt_model

gbt_model = gbt.fit(train)

Desplegamos las featureImportances:

In [124]:
#Importances

gbt_model.featureImportances

SparseVector(5, {0: 0.2845, 1: 0.1716, 2: 0.1575, 3: 0.199, 4: 0.1874})

Evaluamos el modelo con los datos de entrenamiento, le llamaremos model_predictions:

In [126]:
# Model

model_predictions = gbt_model.transform(test)
model_predictions.select('prediction', 'label', 'features').show(5)


+-------------------+-----+--------------------+
|         prediction|label|            features|
+-------------------+-----+--------------------+
|0.31843026962538284|0.311|[463.0,527.0,67.0...|
| 0.3188094205571148|0.301|[464.0,640.0,66.0...|
|0.31843026962538284|0.319|[470.0,509.0,76.0...|
|0.33622875032550487|0.332|[519.0,595.0,73.0...|
| 0.3180430634800169| 0.32|[531.0,491.0,89.0...|
+-------------------+-----+--------------------+
only showing top 5 rows



Desplegamos los valores del *model_predictions*

In [127]:
# show 

model_predictions.show()

+--------------------+-----+-------------------+
|            features|label|         prediction|
+--------------------+-----+-------------------+
|[463.0,527.0,67.0...|0.311|0.31843026962538284|
|[464.0,640.0,66.0...|0.301| 0.3188094205571148|
|[470.0,509.0,76.0...|0.319|0.31843026962538284|
|[519.0,595.0,73.0...|0.332|0.33622875032550487|
|[531.0,491.0,89.0...| 0.32| 0.3180430634800169|
|[534.0,609.0,69.0...|0.329|0.31910375032550486|
|[536.0,531.0,83.0...|0.318|0.31392511707777176|
|[543.0,615.0,76.0...|0.333|0.32088645717895425|
|[550.0,631.0,76.0...|0.318|0.31862808540785575|
|[550.0,637.0,76.0...|0.326|0.32088645717895425|
|[550.0,789.0,54.0...|0.359|0.33517262279154536|
|[555.0,741.0,54.0...|0.348|0.34418099456264384|
|[556.0,674.0,62.0...|0.348|  0.349577277699972|
|[559.0,613.0,75.0...|0.359|0.35064360719033105|
|[567.0,587.0,84.0...|0.349|0.35831576454020136|
|[569.0,620.0,77.0...|0.349|0.35262139541317317|
|[570.0,786.0,57.0...|0.366|0.35368040900400344|
|[571.0,577.0,83.0..

Usando RegressionEvaluator calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [130]:
 #Select (prediction, true label) and compute test error
# R2 value of the model on test data 

gbt_evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    
rmse = gbt_evaluator_r2.evaluate(model_predictions)
print("R2 on test data = %g" % rmse)


# RMSE value of the model on test data 

gbt_evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    
rmse = gbt_evaluator_rmse.evaluate(model_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)




R2 on test data = 0.853636
Root Mean Squared Error (RMSE) on test data = 0.0132181


 ## Exploracion de datos...

Usaremos el dataset https://archive.ics.uci.edu/ml/datasets/Bank+Marketing 

Indique a grandes razgos de que se trata este dataset:


El dataet contiene datos relacionados con campañas publicitarias realizada vía telefónica, para una institución bancaria portuguesa. El objetivo principal de la clasificación es predecir si el cliente se suscribe o no a depósitos a plazo (variable target_class).
En el link dice que contiene 17 atributos, sin embargo al ver el dataset tiene 20, algunos de ellos son: edad, tipo de trabajo, estado civil, nivel de educación, tiene crédito en incumplimiento, tiene préstamo de vivienda, tiene préstamo personal, tipo de contacto, último mes que se le contacto, día de la semana del último contacto, duración en segundos del último contacto y otros.


Carga de datos, archivo bank_data.csv:


In [131]:
# Load csv Dataset 
df=spark.read.csv('bank_data.csv',inferSchema=True,header=True)

Determine la cantidad de datos en el dataset:

In [132]:
#number of records

print("Cantidad de registros: ", df.count(), " y la cantidad de columnas: ", len(df.columns)) 

Cantidad de registros:  41188  y la cantidad de columnas:  21


A que dato corresponde cada columna?

In [135]:
# columns values

df.describe()

DataFrame[summary: string, age: string, job: string, marital: string, education: string, default: string, housing: string, loan: string, contact: string, month: string, day_of_week: string, duration: string, campaign: string, pdays: string, previous: string, poutcome: string, emp.var.rate: string, cons.price.idx: string, cons.conf.idx: string, euribor3m: string, nr.employed: string, target_class: string]

In [136]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('month', 'string'),
 ('day_of_week', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('emp.var.rate', 'double'),
 ('cons.price.idx', 'double'),
 ('cons.conf.idx', 'double'),
 ('euribor3m', 'double'),
 ('nr.employed', 'double'),
 ('target_class', 'string')]

In [137]:
df.take(1)

[Row(age=56, job='housemaid', marital='married', education='basic.4y', default='no', housing='no', loan='no', contact='telephone', month='may', day_of_week='mon', duration=261, campaign=1, pdays=999, previous=0, poutcome='nonexistent', emp.var.rate=1.1, cons.price.idx=93.994, cons.conf.idx=-36.4, euribor3m=4.857, nr.employed=5191.0, target_class='no')]

Imprima el Schema:

In [138]:
#dataype of input data - Schema

df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- target_class: string (nullable = true)



En cuanto a la salida, como es la distrubución de clases?

In [139]:
# YES/NO Class Distribution

df.groupBy('target_class').count().show()

+------------+-----+
|target_class|count|
+------------+-----+
|          no|36548|
|         yes| 4640|
+------------+-----+



Una tarea típica, resulta de convertir los valores binarios en 1 y 0, usando como referencia "label", convierta los no/yes en 0/1:

In [141]:
from pyspark.sql import functions as F
from pyspark.sql import *
from pyspark.sql.functions import regexp_replace,col

In [142]:
# Ingrese acá la instrucción: 

df_Nuevo = df.withColumn('target_class', regexp_replace('target_class', 'no', '0'))
df_Nuevo = df_Nuevo.withColumn('target_class', regexp_replace('target_class', 'yes', '1'))


In [143]:
# New 1/0 Class Distribution

df_Nuevo.groupBy('target_class').count().show()

+------------+-----+
|target_class|count|
+------------+-----+
|           0|36548|
|           1| 4640|
+------------+-----+



A continuación se presenta un ejercicio de Deep Learning para su revisión...

# Deep Learning 

Importamos las librerias necesarias:

In [72]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer

Inicializamos la sesion SPARK:

In [73]:
spark = SparkSession.builder.appName('deep_learning').getOrCreate()

Leemos el dataset:

In [74]:
data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)

In [75]:
data.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- Orders_Normalized: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



Renombramos la columna TARGET:

In [76]:
data = data.withColumnRenamed('Orders_Normalized', 'label')

In [77]:
data.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



Partimos lo datos en Train, Validation y Test:

In [79]:
train, validation, test  = data.randomSplit([0.7, 0.2, 0.1], 1234)

Construimos el Pipeline

In [80]:
categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]
numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]

indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]

featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol="features")

layers = [len(featuresCreator.getInputCols()), 4, 2, 2]

classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)

pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])

Entrenamos...

In [81]:
model = pipeline.fit(train)

Validamos y Evaluamos

In [83]:
train_output_df = model.transform(train)
validation_output_df = model.transform(validation)
test_output_df = model.transform(test)

Llevamos a cabo, algunas predicciones:

In [84]:
train_predictionAndLabels = train_output_df.select("prediction", "label")
validation_predictionAndLabels = validation_output_df.select("prediction", "label")
test_predictionAndLabels = test_output_df.select("prediction", "label")

metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']

for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))
    print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))
    print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))

Train weightedPrecision = 0.9713477415003594
Validation weightedPrecision = 0.9725121859664543
Test weightedPrecision = 0.9747121498465416
Train weightedRecall = 0.9707451988625297
Validation weightedRecall = 0.9718157181571816
Test weightedRecall = 0.974272173324306
Train accuracy = 0.9707451988625297
Validation accuracy = 0.9718157181571816
Test accuracy = 0.974272173324306


Puede mejorar el test accuracy del modelo variando alguno de los hyperparametros?

Correcto, esto es posible si se modifican los valores de algunos hyerparametros como el block size, las iteraciones y los layers.