In [97]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
import findspark
import matplotlib.pyplot as pl
import pandas as pd

In [2]:

findspark.init('/usr/lib/python3.7/site-packages/pyspark')

spark = SparkSession \
    .builder \
    .appName("Basic JDBC pipeline") \
    .config("spark.driver.extraClassPath", "postgresql-42.2.14.jar") \
    .config("spark.executor.extraClassPath", "postgresql-42.2.14.jar") \
    .getOrCreate()

ps: unrecognized option: p
BusyBox v1.30.1 (2019-10-26 11:23:07 UTC) multi-call binary.

Usage: ps [-o COL1,COL2=HEADER]

Show list of processes

	-o COL1,COL2=HEADER	Select columns for display
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/11 23:04:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [11]:
## Se carga la base de datos de construccion
construccion_df = spark.read.csv("Base_Anonimizada2022.csv",header=True,inferSchema=True)

##Se carga la base de datos que mapea los cantones con su respectivo codigo y se limpia

cantones_df =spark.read.csv("SEN_GEOGRAFICO_1.csv",header=True,inferSchema=True)

##Se carga la base de datos que mapea cada canton con la region

regiones_df = spark.read.csv("division_territorial_por_region.csv",header=True,inferSchema=True)

##Se carga la base de datos de ENAHO

enaho_2022_df = spark.read.csv("BdBasePublica.csv",header=True,inferSchema=True)

                                                                                

In [21]:
lowercase_columns = [col_name for col_name in enaho_2022_df.columns if col_name.islower()]


enaho_2022_df_1 = enaho_2022_df.select("ID_HOGAR","LINEA","REGION","V2A",*lowercase_columns)


In [13]:
def construcciones_region_df_func(constru_df,canton_df,region_df):

    construccion_residencial_df = constru_df.filter(constru_df.claobr==1)
    construccion_residencial_df = construccion_residencial_df.select("pro_num_prov","pc_num_cant","num_obras","arecon","numviv","numapo","numdor","valobr")
    construccion_residencial_agrupada_df = construccion_residencial_df.groupby("pro_num_prov","pc_num_cant").agg(F.avg("num_obras").alias("pro_num_obras"),F.avg("arecon").alias("prom_arecon"),
                                                                                                            F.avg("numviv").alias("prom_numviv"),F.avg("numapo").alias("prom_numapo"),
                                                                                                            F.avg("numdor").alias("prom_numdor"),F.avg("valobr").alias("prom_valobr"))
    
    cantones_codigo_df = canton_df.withColumn("Codigo_DTA",F.split(canton_df["CodigoDTA"],",")[0])\
                                .withColumn("Canton",F.split(canton_df["CodigoDTA"],",")[1]).drop("CodigoDTA","Nombre")
    
    for column in cantones_codigo_df.columns:
        cantones_codigo_df = cantones_codigo_df.withColumn(column,F.regexp_replace(column,'"',''))

    regiones_limpio_df = region_df.withColumn("Codigo_DTA",F.substring("CODIGO",1,3)).select("Codigo_DTA","CANTON","REGION").distinct()

    construccion_cantones_df=construccion_residencial_agrupada_df.join(cantones_codigo_df,construccion_residencial_agrupada_df["pc_num_cant"]==cantones_codigo_df["Codigo_DTA"],
                                                                   how="inner").drop("pc_num_cant","pro_num_prov")
    
    construccion_regiones_df=construccion_cantones_df.join(regiones_limpio_df,on="Codigo_DTA",how="left").drop("Canton","CANTON","Codigo_DTA")

    columnas_promedio = [col for col in construccion_regiones_df.columns if col!="REGION"]

    construccion_regiones_agrupada_df = construccion_regiones_df.groupby("REGION").agg(*(F.avg(col).alias("reg_"+col) for col in columnas_promedio))

    construccion_regiones_agrupada_df=construccion_regiones_agrupada_df.withColumn("Codigo_Region",
    F.when(construccion_regiones_agrupada_df["REGION"]=="CENTRAL", 1)
    .when(construccion_regiones_agrupada_df["REGION"]=="CHOROTEGA", 2)
    .when(construccion_regiones_agrupada_df["REGION"]=="PACIFICO CENTRAL", 3)
    .when(construccion_regiones_agrupada_df["REGION"]=="BRUNCA", 4)
    .when(construccion_regiones_agrupada_df["REGION"]=="HUETAR CARIBE", 5)
    .when(construccion_regiones_agrupada_df["REGION"]=="HUETAR NORTE", 6))

    return construccion_regiones_agrupada_df

In [14]:
construccion_prom_region_2022 = construcciones_region_df_func(construccion_df,cantones_df,regiones_df)

construccion_prom_region_2022.show()

+----------------+------------------+------------------+------------------+-----------------+------------------+--------------------+-------------+
|          REGION| reg_pro_num_obras|   reg_prom_arecon|   reg_prom_numviv|  reg_prom_numapo|   reg_prom_numdor|     reg_prom_valobr|Codigo_Region|
+----------------+------------------+------------------+------------------+-----------------+------------------+--------------------+-------------+
|    HUETAR NORTE|1.0503136153898356| 71.42357793198545|0.9486050771143969|4.714806704849421| 2.172745293336891|1.9334500496198073E7|            6|
|   HUETAR CARIBE|1.1466133138681152|59.878152501614835|0.9699997159750016| 3.98014685977476|1.8833681343661388|1.5375398208581805E7|            5|
|         CENTRAL|1.3386748779531557|133.60794488188338|1.0805488324926336|6.747149989782027|2.8766782482438202| 4.539048286811655E7|            1|
|          BRUNCA|1.0385502365644046|  69.4399004299654|0.9254513138431006| 4.61359800403076| 2.096031027513646|

In [24]:
def enaho_func(enaho_df):

    "La variable 'Tenencia de Viviennda' contiene 5 categorias. Para efectos de este trabajo se agrupan en solo 2. Casa Propia (1) Casa que no es propia (0)"
    enaho_2022_variables_binario_df = enaho_df.withColumn("Tenencia_Vivienda", F.when(enaho_df.V2A.isin([1,2]),1).otherwise(0)).drop("ID_HOGAR","V2A")

    '''Los datos de ENAHO vienen a nivel de hogar y nivel individual. Para el siguiente trabajo nos interesa utilizar las variables a nivel de hogar y agrupar aquellas que vienen a nivel individual.
    Para esto se crea un identificador a nivel de cada hogar y se grupan las variables a nivel individual'''

    windowSpec = Window.orderBy(F.monotonically_increasing_id()).rowsBetween(Window.unboundedPreceding, 0)

    enaho_2022_variables_binario_df= enaho_2022_variables_binario_df.withColumn("id", F.sum(F.when(F.col("LINEA") == 1, 1).otherwise(0)).over(windowSpec)).drop("LINEA")
  
    enaho_2022_hogar_agr_df = enaho_2022_variables_binario_df.groupBy("id","REGION","Tenencia_Vivienda")\
    .avg(*[c for c in enaho_2022_variables_binario_df.columns if c not in ["id", "REGION","Tenencia_Vivienda"]])
                                                                                                                              
    enaho_2022_hogar_renombrado_df = enaho_2022_hogar_agr_df.withColumnRenamed("REGION","Region_Geo")

    
    return enaho_2022_hogar_renombrado_df

In [25]:
enaho_2022 = enaho_func(enaho_2022_df_1)

enaho_2022.show()

24/05/11 23:42:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:42:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:42:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:42:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 31:>                                                         (0 + 1) / 1]

+---+----------+-----------------+---------+--------+---------+--------+----------+------------+----------+---------+-----------+---------+---------+---------+--------+---------+--------+---------+----------+----------+----------+-----------+----------+---------+---------+----------+---------+--------+----------+---------+---------+---------+--------+--------+-------+---------+-----------+----------+----------+-------+--------+--------+---------+----------+---------+--------+---------+---------+--------+---------+---------+-------+--------+-------+--------+----------+---------+---------+-----------+----------+----------+----------+----------+----------+---------+---------+---------+---------+---------+---------+---------+----------+----------+--------+--------+-----------------+-----------------+--------+--------+---------+---------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+------

                                                                                

In [26]:
def unir_datos(enaho_df,construccion_df):

    tenencia_vivienda = enaho_df.join(construccion_df,enaho_df["Region_Geo"]==construccion_df["Codigo_Region"],how='left')
    tenencia_vivienda = tenencia_vivienda.drop('id','Region_Geo','Codigo_Region','REGION')                                                      
    return tenencia_vivienda

In [27]:
tenencia_vivienda_df = unir_datos(enaho_2022,construccion_prom_region_2022)

tenencia_vivienda_df.show()

24/05/11 23:48:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:48:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:48:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:48:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:48:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:48:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 2

+-----------------+---------+--------+---------+--------+----------+------------+----------+---------+-----------+---------+---------+---------+--------+---------+--------+---------+----------+----------+----------+-----------+----------+---------+---------+----------+---------+--------+----------+---------+---------+---------+--------+--------+-------+---------+-----------+----------+----------+-------+--------+--------+---------+----------+---------+--------+---------+---------+--------+---------+---------+-------+--------+-------+--------+----------+---------+---------+-----------+----------+----------+----------+----------+----------+---------+---------+---------+---------+---------+---------+---------+----------+----------+--------+--------+-----------------+-----------------+--------+--------+---------+---------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------

                                                                                

In [28]:
def guardar_a_postgres(construccion,enaho,base_unida):
    construccion \
    .write \
    .format("jdbc") \
    .mode('overwrite') \
    .option("url", "jdbc:postgresql://host.docker.internal:5433/postgres") \
    .option("user", "postgres") \
    .option("password", "testPassword") \
    .option("dbtable", "construccion_df_") \
    .save()
    
    enaho \
    .write \
    .format("jdbc") \
    .mode('overwrite') \
    .option("url", "jdbc:postgresql://host.docker.internal:5433/postgres") \
    .option("user", "postgres") \
    .option("password", "testPassword") \
    .option("dbtable", "enaho_df_1") \
    .save()
    
    base_unida \
    .write \
    .format("jdbc") \
    .mode('overwrite') \
    .option("url", "jdbc:postgresql://host.docker.internal:5433/postgres") \
    .option("user", "postgres") \
    .option("password", "testPassword") \
    .option("dbtable", "tenencia_vivienda_df_1") \
    .save()

In [29]:
guardar_a_postgres(construccion_prom_region_2022,enaho_2022,tenencia_vivienda_df)

24/05/11 23:49:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:49:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:49:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:49:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:49:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 23:49:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/11 2

In [107]:
construccion_df = spark \
    .read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://host.docker.internal:5433/postgres") \
    .option("user", "postgres") \
    .option("password", "testPassword") \
    .option("dbtable", "construccion_df") \
    .load()

construccion_df.show()

+----------------+------------------+------------------+------------------+-----------------+------------------+--------------------+-------------+
|          REGION| reg_pro_num_obras|   reg_prom_arecon|   reg_prom_numviv|  reg_prom_numapo|   reg_prom_numdor|     reg_prom_valobr|Codigo_Region|
+----------------+------------------+------------------+------------------+-----------------+------------------+--------------------+-------------+
|    HUETAR NORTE|1.0503136153898356| 71.42357793198545|0.9486050771143969|4.714806704849421| 2.172745293336891|1.9334500496198073E7|            6|
|   HUETAR CARIBE|1.1466133138681152|59.878152501614835|0.9699997159750016| 3.98014685977476|1.8833681343661388|1.5375398208581805E7|            5|
|         CENTRAL|1.3386748779531557|133.60794488188338|1.0805488324926336|6.747149989782027|2.8766782482438202| 4.539048286811655E7|            1|
|          BRUNCA|1.0385502365644046|  69.4399004299654|0.9254513138431006| 4.61359800403076| 2.096031027513646|

In [108]:
enaho_df = spark \
    .read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://host.docker.internal:5433/postgres") \
    .option("user", "postgres") \
    .option("password", "testPassword") \
    .option("dbtable", "enaho_df") \
    .load()

enaho_df.show()

+---+----------+-----------------+-----------------+------------------+---------------------+----+-------------------+---------------------+-------------------------+
| id|Region_Geo|Tenencia_Vivienda|Cantidad_Personas|Cantidad_vehiculos|Cantidad_Computadoras|ZONA|suma_escolari_hogar|suma_horas_trab_hogar|Ingreso_Total_Bruto_Hogar|
+---+----------+-----------------+-----------------+------------------+---------------------+----+-------------------+---------------------+-------------------------+
|  1|       1.0|                1|              2.0|               1.0|                 null| 1.0|               12.0|                 null|                1029167.0|
|  2|       1.0|                1|              4.0|              null|                 null| 1.0|               46.0|                 47.5|                2448333.0|
|  3|       1.0|                0|              3.0|              null|                 null| 1.0|               30.0|                 48.0|                 494000.0

In [31]:
tenencia_vivienda_df = spark \
    .read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://host.docker.internal:5433/postgres") \
    .option("user", "postgres") \
    .option("password", "testPassword") \
    .option("dbtable", "tenencia_vivienda_df_1") \
    .load()

tenencia_vivienda_df.show()

+-----------------+---------+--------+---------+--------+----------+------------+----------+---------+-----------+---------+---------+---------+--------+---------+--------+---------+----------+----------+----------+-----------+----------+---------+---------+----------+---------+--------+----------+---------+---------+---------+--------+--------+-------+---------+-----------+----------+----------+-------+--------+--------+---------+----------+---------+--------+---------+---------+--------+---------+---------+-------+--------+-------+--------+----------+---------+---------+-----------+----------+----------+----------+----------+----------+---------+---------+---------+---------+---------+---------+---------+----------+----------+--------+--------+-----------------+-----------------+--------+--------+---------+---------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------

In [39]:
def fillna_mean(df, include=set()): 
    means = df.agg(*(
        F.mean(x).alias(x) for x in df.columns if x in include
    ))
    return df.fillna(means.first().asDict())

In [40]:
tenencia_vivienda_nonulls = fillna_mean(tenencia_vivienda_df,tenencia_vivienda_df.columns)

                                                                                

In [41]:
tenencia_vivienda_nonulls.show()

+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+----------+------------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+-----------------+-----------+-----------------+---------+---------+----------+---------+-----------------+------------------+-----------------+------------------+---------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+----------------+------------------+-----------------+------------------+-----------------+-----------------+---------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+---------------

In [170]:
#Se remplazan las observaciones nulas con 0
tenencia_vivienda_df_1 =tenencia_vivienda_df.fillna(0)

tenencia_vivienda_df_1.show()

print(tenencia_vivienda_df_1.columns)

+-----------------+-----------------+------------------+---------------------+----+-------------------+---------------------+-------------------------+------------------+------------------+------------------+-----------------+------------------+-------------------+
|Tenencia_Vivienda|Cantidad_Personas|Cantidad_vehiculos|Cantidad_Computadoras|ZONA|suma_escolari_hogar|suma_horas_trab_hogar|Ingreso_Total_Bruto_Hogar| reg_pro_num_obras|   reg_prom_arecon|   reg_prom_numviv|  reg_prom_numapo|   reg_prom_numdor|    reg_prom_valobr|
+-----------------+-----------------+------------------+---------------------+----+-------------------+---------------------+-------------------------+------------------+------------------+------------------+-----------------+------------------+-------------------+
|                1|              4.0|               2.0|                  2.0| 1.0|               50.0|                 44.0|                3550000.0|1.3386748779531557|133.60794488188338|1.08054883249

In [42]:
feature_columns = [col for col in tenencia_vivienda_nonulls.columns if col !='Tenencia_Vivienda']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
vector_df = assembler.transform(tenencia_vivienda_nonulls)
vector_df = vector_df.select(['features','Tenencia_Vivienda'])
vector_df.show()

+--------------------+-----------------+
|            features|Tenencia_Vivienda|
+--------------------+-----------------+
|[632512.505555555...|                1|
|[632512.505555555...|                1|
|[632512.505555555...|                0|
|[1500000.0,51866....|                1|
|[632512.505555555...|                1|
|[632512.505555555...|                1|
|[632512.505555555...|                1|
|[632512.505555555...|                0|
|[632512.505555555...|                0|
|[632512.505555555...|                0|
|[632512.505555555...|                1|
|[632512.505555555...|                1|
|[632512.505555555...|                0|
|[632512.505555555...|                1|
|[632512.505555555...|                0|
|[632512.505555555...|                0|
|[632512.505555555...|                0|
|[632512.505555555...|                1|
|[632512.505555555...|                0|
|[632512.505555555...|                0|
+--------------------+-----------------+
only showing top

                                                                                

In [43]:
standard_scaler = StandardScaler(inputCol='features', outputCol='columns_scaled')
scale_model = standard_scaler.fit(vector_df)

scaled_df = scale_model.transform(vector_df)

#Se muestran los resultados
vector_scaled_df = scaled_df.select(['columns_scaled','Tenencia_Vivienda'])
vector_scaled_df.show()
vector_scaled_df.printSchema()

                                                                                

+--------------------+-----------------+
|      columns_scaled|Tenencia_Vivienda|
+--------------------+-----------------+
|[9.85633252751066...|                1|
|[9.85633252751066...|                1|
|[9.85633252751066...|                0|
|[23.3742394994709...|                1|
|[9.85633252751066...|                1|
|[9.85633252751066...|                1|
|[9.85633252751066...|                1|
|[9.85633252751066...|                0|
|[9.85633252751066...|                0|
|[9.85633252751066...|                0|
|[9.85633252751066...|                1|
|[9.85633252751066...|                1|
|[9.85633252751066...|                0|
|[9.85633252751066...|                1|
|[9.85633252751066...|                0|
|[9.85633252751066...|                0|
|[9.85633252751066...|                0|
|[9.85633252751066...|                1|
|[9.85633252751066...|                0|
|[9.85633252751066...|                0|
+--------------------+-----------------+
only showing top

In [61]:
#Primer modelo (Regresion Logistica)

# Division en entrenamiento y prueba
train_data, test_data = vector_scaled_df.randomSplit([0.7, 0.3], seed=123)

lr = LogisticRegression(featuresCol="columns_scaled", labelCol="Tenencia_Vivienda")

#Parametros para validacion cruzada
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

#Validacion cruzada
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

#Entrenamiento del modelo
cv_model = cv.fit(train_data)

#Prediccion en datos de prueba
prediccion_lr = cv_model.transform(test_data)



                                                                                

In [64]:
evaluator = BinaryClassificationEvaluator(labelCol='Tenencia_Vivienda')
roc_lr = evaluator.evaluate(prediccion_lr)
print("Area Under ROC Curve:", area_under_curve)


Area Under ROC Curve: 0.6895022753128561


                                                                                

In [46]:
# Segundo Modelo (arbol de decision)

# Division en entrenamiento y prueba
train_data, test_data = vector_scaled_df.randomSplit([0.7, 0.3], seed=123)


dt = DecisionTreeClassifier(featuresCol="columns_scaled", labelCol="Tenencia_Vivienda")

#Parametros para validacion cruzada
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10]) \
    .addGrid(dt.maxBins, [20, 30]) \
    .build()



#Validacion cruzada
cv = CrossValidator(estimator=dt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

#Entrenamiento del modelo
cvModel = cv.fit(train_data)

#Prediccion en datos de prueba
prediccion_dt = cvModel.transform(test_data)



                                                                                

In [204]:
evaluator = BinaryClassificationEvaluator(labelCol='Tenencia_Vivienda')
roc_dt = evaluator.evaluate(prediccion_dt)
print(f"Area bajo la curva ROC: {roc_dt}")

Area bajo la curva ROC: 0.5696153330164866


In [111]:
# Metricas Primner Modelo
tp = prediccion_lr.filter((F.col('Tenencia_Vivienda') == 1) & (F.col('prediction') == 1)).count()
tn = prediccion_lr.filter((F.col('Tenencia_Vivienda') == 0) & (F.col('prediction') == 0)).count()
fp = prediccion_lr.filter((F.col('Tenencia_Vivienda') == 0) & (F.col('prediction') == 1)).count()
fn = prediccion_lr.filter((F.col('Tenencia_Vivienda') == 1) & (F.col('prediction') == 0)).count()

# accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
print(f"Accuracy: {accuracy}")

# precision
precision = tp / (tp + fp) if (tp + fp) != 0 else 0  
print(f"Precision: {precision}")

# recall
recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0  
print(f"Recall: {recall}")

# F1 measure
f1_measure = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0.0  
print(f"F1 measure: {f1_measure}")


                                                                                

Accuracy: 0.7440833070369202
Precision: 0.7492682926829268
Recall: 0.9829351535836177
F1 measure: 0.8503413914006274


In [112]:
#Metricas Segundo Modelo

tp = prediccion_dt.filter((F.col('Tenencia_Vivienda') == 1) & (F.col('prediction') == 1)).count()
tn = prediccion_dt.filter((F.col('Tenencia_Vivienda') == 0) & (F.col('prediction') == 0)).count()
fp = prediccion_dt.filter((F.col('Tenencia_Vivienda') == 0) & (F.col('prediction') == 1)).count()
fn = prediccion_dt.filter((F.col('Tenencia_Vivienda') == 1) & (F.col('prediction') == 0)).count()

# accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
print(f"Accuracy: {accuracy}")

# precision
precision = tp / (tp + fp) if (tp + fp) != 0 else 0  
print(f"Precision: {precision}")

# recall
recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0  
print(f"Recall: {recall}")

# measure
f1_measure = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0.0  
print(f"F1 measure: {f1_measure}")


                                                                                

Accuracy: 0.7349321552540233
Precision: 0.7605682605682605
Recall: 0.9364334470989761
F1 measure: 0.8393881453154876


                                                                                