# Definição do problema: Prever se um cliente está satisfeito

In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.7


In [2]:
# Importa o findspark e inicializa
import findspark
findspark.init()

In [3]:
# Imports
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from imblearn.over_sampling import SMOTE

In [4]:
# Criando o Spark Context
sc = SparkContext(appName = "prjt3")

In [5]:
sc.setLogLevel("ERROR")

# Criando a sessão
spark = SparkSession.builder.getOrCreate()
spark

## Carregando o Dataset

In [6]:
# Carrega os dados
dados = spark.read.csv('train.csv', inferSchema = True, header = True)

In [7]:
# Número de registros
dados.count()

76020

In [8]:
# Visualiza os dados no padrão do Spark DataFrame
dados.show(10)

+---+----+-----+------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------+-----------------------+-----------------------+-----------------+------------------+----------+--------+----------+--------+----------+--------+----------+--------+----------+--------+-----------+---------+-----------+-----------------+---------------+-----------------+---------------+-----------------+---------------+---------+-----------+---------+-----------+---------+-----------+---------+---------+-----------+---------+-----------+---------+-------------+-----------+-------------+---------+-----------+---------+-----------+-----------+---------+---------+-----------+---------+-----------+---------+-----------+---------+-------------+-----------+---------+-----------+--------

In [9]:
# Visualiza os dados no formato do Pandas
dados.limit(10).toPandas()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
5,13,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87975.75,0
6,14,2,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94956.66,0
7,18,2,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,251638.95,0
8,20,2,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101962.02,0
9,23,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,356463.06,0


In [10]:
# Schema
dados.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- var3: integer (nullable = true)
 |-- var15: integer (nullable = true)
 |-- imp_ent_var16_ult1: double (nullable = true)
 |-- imp_op_var39_comer_ult1: double (nullable = true)
 |-- imp_op_var39_comer_ult3: double (nullable = true)
 |-- imp_op_var40_comer_ult1: double (nullable = true)
 |-- imp_op_var40_comer_ult3: double (nullable = true)
 |-- imp_op_var40_efect_ult1: double (nullable = true)
 |-- imp_op_var40_efect_ult3: double (nullable = true)
 |-- imp_op_var40_ult1: double (nullable = true)
 |-- imp_op_var41_comer_ult1: double (nullable = true)
 |-- imp_op_var41_comer_ult3: double (nullable = true)
 |-- imp_op_var41_efect_ult1: double (nullable = true)
 |-- imp_op_var41_efect_ult3: double (nullable = true)
 |-- imp_op_var41_ult1: double (nullable = true)
 |-- imp_op_var39_efect_ult1: double (nullable = true)
 |-- imp_op_var39_efect_ult3: double (nullable = true)
 |-- imp_op_var39_ult1: double (nullable = true)
 |-- imp_sal_var16_ult1: dou

In [11]:
# Separamos os dados ausentes (se existirem) e removemos (se existirem)
dados_com_linhas_removidas = dados.na.drop()
print('Número de linhas antes de remover valores ausentes:', dados.count())
print('Número de linhas após remover valores ausentes:', dados_com_linhas_removidas.count())

Número de linhas antes de remover valores ausentes: 76020
Número de linhas após remover valores ausentes: 76020


In [12]:
dados = dados.drop(dados.ID)

In [13]:
dados.groupBy("var3").count().orderBy("var3").show()

+-------+-----+
|   var3|count|
+-------+-----+
|-999999|  116|
|      0|   75|
|      1|  105|
|      2|74165|
|      3|  108|
|      4|   86|
|      5|   63|
|      6|   82|
|      7|   97|
|      8|  138|
|      9|  110|
|     10|   72|
|     11|   66|
|     12|   85|
|     13|   98|
|     14|   61|
|     15|   34|
|     16|    9|
|     17|    7|
|     18|   10|
+-------+-----+
only showing top 20 rows



In [14]:
dados.groupBy("var36").count().show()

+-----+-----+
|var36|count|
+-----+-----+
|    1|14664|
|    3|22177|
|    2| 8704|
|   99|30064|
|    0|  411|
+-----+-----+



In [15]:
dados.groupBy("var21").count().show()

+-----+-----+
|var21|count|
+-----+-----+
| 3000|   84|
| 1500|   31|
| 1800|  206|
| 4500|   96|
| 3300|    2|
|  900|  236|
| 1200|   12|
| 7200|   62|
| 3600|   52|
| 6000|   27|
| 9000|   14|
| 2400|    3|
|10500|    1|
|    0|75152|
| 2700|   26|
| 5100|    2|
| 7500|    1|
| 6600|    1|
| 2100|    2|
| 5400|    4|
+-----+-----+
only showing top 20 rows



In [16]:
dados.groupBy("var38").count().show()

+---------+-----+
|    var38|count|
+---------+-----+
|108845.91|    1|
| 38272.44|    1|
|138101.94|    1|
| 23894.97|    1|
|258941.73|    1|
| 94576.56|    1|
| 60170.88|    3|
|  38587.5|    1|
| 42821.79|    1|
|153463.83|    1|
|111146.04|    1|
|266319.51|    2|
| 72774.12|    1|
| 43179.75|    1|
|134509.71|    1|
|133824.78|    2|
| 76321.14|    1|
|103554.93|    1|
|198463.14|    1|
|256363.47|    1|
+---------+-----+
only showing top 20 rows



In [17]:
# Lista de variáveis de entrada (todas menos a última)
variaveis_entrada = dados.columns[:-1]

In [18]:
assembler = VectorAssembler().setInputCols(variaveis_entrada).setOutputCol("features")
output = assembler.transform(dados).select("features", 'TARGET')
scaler = StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(True).setWithMean(False)
scalerModel = scaler.fit(output)
scaledData = scalerModel.transform(output).select("scaledFeatures", 'TARGET')
pca = PCA().setInputCol("scaledFeatures").setOutputCol("pcaFeatures").setK(5).fit(scaledData)
pcaDados = pca.transform(scaledData).select("pcaFeatures", 'TARGET')
pcaDados.show()
pcaDados.head(1)

+--------------------+------+
|         pcaFeatures|TARGET|
+--------------------+------+
|[0.51318271399369...|     0|
|[-1.2006486738358...|     0|
|[-0.1756015041533...|     0|
|[-6.6039314918786...|     0|
|[-4.2421084796185...|     0|
|[0.51297389903558...|     0|
|[-0.3692703297821...|     0|
|[-0.3065398322083...|     0|
|[0.42934077498377...|     0|
|[0.20745946968436...|     0|
|[-11.395458744975...|     0|
|[0.50163152683831...|     0|
|[-0.4421819178737...|     0|
|[-4.9585149488386...|     0|
|[-34.366102122896...|     0|
|[-0.2485399413801...|     0|
|[0.98990963904744...|     0|
|[-4.7700059810275...|     0|
|[-2.4970563333163...|     0|
|[-0.3271070611871...|     0|
+--------------------+------+
only showing top 20 rows



[Row(pcaFeatures=DenseVector([0.5132, -1.7794, -0.4578, -1.0572, 0.704]), TARGET=0)]

In [19]:
# Lista a correlação entre os atributos e a variável alvo
for item in (Correlation.corr(pcaDados, 'pcaFeatures', 'pearson').collect()[0][0].toArray()):
    print(item[4])

-9.500976341397068e-15
4.3099475079319745e-15
-2.8113798255344962e-15
6.369661806886836e-15
1.0


In [20]:
dados_pandas = pcaDados.toPandas()
dados_pandas.TARGET.value_counts()

0    73012
1     3008
Name: TARGET, dtype: int64

In [21]:
# Como não identifiquei uma forma de balancear a variável alvo, irei refazer o processo em pandas.
dados_pandas = dados.toPandas()
dados_pandas

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000,0
1,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000,0
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000,0
3,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000,0
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,2,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.490000,0
76016,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.520000,0
76017,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.150000,0
76018,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.160000,0


In [22]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import pandas as pd

# Cria um objeto separado para a variável target
y_treino = dados_pandas['TARGET']
# Cria um objeto separadado para as variáveis de entrada
X_treino = dados_pandas.drop('TARGET', axis = 1)

#Aplica a técnica de oversampling e aumentar o número de exemplos da classe minoritária
over_sampler = SMOTE(k_neighbors = 2)
X_treino, y_treino = over_sampler.fit_resample(X_treino, y_treino)

#Aplicar a técnica de padronização dos dados
scaler = StandardScaler()
scaler.fit(X_treino)
X_treino = scaler.transform(X_treino)
#X_teste = scaler.transform(X_teste)

#Aplicar a técnica de redu;áo de dimensionalidade nos dados
pca = PCA(n_components=5)
X_treino = pca.fit_transform(X_treino)

#Remonta o dataframe
principalDf = pd.DataFrame(data = X_treino)
X_treino_pd = pd.concat([principalDf, y_treino], axis = 1)

#Transforma em datafram do spark
X_treino = spark.createDataFrame(X_treino_pd)
X_treino.show()

+-------------------+--------------------+--------------------+--------------------+--------------------+------+
|                  0|                   1|                   2|                   3|                   4|TARGET|
+-------------------+--------------------+--------------------+--------------------+--------------------+------+
| -2.460122907558048| -1.2195762006232647| -0.1083713683735262|  0.0546572863865167| 0.38856540639440335|     0|
|-0.6750213142653857|  10.796764816713505| -5.7797769356133815| -3.0744935419535637|  6.0294777371185235|     0|
|-1.5755636793998269| -0.3435907738423141| 0.08532430813449997|  0.1594659909421124|-0.14342255970183604|     0|
|  5.273205417854175| -0.5935630887654062| 0.20728071577058757| -0.7882313271144347| -1.1220333285103865|     0|
|  2.568560568623748|   10.75278098035109|   5.211735960471619|   5.772569742850093| -13.932179970656524|     0|
|-2.4595809655606726| -1.2156867484565685|-0.10742386841984262| 0.05503693282121379|  0.38906091

In [23]:
# Lista de variáveis de entrada (todas menos a última)
assembler = VectorAssembler().setInputCols(X_treino.columns[:-1]).setOutputCol("features")
dados_finais = assembler.transform(X_treino).select("features", 'TARGET')
dados_finais = dados_finais.withColumnRenamed("TARGET","label")
dados_finais.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[-2.4601229075580...|    0|
|[-0.6750213142653...|    0|
|[-1.5755636793998...|    0|
|[5.27320541785417...|    0|
|[2.56856056862374...|    0|
|[-2.4595809655606...|    0|
|[-1.3649054829003...|    0|
|[-1.4307871583069...|    0|
|[-2.4047227142080...|    0|
|[-1.9528862685650...|    0|
|[9.9823295507198,...|    0|
|[-2.4522615804606...|    0|
|[-1.3424736666728...|    0|
|[3.33740342667152...|    0|
|[32.9945359771718...|    0|
|[-1.5194412232662...|    0|
|[-2.9661191051233...|    0|
|[3.31944054412200...|    0|
|[0.91122239275092...|    0|
|[-1.4004325595703...|    0|
+--------------------+-----+
only showing top 20 rows



In [24]:
# Divisão com proporção 70/30
dados_treino, dados_teste = dados_finais.randomSplit([0.7,0.3])

In [25]:
# Módulo de Machine Learning
def func_modulo_ml(algoritmo_classificacao):

    # Função para obter o tipo do algoritmo de regressão e criar a instância do objeto
    # Usaremos isso para automatizar nosso processo
    def func_tipo_algo(algo_classificacao):
        algoritmo = algo_classificacao
        tipo_algo = type(algoritmo).__name__
        return tipo_algo
    
    # Aplica a função anterior
    tipo_algo = func_tipo_algo(algoritmo_classificacao)

    # Se o algoritmo for regressão logística, entramos neste bloco if
    if tipo_algo == "LogisticRegression":
        
        # Treinamos a primeira versão do modelo sem validação cruzada
        modelo = classificador.fit(dados_treino)
        
        # Métricas do modelo
        print('\033[1m' + "Modelo de Regressão Linear Sem Validação Cruzada:" + '\033[0m')
        print("")
        
        # Avalia o modelo com dados de teste
        resultado_teste = modelo.evaluate(dados_teste)        
        
        # Imprime as métricas de erro do modelo com dados de teste        
        print("AUC em Teste: {}".format(resultado_teste.areaUnderROC))
        print("")
        
        # Agora vamos criar a segunda versão do modelo com mesmo algoritmo, mas usando validação cruzada
        
        # Prepara o grid de hiperparâmetros
        paramGrid = (ParamGridBuilder()
             .addGrid(classificador.regParam, [0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(classificador.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
             .addGrid(classificador.maxIter, [1, 5, 10, 20, 50])
             .build())
        
        # Cria os avaliadores
        eval_auc = BinaryClassificationEvaluator(metricName = "areaUnderROC")
        
        # Cria o Cross Validator
        crossval = CrossValidator(estimator = classificador,
                                  estimatorParamMaps = paramGrid,
                                  evaluator = eval_auc,
                                  numFolds = 5)
        
        print('\033[1m' + "Modelo de Regressão Linear Com Validação Cruzada:" + '\033[0m')
        print("")
        
        # Treina o modelo com validação cruzada
        modelo = crossval.fit(dados_treino)
        
        # Salva o melhor modelo da versão 2
        global LR_BestModel
        LR_BestModel = modelo.bestModel
                
        # Previsões com dados de teste
        previsoes = LR_BestModel.transform(dados_teste)
        
        # Avaliação do melhor modelo
        resultado_teste_auc = eval_auc.evaluate(previsoes)
        print('AUC em Teste:', resultado_teste_auc)
    
        # Lista de colunas para colocar no dataframe de resumo
        colunas = ['Classificador', 'AUC_Score']
        
        # Formata os resultados e cria o dataframe
        
        # Formata as métricas e nome do algoritmo
        auc_str = [str(resultado_teste_auc)]
        tipo_algo = [tipo_algo] 
        
        # Cria o dataframne
        df_resultado = spark.createDataFrame(zip(tipo_algo, auc_str), schema = colunas)
        
        # Grava os resultados no dataframe
        df_resultado = df_resultado.withColumn('AUC_Score', df_resultado.AUC_Score.substr(0, 5))
        
        return df_resultado

    else:
        
        # Verificamos se o algoritmo é o Decision Tree e criamos o grid de hiperparâmetros
        if tipo_algo in("DecisionTreeClassifier"):
            paramGrid = (ParamGridBuilder()
             .addGrid(classificador.maxDepth, [2, 5, 10, 20, 30])
             .addGrid(classificador.maxBins, [10, 20, 40, 80, 100])
             .build())

        # Verificamos se o algoritmo é o Random Forest e criamos o grid de hiperparâmetros
        if tipo_algo in("RandomForestClassifier"):
            paramGrid = (ParamGridBuilder()
             .addGrid(classificador.maxDepth, [2, 5, 10, 20, 30])
             .addGrid(classificador.maxBins, [10, 20, 40, 80, 100])
             .addGrid(classificador.numTrees, [5, 20, 50, 100, 500])
             .build())

        # Verificamos se o algoritmo é o GBT e criamos o grid de hiperparâmetros
        if tipo_algo in("GBTClassifier"):
            paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [10,30,60]) \
    .addGrid(gbt.maxDepth, [6, 8,12]) \
    .build()

        # Cria os avaliadores
        eval_auc = BinaryClassificationEvaluator(metricName = "areaUnderROC")
        
        # Prepara o Cross Validator
        crossval = CrossValidator(estimator = classificador,
                                  estimatorParamMaps = paramGrid,
                                  evaluator = eval_auc,
                                  numFolds = 3) 
        
        # Treina o modelo usando validação cruzada
        modelo = crossval.fit(dados_treino)
        
        # Extrai o melhor modelo
        BestModel = modelo.bestModel

        # Resumo de cada modelo
        
        # Métricas do modelo
        if tipo_algo in("DecisionTreeClassifier"):
            
            # Variável global
            global DT_BestModel 
            DT_BestModel = modelo.bestModel
            
            # Previsões com dados de teste
            previsoes_DT = DT_BestModel.transform(dados_teste)
            
            print('\033[1m' + "Modelo Decision Tree Com Validação Cruzada:" + '\033[0m')
            print(" ")
            
            # Avaliação do modelo
            resultado_teste_auc = eval_auc.evaluate(previsoes_DT)
            print('AUC em Teste:', resultado_teste_auc)
        
        # Métricas do modelo
        if tipo_algo in("RandomForestClassifier"):
            
            # Variável global
            global RF_BestModel 
            RF_BestModel = modelo.bestModel
            
            # Previsões com dados de teste
            previsoes_RF = RF_BestModel.transform(dados_teste)
            
            print('\033[1m' + "Modelo RandomForest Com Validação Cruzada:" + '\033[0m')
            print(" ")
            
            # Avaliação do modelo
            resultado_teste_auc = eval_auc.evaluate(previsoes_RF)
            print('AUC em Teste:', resultado_teste_auc)
        
        # Métricas do modelo
        if tipo_algo in("GBTClassifier"):

            # Variável global
            global GBT_BestModel 
            GBT_BestModel = modelo.bestModel
            
            # Previsões com dados de teste
            previsoes_GBT = GBT_BestModel.transform(dados_teste)
            
            print('\033[1m' + "Modelo Gradient Boosted Tree (GBT) Com Validação Cruzada:" + '\033[0m')
            print(" ")
            
            # Avaliação do modelo
            resultado_teste_auc = eval_auc.evaluate(previsoes_GBT)
            print('AUC em Teste:', resultado_teste_auc)

        # Lista de colunas para colocar no dataframe de resumo
        colunas = ['Classificador', 'AUC_Score']
        
        # Faz previsões com dados de teste
        previsoes = modelo.transform(dados_teste)
        
        # Cria os avaliadores
        eval_auc = BinaryClassificationEvaluator(metricName = "areaUnderROC")
        auc = eval_auc.evaluate(previsoes)
        auc_str = [str(auc)]

        tipo_algo = [tipo_algo] 
        
        # Cria o dataframe
        df_resultado = spark.createDataFrame(zip(tipo_algo, auc_str), schema = colunas)
        
        # Grava o resultado no dataframe
        df_resultado = df_resultado.withColumn('AUC_Score', df_resultado.AUC_Score.substr(0, 5))
        
        return df_resultado

In [26]:
# Lista de algoritmos
classificadores = [LogisticRegression(),
               DecisionTreeClassifier(),
               RandomForestClassifier(),
               GBTClassifier()] 

# Lista de colunas e valores
colunas = ['Classificador', 'AUC_Score']
valores = [("N/A", "N/A")]

# Prepara a tabela de resumo
df_resultados_treinamento = spark.createDataFrame(valores, colunas)

# Loop de treinamento
for classificador in classificadores:
    
    # Para cada classificador obtém o resultado
    resultado_modelo = func_modulo_ml(classificador)
    
    # Grava os resultados
    df_resultados_treinamento = df_resultados_treinamento.union(resultado_modelo)

# Retorna as linhas diferentes de N/A
df_resultados_treinamento = df_resultados_treinamento.where("Classificador!='N/A'")

# Imprime
df_resultados_treinamento.show(10, False)

[1mModelo de Regressão Linear Sem Validação Cruzada:[0m

AUC em Teste: 0.7067277331545622

[1mModelo de Regressão Linear Com Validação Cruzada:[0m

AUC em Teste: 0.7127580292090944
[1mModelo Decision Tree Com Validação Cruzada:[0m
 
AUC em Teste: 0.82630459426402


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 62835)
Traceback (most recent call last):
  File "C:\Users\Yoh\anaconda3\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\Yoh\anaconda3\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\Yoh\anaconda3\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\Yoh\anaconda3\lib\socketserver.py", line 747, in __init__
    self.handle()
  File "C:\Users\Yoh\anaconda3\lib\site-packages\pyspark\accumulators.py", line 281, in handle
    poll(accum_updates)
  File "C:\Users\Yoh\anaconda3\lib\site-packages\pyspark\accumulators.py", line 253, in poll
    if func():
  File "C:\Users\Yoh\anaconda3\lib\site-packages\pyspark\accumulators.py", line 257, in accum_updates
    num_updates 

ConnectionRefusedError: [WinError 10061] Nenhuma conexão pôde ser feita porque a máquina de destino as recusou ativamente