# Definição do problema: Prever se um cliente está satisfeito

In [1]:
# Report the Python version this notebook is running on
from platform import python_version

versao_python = python_version()
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', versao_python)

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.7


In [3]:
# Import findspark and call its init() (run before the pyspark imports in the next cell)
import findspark
findspark.init()

In [32]:
# Imports
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [5]:
# Create the Spark context, or reuse the one that is already running.
# Calling SparkContext(...) directly fails with "Cannot run multiple SparkContexts
# at once" when this cell is executed a second time in the same kernel;
# getOrCreate() makes the cell safe to re-run. The application name is only
# applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("prjt3"))

In [6]:
# Only log messages at ERROR level
sc.setLogLevel("ERROR")

# Get the active Spark session, or create one if none exists yet
construtor_sessao = SparkSession.builder
spark = construtor_sessao.getOrCreate()

# Last expression of the cell: display the session summary
spark

## Carregando o Dataset

In [8]:
# Load the data: the first line of the file is the header and the
# column types are inferred from the values
leitor_csv = spark.read.option("header", True).option("inferSchema", True)
dados = leitor_csv.csv('train.csv')

In [9]:
# Number of records (rows) in the loaded DataFrame
dados.count()

76020

In [10]:
# Show 10 rows using the standard Spark DataFrame text rendering
dados.show(10)

+---+----+-----+------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------+-----------------------+-----------------------+-----------------+------------------+----------+--------+----------+--------+----------+--------+----------+--------+----------+--------+-----------+---------+-----------+-----------------+---------------+-----------------+---------------+-----------------+---------------+---------+-----------+---------+-----------+---------+-----------+---------+---------+-----------+---------+-----------+---------+-------------+-----------+-------------+---------+-----------+---------+-----------+-----------+---------+---------+-----------+---------+-----------+---------+-----------+---------+-------------+-----------+---------+-----------+--------

In [11]:
# Preview 10 rows rendered as a pandas DataFrame
amostra_dados = dados.limit(10)
amostra_dados.toPandas()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
5,13,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87975.75,0
6,14,2,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94956.66,0
7,18,2,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,251638.95,0
8,20,2,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101962.02,0
9,23,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,356463.06,0


In [12]:
# Schema: print each column name with its inferred type
dados.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- var3: integer (nullable = true)
 |-- var15: integer (nullable = true)
 |-- imp_ent_var16_ult1: double (nullable = true)
 |-- imp_op_var39_comer_ult1: double (nullable = true)
 |-- imp_op_var39_comer_ult3: double (nullable = true)
 |-- imp_op_var40_comer_ult1: double (nullable = true)
 |-- imp_op_var40_comer_ult3: double (nullable = true)
 |-- imp_op_var40_efect_ult1: double (nullable = true)
 |-- imp_op_var40_efect_ult3: double (nullable = true)
 |-- imp_op_var40_ult1: double (nullable = true)
 |-- imp_op_var41_comer_ult1: double (nullable = true)
 |-- imp_op_var41_comer_ult3: double (nullable = true)
 |-- imp_op_var41_efect_ult1: double (nullable = true)
 |-- imp_op_var41_efect_ult3: double (nullable = true)
 |-- imp_op_var41_ult1: double (nullable = true)
 |-- imp_op_var39_efect_ult1: double (nullable = true)
 |-- imp_op_var39_efect_ult3: double (nullable = true)
 |-- imp_op_var39_ult1: double (nullable = true)
 |-- imp_sal_var16_ult1: dou

In [13]:
# Drop every row that contains a missing value (if any) and compare the
# row counts before and after the removal
dados_com_linhas_removidas = dados.dropna()

total_antes = dados.count()
print('Número de linhas antes de remover valores ausentes:', total_antes)

total_depois = dados_com_linhas_removidas.count()
print('Número de linhas após remover valores ausentes:', total_depois)

Número de linhas antes de remover valores ausentes: 76020
Número de linhas após remover valores ausentes: 76020


In [38]:
# Drop the ID column (presumably just a row identifier, not a predictor).
# The column is referenced by name rather than through the attribute dados.ID so
# that the cell can be re-run: DataFrame.drop with a column name is a no-op when
# that column is already gone, whereas dados.ID raises an error once ID has been
# removed from the DataFrame.
dados = dados.drop("ID")

In [37]:
# Frequency of each distinct var3 value, ordered by the value itself
contagem_var3 = dados.groupBy("var3").count()
contagem_var3.sort("var3").show()

+-------+-----+
|   var3|count|
+-------+-----+
|-999999|  116|
|      0|   75|
|      1|  105|
|      2|74165|
|      3|  108|
|      4|   86|
|      5|   63|
|      6|   82|
|      7|   97|
|      8|  138|
|      9|  110|
|     10|   72|
|     11|   66|
|     12|   85|
|     13|   98|
|     14|   61|
|     15|   34|
|     16|    9|
|     17|    7|
|     18|   10|
+-------+-----+
only showing top 20 rows



In [40]:
# Frequency of each distinct var36 value
contagem_var36 = dados.groupBy("var36").count()
contagem_var36.show()

+-----+-----+
|var36|count|
+-----+-----+
|    1|14664|
|    3|22177|
|    2| 8704|
|   99|30064|
|    0|  411|
+-----+-----+



In [20]:
# Frequency of each distinct var21 value (show() prints only the first 20 groups)
contagem_var21 = dados.groupBy("var21").count()
contagem_var21.show()

+-----+-----+
|var21|count|
+-----+-----+
| 3000|   84|
| 1500|   31|
| 1800|  206|
| 4500|   96|
| 3300|    2|
|  900|  236|
| 1200|   12|
| 7200|   62|
| 3600|   52|
| 6000|   27|
| 9000|   14|
| 2400|    3|
|10500|    1|
|    0|75152|
| 2700|   26|
| 5100|    2|
| 7500|    1|
| 6600|    1|
| 2100|    2|
| 5400|    4|
+-----+-----+
only showing top 20 rows



In [21]:
# Frequency of each distinct var38 value (show() prints only the first 20 groups)
contagem_var38 = dados.groupBy("var38").count()
contagem_var38.show()

+---------+-----+
|    var38|count|
+---------+-----+
|108845.91|    1|
| 38272.44|    1|
|138101.94|    1|
| 23894.97|    1|
|258941.73|    1|
| 94576.56|    1|
| 60170.88|    3|
|  38587.5|    1|
| 42821.79|    1|
|153463.83|    1|
|111146.04|    1|
|266319.51|    2|
| 72774.12|    1|
| 43179.75|    1|
|134509.71|    1|
|133824.78|    2|
| 76321.14|    1|
|103554.93|    1|
|198463.14|    1|
|256363.47|    1|
+---------+-----+
only showing top 20 rows



In [43]:
# List of input variables: every column except the TARGET label.
# The label is excluded by name instead of by position (columns[:-1]) so the
# feature list stays correct even if the column order of the DataFrame changes.
variaveis_entrada = [coluna for coluna in dados.columns if coluna != "TARGET"]

In [46]:
# Pack all input columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=variaveis_entrada, outputCol="features")

# Keep only the assembled vector column for the steps that follow
dados_vetorizados = assembler.transform(dados)
output = dados_vetorizados.select("features")

In [53]:
# Scale each feature to unit standard deviation without centering on the mean
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

# Fit the scaler on the assembled features, then apply it to the same data
scalerModel = scaler.fit(output)
scaledData = scalerModel.transform(output)

In [54]:
# Fit a 5-component PCA on the scaled features.
# Note: `pca` holds the fitted model returned by fit(), not the estimator.
pca = PCA(k=5, inputCol="scaledFeatures", outputCol="pcaFeatures").fit(scaledData)

# Project the scaled data onto the principal components
pcaDados = pca.transform(scaledData)
results = pcaDados.select("pcaFeatures")

# Print the projected rows, then display the first row as the cell result
results.show()
results.head(1)

+--------------------+
|         pcaFeatures|
+--------------------+
|[0.51318271399369...|
|[-1.2006486738358...|
|[-0.1756015041533...|
|[-6.6039314918786...|
|[-4.2421084796185...|
|[0.51297389903558...|
|[-0.3692703297821...|
|[-0.3065398322083...|
|[0.42934077498377...|
|[0.20745946968436...|
|[-11.395458744975...|
|[0.50163152683831...|
|[-0.4421819178737...|
|[-4.9585149488386...|
|[-34.366102122896...|
|[-0.2485399413801...|
|[0.98990963904744...|
|[-4.7700059810275...|
|[-2.4970563333163...|
|[-0.3271070611871...|
+--------------------+
only showing top 20 rows



[Row(pcaFeatures=DenseVector([0.5132, -1.7794, -0.4578, -1.0572, 0.704]))]