# Open session and import packages

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('project_churn').getOrCreate()

In [2]:
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Import data

In [3]:
# import train set
train = spark.read.csv('train.csv', header = True)
# import test set
test = spark.read.csv('test.csv', header = True)

# Data Format

In [4]:
train.show(truncate = False)

+---+----------+--------------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
|id |CustomerId|Surname       |CreditScore|Geography|Gender|Age |Tenure|Balance  |NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---+----------+--------------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
|0  |15674932  |Okwudilichukwu|668        |France   |Male  |33.0|3     |0.0      |2            |1.0      |0.0           |181449.97      |0     |
|1  |15749177  |Okwudiliolisa |627        |France   |Male  |33.0|1     |0.0      |2            |1.0      |1.0           |49503.5        |0     |
|2  |15694510  |Hsueh         |678        |France   |Male  |40.0|10    |0.0      |2            |1.0      |0.0           |184866.69      |0     |
|3  |15741417  |Kao           |581        |France   |Male  |34.0|2     |148882.54|1            |1.0      |1.0           |84560.88 

In [5]:
# visualize all features type
train.printSchema()

root
 |-- id: string (nullable = true)
 |-- CustomerId: string (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: string (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Tenure: string (nullable = true)
 |-- Balance: string (nullable = true)
 |-- NumOfProducts: string (nullable = true)
 |-- HasCrCard: string (nullable = true)
 |-- IsActiveMember: string (nullable = true)
 |-- EstimatedSalary: string (nullable = true)
 |-- Exited: string (nullable = true)



In [6]:
# removing features that don't serve any purpose
train = train.drop('id','surname')

In [7]:
# rename all to lower case letters, feature and data, and removing all blank spaces
for col in train.columns:
    train = train.withColumnRenamed(col, col.lower())
    train = train.withColumn(col, F.lower(col))
    train = train.withColumn(col, F.trim(col))

In [8]:
# visualize how the feature data are arranged
train.show(5, truncate = False)

+----------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
|CustomerId|CreditScore|Geography|Gender|Age |Tenure|Balance  |NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+----------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
|15674932  |668        |france   |male  |33.0|3     |0.0      |2            |1.0      |0.0           |181449.97      |0     |
|15749177  |627        |france   |male  |33.0|1     |0.0      |2            |1.0      |1.0           |49503.5        |0     |
|15694510  |678        |france   |male  |40.0|10    |0.0      |2            |1.0      |0.0           |184866.69      |0     |
|15741417  |581        |france   |male  |34.0|2     |148882.54|1            |1.0      |1.0           |84560.88       |0     |
|15766172  |716        |spain    |male  |33.0|5     |0.0      |2            |1.0      |1.0           |15068.83       |

In [9]:
# changing data types to numeric type
numeric_columns = ['creditscore', 'tenure', 'balance', 'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary', 'exited']

for col in numeric_columns:
    train = train.withColumn(col, F.col(col).cast(FloatType()))

In [10]:
train.printSchema()

root
 |-- CustomerId: string (nullable = true)
 |-- creditscore: float (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- tenure: float (nullable = true)
 |-- balance: float (nullable = true)
 |-- numofproducts: float (nullable = true)
 |-- hascrcard: float (nullable = true)
 |-- isactivemember: float (nullable = true)
 |-- estimatedsalary: float (nullable = true)
 |-- exited: float (nullable = true)



# Exploratory Data Analysis

In [12]:
train.summary().show()

+-------+--------------------+----------------+---------+------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+
|summary|          CustomerId|     creditscore|Geography|Gender|              Age|            tenure|           balance|     numofproducts|         hascrcard|     isactivemember|   estimatedsalary|             exited|
+-------+--------------------+----------------+---------+------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+
|  count|              165034|          165034|   165034|165034|           165034|            165034|            165034|            165034|            165034|             165034|            165034|             165034|
|   mean|1.5692005019026382E7|656.454373038283|     NULL|  NULL|38.12588787764945| 5.020353381727402| 55478.08669040132|1.554455

In [None]:
+-------+--------------------+----------------+---------+------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+
|summary|          CustomerId|     creditscore|Geography|Gender|              Age|            tenure|           balance|     numofproducts|         hascrcard|     isactivemember|   estimatedsalary|             exited|
+-------+--------------------+----------------+---------+------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+
|  count|              165034|          165034|   165034|165034|           165034|            165034|            165034|            165034|            165034|             165034|            165034|             165034|
|   mean|1.5692005019026382E7|656.454373038283|     NULL|  NULL|38.12588787764945| 5.020353381727402| 55478.08669040132|1.5544554455445545|0.7539537307463916|0.49777015645260975|112574.82270602613|0.21159882206090866|
| stddev|   71397.81679067112|80.1033404871783|     NULL|  NULL|8.867204591410792|2.8061585665860913|62817.663267958495|0.5471536788441764|0.4307071240449495|0.49999654260421705| 50292.86554962783| 0.4084431067117287|
|    min|            15565701|           350.0|   france|female|             18.0|               0.0|               0.0|               1.0|               0.0|                0.0|             11.58|                0.0|
|    25%|         1.5633112E7|           597.0|     NULL|  NULL|             32.0|               3.0|               0.0|               1.0|               1.0|                0.0|          74637.57|                0.0|
|    50%|         1.5690164E7|           659.0|     NULL|  NULL|             37.0|               5.0|               0.0|               2.0|               1.0|                0.0|          117946.3|                0.0|
|    75%|         1.5756821E7|           710.0|     NULL|  NULL|             42.0|               7.0|         119919.12|               2.0|               1.0|                1.0|         155061.97|                0.0|
|    max|            15815690|           850.0|    spain|  male|             92.0|              10.0|          250898.1|               4.0|               1.0|                1.0|         199992.48|                1.0|
+-------+--------------------+----------------+---------+------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+

In [None]:
trimmed_col = train.select().withColumn(feature, F.trim(F.col(feature)))

In [None]:
percent_missings = (train.select(feature).where(f'{feature} != "None" or {features} != "" ').count()/df.count())*100

In [27]:
(train.select('geography').where(f'geography != "None" or geography != "" ').count()/train.count())*100

100.0

In [None]:
unic_values = train.select(feature).distinct().count()

In [None]:
total_values = train.count()

In [None]:
def eda_basic(df:spark.DataFrame, cols:list):
    for feature in cols:
        # retirada de espaços vazios nos valores das colunas
        trimmed_col = df.select().withColumn(feature, F.trim(F.col(feature)))
        # cálculo de registros únicos
        unic_values = df.select(feature).distinct().count()
        # cálculo do total de registros
        total_values = df.count()
        # cálculo da porcentagem de missings nas features
        percent_missings = (df.select(feature).where(f'{feature} == "None" or {features} == "" ').count()/total_values)*100
        
        
    return