# Iniciar Sessão Spark

In [22]:
import os

# Point the Python process at the local Java and Spark installations.
os.environ.update({
    'JAVA_HOME': r'C:\Program Files\Java\jdk-21',
    'SPARK_HOME': r'C:\Users\kawda\Downloads\spark-3.5.4-bin-hadoop3\spark-3.5.4-bin-hadoop3',
})

# findspark.init() is deliberately called before the pyspark import below,
# preserving the original statement order of this cell.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) a local session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Show the session object to confirm it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000025AFFF379E0>


# Imports

In [23]:
from pyspark.sql import DataFrame
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Summarizer
from pyspark.sql.types import DoubleType, IntegerType, StringType, NumericType
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, when, isnan, lit, approx_count_distinct


# Dados

In [24]:
# Load the available Titanic CSV files and expose each one as a temp view.
# Every file is read with a header row and with schema inference enabled.
df_test = spark.read.options(header=True, inferSchema=True).csv("test.csv")
df_test.createOrReplaceTempView("df_test")

df_train = spark.read.options(header=True, inferSchema=True).csv("train.csv")
df_train.createOrReplaceTempView("df_train")

df_survived = spark.read.options(header=True, inferSchema=True).csv("gender_submission.csv")
df_survived.createOrReplaceTempView("df_survived")

In [25]:
# Inspect the training data after removing the columns listed in lista_spec.
lista_spec = ['PassengerId', 'Survived']
abt_00 = df_train.drop(*lista_spec)

# Preview the first rows, then print the column types.
abt_00.show(n=5)
abt_00.printSchema()

+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 5 rows

root
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: doub

# Data Preparation

## Tratamento inicial padrão (Alta porcentagem de nulos, Variáveis constantes, Missings)

In [29]:
def get_metadata(df):
    """Build a per-column profile of ``df``.

    The result has one row per column of ``df`` with the fields:

    - ``coluna``: column name
    - ``tipo``: string form of the column's Spark data type
    - ``qt_nulos``: number of null values
    - ``percent_nulos``: fraction of rows that are null (0.0 for an empty frame)
    - ``cardinalidade``: number of distinct non-null values

    All statistics are gathered in a single aggregation instead of running
    several Spark jobs per column.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        A Spark DataFrame with the five fields listed above. It is empty when
        ``df`` has no columns.
    """
    # An explicit schema keeps the column types stable even when there are no
    # rows to infer them from.
    result_schema = (
        "coluna string, tipo string, qt_nulos long, "
        "percent_nulos double, cardinalidade long"
    )
    fields = df.schema.fields
    session = df.sparkSession

    if not fields:
        return session.createDataFrame([], result_schema)

    # Layout of the single result row:
    # [total rows, null count per column..., distinct count per column...]
    stats = df.agg(
        F.count(F.lit(1)),
        *[F.count(F.when(F.col(field.name).isNull(), F.lit(1))) for field in fields],
        *[F.countDistinct(F.col(field.name)) for field in fields],
    ).first()

    total_count = stats[0]
    n_fields = len(fields)
    null_counts = stats[1:1 + n_fields]
    cardinalities = stats[1 + n_fields:1 + 2 * n_fields]

    metadata_list = []
    for field, null_count, cardinality in zip(fields, null_counts, cardinalities):
        # Always a float so the value fits the declared double column.
        percent_nulls = null_count / total_count if total_count > 0 else 0.0
        metadata_list.append(
            (field.name, str(field.dataType), null_count, percent_nulls, cardinality)
        )

    return session.createDataFrame(metadata_list, result_schema)

# Profile the feature table and register the profile as a temp view.
metadados = get_metadata(df=abt_00)
metadados.createOrReplaceTempView(name="metadados")
metadados.printSchema()


root
 |-- coluna: string (nullable = true)
 |-- tipo: string (nullable = true)
 |-- qt_nulos: long (nullable = true)
 |-- percent_nulos: double (nullable = true)
 |-- cardinalidade: long (nullable = true)



In [31]:
def preprocess_dataframe(df):
	"""Apply the standard initial clean-up to ``df``.

	Steps, in order:

	1. Drop every column in which more than 80% of the rows are null.
	2. Fill remaining nulls: ``DoubleType``/``IntegerType`` columns with the
	   column mean, ``StringType`` columns with ``"Desconhecido"``.
	3. Drop every numeric column whose variance is exactly 0.

	Args:
		df: Spark DataFrame to clean.

	Returns:
		The cleaned Spark DataFrame. An empty input (no rows) is returned
		unchanged because no statistic can be computed from it.
	"""
	column_names = df.columns

	# One aggregation yields the row total plus the null count of each column.
	counts = df.agg(
		F.count(F.lit(1)),
		*[F.count(F.when(F.col(name).isNull(), F.lit(1))) for name in column_names],
	).first()
	total_count = counts[0]

	# Guard against dividing by zero when the frame has no rows.
	if total_count == 0:
		return df

	# Drop columns with >80% missing values
	mostly_null = [
		name
		for name, null_count in zip(column_names, counts[1:])
		if null_count / total_count > 0.8
	]
	if mostly_null:
		df = df.drop(*mostly_null)

	# Replace missing values
	fill_values = {
		field.name: "Desconhecido"
		for field in df.schema.fields
		if isinstance(field.dataType, StringType)
	}
	mean_columns = [
		field.name
		for field in df.schema.fields
		if isinstance(field.dataType, (DoubleType, IntegerType))
	]
	if mean_columns:
		means = df.agg(*[F.mean(F.col(name)) for name in mean_columns]).first()
		for name, mean_value in zip(mean_columns, means):
			# A None mean means the column has no usable value to impute with.
			if mean_value is not None:
				# NOTE(review): the mean is a float; for integer columns Spark
				# presumably casts it to the column type on fill - confirm that
				# this is the intended imputation.
				fill_values[name] = mean_value
	if fill_values:
		df = df.fillna(fill_values)

	# Drop columns with variance equals to 0.
	# Types are read from the schema: df.dtypes only provides type-name
	# strings, so an isinstance check against NumericType never matches there.
	numeric_columns = [
		field.name
		for field in df.schema.fields
		if isinstance(field.dataType, NumericType)
	]
	if numeric_columns:
		variances = df.agg(*[F.variance(F.col(name)) for name in numeric_columns]).first()
		constant_columns = [
			name for name, variance in zip(numeric_columns, variances) if variance == 0
		]
		if constant_columns:
			df = df.drop(*constant_columns)

	return df

# Run the standard clean-up on the feature table and inspect the result.
abt_01 = preprocess_dataframe(df=abt_00)
abt_01.show(n=5)
abt_01.printSchema()

+------+--------------------+------+----+-----+-----+----------------+-------+------------+--------+
|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|       Cabin|Embarked|
+------+--------------------+------+----+-----+-----+----------------+-------+------------+--------+
|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25|Desconhecido|       S|
|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|         C85|       C|
|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925|Desconhecido|       S|
|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1|        C123|       S|
|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05|Desconhecido|       S|
+------+--------------------+------+----+-----+-----+----------------+-------+------------+--------+
only showing top 5 rows

root
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nul

## Tratamento de variáveis numéricas

# Métodos de seleção de variáveis

## Feature Importance

## Recursive Feature Elimination (RFE)

## Boruta

## Pearson Correlation

## Corte por IV

## PCA + IV