# Iniciar Sessão Spark

import os
## Set up environment variables
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-21'
os.environ['SPARK_HOME'] = r'C:\Users\kawda\Downloads\spark-3.5.4-bin-hadoop3\spark-3.5.4-bin-hadoop3'

## Initialize a Spark session
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
	.master("local[*]") \
	.config("spark.executor.memory", "8g") \
	.config("spark.driver.memory", "8g") \
	.getOrCreate()

## Verify the Spark session
spark)

# Imports

from pyspark.sql import DataFrame
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Summarizer
from pyspark.sql.types import DoubleType, IntegerType, StringType, NumericType
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, when, isnan, lit, approx_count_distinct
from sklearn.preprocessing import StandardScaler
import numpy as np

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Dados

In [12]:
# Abrir os dados disponíveis sobre o titanic
df_test = pd.read_csv("test.csv")
df_train = pd.read_csv("train.csv")
df_survived = pd.read_csv("gender_submission.csv")

# Display the first few rows of each dataframe
df_test.head()
df_train.head()
df_survived.head()

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1 

In [13]:
# Verificar os dados
lista_spec = ['PassengerId', 'Survived']
abt_00 = df_train.drop(columns=lista_spec)

abt_00.head()

   Pclass                                               Name     Sex   Age  \
0       3                            Braund, Mr. Owen Harris    male  22.0   
1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
2       3                             Heikkinen, Miss. Laina  female  26.0   
3       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
4       3                           Allen, Mr. William Henry    male  35.0   

   SibSp  Parch            Ticket     Fare Cabin Embarked  
0      1      0         A/5 21171   7.2500   NaN        S  
1      1      0          PC 17599  71.2833   C85        C  
2      0      0  STON/O2. 3101282   7.9250   NaN        S  
3      1      0            113803  53.1000  C123        S  
4      0      0            373450   8.0500   NaN        S  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    ------------

# Data Preparation

## Tratamento inicial padrão (Alta porcentagem de nulos, Variáveis constantes, Missings)

In [14]:
import pandas as pd
from scipy import stats

def get_metadata(dataframe):
	# Coleta de metadados básicos
	metadata = pd.DataFrame({
		'nome_variavel': dataframe.columns,
		'tipo': dataframe.dtypes,
		'qt_nulos': dataframe.isnull().sum(),
		'percent_nulos': round((dataframe.isnull().sum() / len(dataframe)) * 100, 2),
		'cardinalidade': dataframe.nunique(),
	})

	# Função para testar normalidade usando o teste de Shapiro-Wilk
	def test_normality(series, alpha=0.05):
		if series.dtype in ["float64", "int64", "int32"]:
			statistic, p_value = stats.shapiro(series.dropna())  # dropping NA values for the test
			return p_value > alpha
		else:
			return None  # Return None for non-numeric data types

	# Aplicando teste de normalidade
	metadata["fl_normal"] = dataframe.apply(test_normality)

	metadata = metadata.sort_values(by='percent_nulos', ascending=False)
	metadata = metadata.reset_index(drop=True)

	return metadata

# Apply the function to the dataframe
metadados = get_metadata(abt_00)
metadados)

  nome_variavel     tipo  qt_nulos  percent_nulos  cardinalidade fl_normal
0         Cabin   object       687          77.10            147      None
1           Age  float64       177          19.87             88     False
2      Embarked   object         2           0.22              3      None
3        Pclass    int64         0           0.00              3     False
4          Name   object         0           0.00            891      None
5           Sex   object         0           0.00              2      None
6         SibSp    int64         0           0.00              7     False
7         Parch    int64         0           0.00              7     False
8        Ticket   object         0           0.00            681      None
9          Fare  float64         0           0.00            248     False


In [18]:
def preprocess_dataframe(df):
	# Drop columns with >80% missing values
	total_count = len(df)
	columns_to_drop = [col for col in df.columns if df[col].isnull().sum() / total_count > 0.8]
	df = df.drop(columns=columns_to_drop)
	
	# Replace missing values
	for col_name in df.columns:
		data_type = df[col_name].dtype
		
		if np.issubdtype(data_type, np.number):
			mean_value = df[col_name].mean()
			df[col_name].fillna(mean_value, inplace=True)
		elif data_type == object:
			df[col_name].fillna("Desconhecido", inplace=True)
	
	# Drop columns with variance equals to 0
	numeric_columns = df.select_dtypes(include=[np.number]).columns
	variances = df[numeric_columns].var()
	columns_to_drop = variances[variances == 0].index.tolist()
	df = df.drop(columns=columns_to_drop)
	
	return df

# Apply the function to the dataframe
abt_01 = preprocess_dataframe(abt_00)
abt_01.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col_name].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col_name].fillna("Desconhecido", inplace=True)


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Desconhecido,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Desconhecido,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Desconhecido,S


## Tratamento de variáveis numéricas (Padronização)

In [21]:
# Instanciando o scaler
scaler = StandardScaler()

# Selecionando colunas numéricas
numeric_cols = abt_01.select_dtypes(include=['float64', 'int64','int32']).columns

# Aplicando a normalização
abt_01[numeric_cols] = scaler.fit_transform(abt_01[numeric_cols])

# Show the result
abt_01.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.827377,"Braund, Mr. Owen Harris",male,-0.592481,0.432793,-0.473674,A/5 21171,-0.502445,Desconhecido,S
1,-1.566107,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.638789,0.432793,-0.473674,PC 17599,0.786845,C85,C
2,0.827377,"Heikkinen, Miss. Laina",female,-0.284663,-0.474545,-0.473674,STON/O2. 3101282,-0.488854,Desconhecido,S
3,-1.566107,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.407926,0.432793,-0.473674,113803,0.42073,C123,S
4,0.827377,"Allen, Mr. William Henry",male,0.407926,-0.474545,-0.473674,373450,-0.486337,Desconhecido,S


## Tratamento de variáveis categóricas

### Baixa Cardinalidade (Dummy)

### Alta Cardinalidade (Label Encoding)

# Métodos de seleção de variáveis

## Feature Importance

## Recursive Feature Elimination (RFE)

## Boruta

## Pearson Correlation

## Corte por IV

## PCA + IV