In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [20]:
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_data = pd.read_csv(url)

In [21]:
# Separar as features (X) e o alvo (y)
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

# Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [22]:
# Definir colunas categóricas e numéricas
categorical_cols = [cname for cname in X.columns if
                    X[cname].nunique() < 10 and
                    X[cname].dtype == "object"]
numeric_cols = [cname for cname in X.columns if
                X[cname].dtype in ['int64', 'float64']]

In [23]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Criar um ColumnTransformer para aplicar transformers específicos a colunas específicas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Visualizar as primeiras linhas do conjunto de treinamento após o pré-processamento
X_train_preprocessed = pd.DataFrame(preprocessor.fit_transform(X_train))
print(X_train_preprocessed.head())

         0         1         2         3         4         5    6    7    8   \
0 -0.453066 -1.614136  1.232263 -0.470722 -0.479342 -0.078684  0.0  1.0  0.0   
1  1.113874 -0.400551 -0.500482 -0.470722 -0.479342 -0.377145  0.0  1.0  0.0   
2 -0.254275  0.813034  0.192616 -0.470722 -0.479342 -0.474867  0.0  1.0  0.0   
3  1.000836  0.813034 -0.269449  0.379923 -0.479342 -0.476230  0.0  1.0  0.0   
4  1.425702  0.813034 -1.809667  2.931860  2.048742 -0.025249  1.0  0.0  0.0   

    9    10  
0  0.0  1.0  
1  0.0  1.0  
2  0.0  1.0  
3  0.0  1.0  
4  0.0  1.0  


In [24]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Definir os modelos individuais
random_forest_model = RandomForestClassifier(random_state=42)
gradient_boosting_model = GradientBoostingClassifier(random_state=42)
logistic_regression_model = LogisticRegression(random_state=42)

# Criar um VotingClassifier
voting_classifier = VotingClassifier(estimators=[
    ('rf', random_forest_model),
    ('gb', gradient_boosting_model),
    ('lr', logistic_regression_model)
], voting='soft')  # 'soft' utiliza as probabilidades para a votação

# Criar um StackingClassifier
stacking_classifier = StackingClassifier(estimators=[
    ('rf', random_forest_model),
    ('gb', gradient_boosting_model),
    ('lr', logistic_regression_model)
], final_estimator=LogisticRegression())

# Criar um pipeline para o ensemble
ensemble_voting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', voting_classifier)  # Pode ser substituído por 'stacking_classifier'
])

ensemble_stacking_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', stacking_classifier)  # Pode ser substituído por 'stacking_classifier'
])

# Avaliar o ensemble usando validação cruzada
cv_scores_voting = cross_val_score(ensemble_voting_pipeline, X, y, cv=5, scoring='accuracy')

cv_scores_stacking = cross_val_score(ensemble_stacking_pipeline, X, y, cv=5, scoring='accuracy')

# Exibir a média das pontuações
print("Accuracy média do Ensemble (voting):", cv_scores_voting.mean())
print("Accuracy média do Ensemble (stacking):", cv_scores_stacking.mean())

Accuracy média do Ensemble (voting): 0.7879919653505743
Accuracy média do Ensemble (stacking): 0.7958131944008537
