In [2]:
!pip install scikit-optimize


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [5]:
# Carregando a base Titanic
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")


In [6]:
# Tratamento inicial dos dados
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
df.dropna(subset=['Embarked'], inplace=True)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Separando features e labels
X = df.drop(columns=['Survived'])
y = df['Survived']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['Embarked'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})


In [7]:
# Divisão treino/teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Questão 1 - Utilizando o otimizador BayesSearchCV (from skopt import BayesSearchCV), ajuste os hiperparâmetros do Random Forest e Árvore de decisão para o problema do TITANIC

In [8]:
param_grid_rf = {'n_estimators': (10, 200), 'max_depth': (1, 20)}
param_grid_dt = {'max_depth': (1, 20), 'min_samples_split': (2, 10)}

rf_search = BayesSearchCV(RandomForestClassifier(), param_grid_rf, n_iter=30, cv=5)
dt_search = BayesSearchCV(DecisionTreeClassifier(), param_grid_dt, n_iter=30, cv=5)

rf_search.fit(X_train, y_train)
dt_search.fit(X_train, y_train)

# Melhores modelos
rf_best = rf_search.best_estimator_
dt_best = dt_search.best_estimator_

# Avaliação
y_pred_rf = rf_best.predict(X_test)
y_pred_dt = dt_best.predict(X_test)

print("Random Forest - Acurácia:", accuracy_score(y_test, y_pred_rf))
print("Árvore de Decisão - Acurácia:", accuracy_score(y_test, y_pred_dt))



Random Forest - Acurácia: 0.8258426966292135
Árvore de Decisão - Acurácia: 0.7865168539325843


Questão 2 - Uma vez que a base de dados do Titanic é desbalanceada, investigue métodos de balanceamento para balancear as classes.

In [9]:
# Imputação de valores ausentes antes de aplicar SMOTE
imp_mean = SimpleImputer(strategy='mean')
X_train_imputed = imp_mean.fit_transform(X_train)

# SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train_imputed, y_train)

# TomekLinks
tomek = TomekLinks()
X_res_tl, y_res_tl = tomek.fit_resample(X_train_imputed, y_train)

# RandomUnderSampler
under = RandomUnderSampler()
X_res_under, y_res_under = under.fit_resample(X_train_imputed, y_train)

# ADASYN (Substituto para DSTO-GAN)
adasyn = ADASYN()
X_res_adasyn, y_res_adasyn = adasyn.fit_resample(X_train_imputed, y_train)

# Avaliação com modelos balanceados
for name, X_bal, y_bal in zip(["SMOTE", "TomekLinks", "RandomUnderSampler", "ADASYN"],
                               [X_res, X_res_tl, X_res_under, X_res_adasyn],
                               [y_res, y_res_tl, y_res_under, y_res_adasyn]):
    model = RandomForestClassifier().fit(X_bal, y_bal)
    y_pred = model.predict(X_test)
    print(f"{name} - Precisão:", precision_score(y_test, y_pred))
    print(f"{name} - Recall:", recall_score(y_test, y_pred))
    print(f"{name} - F1-Score:", f1_score(y_test, y_pred))



SMOTE - Precisão: 0.75
SMOTE - Recall: 0.8260869565217391
SMOTE - F1-Score: 0.7862068965517242




TomekLinks - Precisão: 0.6904761904761905
TomekLinks - Recall: 0.8405797101449275
TomekLinks - F1-Score: 0.7581699346405228




RandomUnderSampler - Precisão: 0.651685393258427
RandomUnderSampler - Recall: 0.8405797101449275
RandomUnderSampler - F1-Score: 0.7341772151898734
ADASYN - Precisão: 0.75
ADASYN - Recall: 0.8260869565217391
ADASYN - F1-Score: 0.7862068965517242




Questão 3 - Uma vez que a base de dados do Titanic possui dados ausentes, investigue métodos de imputação para imputar as ausências desta base de dados

In [10]:
# Criando dados com valores ausentes para simulação
X_missing = X.copy()
X_missing.loc[np.random.randint(0, X.shape[0], 20), 'Age'] = np.nan

# Imputação por Média
imp_mean = SimpleImputer(strategy='mean')
X_imputed_mean = imp_mean.fit_transform(X_missing)

# Imputação por Moda
imp_mode = SimpleImputer(strategy='most_frequent')
X_imputed_mode = imp_mode.fit_transform(X_missing)

# Imputação por KNN
imp_knn = KNNImputer(n_neighbors=3)
X_imputed_knn = imp_knn.fit_transform(X_missing)

# Avaliação
models = {
    "Média": X_imputed_mean,
    "Moda": X_imputed_mode,
    "KNN": X_imputed_knn
}

for name, X_imp in models.items():
    model = RandomForestClassifier().fit(X_imp, y)
    y_pred = model.predict(X_test)
    print(f"{name} - Acurácia:", accuracy_score(y_test, y_pred))



Média - Acurácia: 0.9719101123595506




Moda - Acurácia: 0.9831460674157303
KNN - Acurácia: 0.9662921348314607


