Manipulação de Dados

In [1]:
# Importações
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic dataset from the Data Science Dojo public repository
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

# Later cells refer to columns by lowercase name ('age', 'pclass', 'survived',
# ...). Normalize the header case once here so those lookups do not depend on
# how the CSV capitalizes its header row.
# NOTE(review): this file appears to use 'Survived', 'Pclass', 'Age', ... — confirm.
df.columns = df.columns.str.lower()

# Preview the first rows and the available columns
df.head()


ModuleNotFoundError: No module named 'sklearn'

Remover colunas embarked, alone e fare

In [None]:
# Drop columns that are not used as features. errors='ignore' keeps this cell
# safe to re-run (the columns are already gone the second time) and tolerates
# listed columns that the loaded data does not provide.
df = df.drop(columns=['embarked', 'alone', 'fare'], errors='ignore')

Substituir valores ausentes da coluna age pela média

In [None]:
# Replace missing ages with the mean age. The result is assigned back to the
# column: calling fillna(..., inplace=True) on df['age'] is chained assignment,
# which newer pandas warns about and, under Copy-on-Write, does not update df.
df['age'] = df['age'].fillna(df['age'].mean())

One-Hot Encoding para a coluna pclass

In [None]:
# Expand 'pclass' into one indicator column per class, each named with the
# 'pclass' prefix; get_dummies replaces the original column with the indicators.
df = df.pipe(pd.get_dummies, columns=['pclass'], prefix='pclass')

Modelagem
Pré-processamento
Selecionar colunas numéricas e categóricas:

In [None]:
# Remove columns that are not used as features. 'sex' is deliberately NOT
# dropped here because it is encoded just below (dropping it first made the
# next statement fail with KeyError). errors='ignore' tolerates listed columns
# that are absent from the loaded data and keeps the cell safe to re-run.
df = df.drop(
    columns=['name', 'ticket', 'cabin', 'sibsp', 'parch', 'who', 'deck', 'adult_male'],
    errors='ignore',
)

# Encode 'sex' as a single 0/1 indicator. drop_first=True discards the first
# category in sorted order and keeps the remaining one; .iloc[:, 0] turns the
# one-column frame into a Series so it can be assigned to a single column.
# Assumes exactly two categories ('female'/'male', giving 1 = male) — TODO confirm.
df['sex'] = pd.get_dummies(df['sex'], drop_first=True).iloc[:, 0].astype(int)

# Features (every remaining column except the target) and target
# NOTE(review): verify that no remaining column is an identifier, free text,
# or a restatement of the target before training.
X = df.drop(columns='survived')
y = df['survived']

Separar dados em treino e teste

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

Treinar modelos e calcular métricas

In [None]:
# Candidate classifiers, keyed by the label shown in the results table.
# The SVM is wrapped with StandardScaler because SVC is not scale-invariant;
# inside the pipeline the scaler is fitted on the training split only and then
# applied to the test split at predict time.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC()),
    ]),
}

# Metric values per model: {model label: {metric name: score}}
results = {}

for name, model in models.items():
    # Fit on the training split, then score predictions on the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }

# Show results: transpose so each row is a model and each column a metric
results_df = pd.DataFrame(results).T
results_df

Visualização e Interpretação
Gráfico de acurácias

In [None]:
# Bar chart comparing the accuracy of each model, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=results_df.index, y=results_df['Accuracy'], ax=ax)
ax.set_title('Comparação de Acurácia entre Modelos')
ax.set_ylabel('Acurácia')
ax.set_xlabel('Modelo')
# Accuracy is a proportion, so pin the axis to the full 0-1 range
ax.set_ylim(0, 1)
plt.show()