# Titanic 데이터 머신러닝 분류

## 1. 데이터 로딩 및 확인

In [None]:
import pandas as pd
import numpy as np
# Read the Titanic CSV into a DataFrame, then show its first five rows
# as the cell output.
df = pd.read_csv(filepath_or_buffer='/mnt/data/titanic.csv')
df.head(n=5)

## 2. 결측치 확인

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum(axis=0)

## 3. 결측치 처리

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form acts on
# an intermediate object, triggers a FutureWarning in pandas 2.x, and leaves
# df unchanged when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

## 4. 레이블 확인

In [None]:
# Show how many rows belong to each value of the target column.
target = df['Survived']
target.value_counts()

## 5. 불필요한 컬럼 제거

In [None]:
# Remove the columns that are not used as features; df is modified in place.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df.drop(labels=unused_columns, axis='columns', inplace=True)

## 6. 데이터 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder
# Convert the two categorical feature columns to integer codes.
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

# Fill missing embarkation ports with 'S' before encoding. The result is
# assigned back rather than using fillna(..., inplace=True) on df['Embarked']:
# the chained in-place form does not update df under pandas Copy-on-Write
# (and warns in pandas 2.x), which would leave NaN values for the encoder.
df['Embarked'] = df['Embarked'].fillna('S')

# The same encoder object is refit here, so afterwards label_encoder holds
# the classes of 'Embarked', not 'Sex'.
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])

## 7. 인코딩 후 레이블 확인

In [None]:
# Display the class counts of the target column again after preprocessing.
df.loc[:, 'Survived'].value_counts()

## 8. 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the features, then hold out 20% of the rows for
# testing. The split is stratified on the target and seeded for repeatability.
y = df['Survived']
X = df.drop(labels=['Survived'], axis='columns')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## 9. 분류 모델 학습 및 평가

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Candidate classifiers keyed by the display name used in the report below.
# NOTE(review): KNN, SVM and logistic regression are fit on unscaled features
# here; consider whether feature scaling is wanted before comparing scores.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=200)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
])

# Fit each classifier on the training split, predict on the test split, and
# print its accuracy followed by its confusion matrix.
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(
        f'{name}: Accuracy = {acc:.4f}',
        confusion_matrix(y_test, y_pred),
        sep='\n',
    )
