<a href="https://colab.research.google.com/github/ANDUYONG/TitanicSurvivorPrediction/blob/main/Titanic_ScikitLearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 타이타닉 데이터 셋 로드

# Seaborn에서 예시 데이터셋을 로드
df = sns.load_dataset('titanic')

df.drop(columns=['adult_male', 'deck', 'embark_town', 'alive', 'alone'], inplace=True)

# age 칼럼의 중앙값 계산 및 결측치 채우기
age_median = df['age'].median()
df['age'].fillna(age_median, inplace=True)

# embarked 칼럼의 최빈값 계산 및 결측치 채우기
embarked_mode = df['embarked'].mode()[0]
df['embarked'].fillna(embarked_mode, inplace=True)

# map 함수를 이용한 레이블 인코딩
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
# embarked 칼럼 원-핫 인코딩
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# 논리 1 : 가족의 크기 ( 가족의 수에 따라 생존에 큰 영향을 미침 )
# 'sibsp'와 'parch'를 더하고 자신을 포함하기 위해 1을 더합니다.
df['FamilySize'] = df['sibsp'] + df['parch'] + 1

# 논리 2 : 단독 탑승 여부 ( 혼자 여행하는 승객은 생존률이 낮다는 가설 검증 )
# FamilySize가 1이면(혼자 탑승) 1, 아니면 0을 부여
df['IsAlone'] = np.where(df['FamilySize'] == 1, 1, 0)

# 특징 공학으로 대체된 칼럼들을 정리 > 복잡도 낮춤
df = df.drop(columns=['sibsp', 'parch'])

# Target (Y)
Y = df['survived']

# Features (X) - 타겟 칼럼과 스케일링에 부적합한 비숫자형/중복 칼럼 제거
X = df.drop(columns=['survived', 'class', 'who'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(embarked_mode, inplace=True)


In [12]:
"""
학습/테스트 데이터 분할
"""
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (623, 8)
X_test shape: (268, 8)


In [14]:
"""
로지스틱 회귀 모델 임포트 및 생성
"""
from sklearn.linear_model import LogisticRegression

# 로지스틱 객체 생성
model = LogisticRegression(random_state=42)

# 모델 학습
model.fit(X_train, Y_train)

# 예측 수행
Y_pred = model.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
"""
모델 성능 평가: 지표 계산 및 오차 행렬
"""
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

accuracy = accuracy_score(Y_test, Y_pred) # 정확도
conf_matrix = confusion_matrix(Y_test, Y_pred) # 오차 행렬
precision = precision_score(Y_test, Y_pred) # 정밀도
recall = recall_score(Y_test, Y_pred) # 재현율
f1 = f1_score(Y_test, Y_pred) # F1


print(f"정확도 (Accuracy): {accuracy: 4f}")
print(f"오차 행렬 (Confusion Matrix): ${conf_matrix}")
print(f"정밀도 (Precision): ${precision}")
print(f"재현율 (Recall): ${recall}")
print(f"F1-Score: ${f1}")

정확도 (Accuracy):  0.794776
오차 행렬 (Confusion Matrix): $[[140  25]
 [ 30  73]]
정밀도 (Precision): $0.7448979591836735
재현율 (Recall): $0.7087378640776699
F1-Score: $0.7263681592039801
