In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. 데이터 로드
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# 2. 결측치 처리
# Age 결측치는 중앙값으로 채움
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(train_data['Age'].median(), inplace=True)

# Fare 결측치는 중앙값으로 채움
test_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)

# Embarked 결측치는 최빈값으로 채움
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

# Cabin 결측치는 "Unknown"으로 채움
train_data['Cabin'].fillna('Unknown', inplace=True)
test_data['Cabin'].fillna('Unknown', inplace=True)

# 3. 파생 변수 생성 (FamilySize)
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']

# 4. 불필요한 열 제거
train_data = train_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_data_ids = test_data['PassengerId']  # 제출 시 사용할 PassengerId 저장
test_data = test_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# 5. 원-핫 인코딩
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'], drop_first=True)

# 6. 훈련 데이터와 타겟 변수 분리
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# 7. 학습/검증 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 8. 모델 학습
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 9. 검증 데이터 평가
y_pred_valid = model.predict(X_valid)
print("Validation Accuracy:", accuracy_score(y_valid, y_pred_valid))
print("Classification Report:\n", classification_report(y_valid, y_pred_valid))

# 10. Kaggle 테스트 데이터 예측
test_predictions = model.predict(test_data)

# 11. 제출 파일 생성
submission = pd.DataFrame({
    'PassengerId': test_data_ids,
    'Survived': test_predictions
})
submission.to_csv('submission.csv', index=False)
print("submission.csv 파일 생성 완료!")


Validation Accuracy: 0.8268156424581006
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85       105
           1       0.80      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

submission.csv 파일 생성 완료!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat