In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as plt
import platform
from matplotlib import font_manager, rc

# 한글 폰트 설치 
if platform.system() == 'Darwin':   #macOS
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':   #WINDOWS
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:   #LINUX
    print('Unknown system... sorry~~~~') 
    

import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic') 

In [109]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [110]:
train['Age'].fillna(train['Age'].median(), inplace=True)
test['Age'].fillna(train['Age'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(train['Age'].median(), inplace=True)


In [111]:
test['Fare'].fillna(train['Fare'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Fare'].fillna(train['Fare'].median(), inplace=True)


In [112]:
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)


In [113]:
# train = train.drop(['Name','Ticket','Cabin'], axis=1)
# test = test.drop(['Name','Ticket','Cabin'], axis=1)

In [114]:
# 범주형 변수 인코딩
train = pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)
test = pd.get_dummies(test, columns=['Sex', 'Embarked'], drop_first=True)

# 테스트 데이터에 없는 열을 추가
missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
    test[col] = 0

# 테스트 데이터와 훈련 데이터의 열 순서 일치
test = test[train.columns.drop('Survived')]

In [115]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fare와 Age 컬럼에 스케일링 적용
train[['Fare', 'Age']] = scaler.fit_transform(train[['Fare', 'Age']])
test[['Fare', 'Age']] = scaler.transform(test[['Fare', 'Age']])

In [116]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",-0.565736,1,0,A/5 21171,-0.502445,,True,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.663861,1,0,PC 17599,0.786845,C85,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",-0.258337,0,0,STON/O2. 3101282,-0.488854,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.433312,1,0,113803,0.42073,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",0.433312,0,0,373450,-0.486337,,True,False,True


In [117]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",0.394887,0,0,330911,-0.490783,,True,True,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1.35551,1,0,363272,-0.507479,,False,False,True
2,894,2,"Myles, Mr. Thomas Francis",2.508257,0,0,240276,-0.453367,,True,True,False
3,895,3,"Wirz, Mr. Albert",-0.181487,0,0,315154,-0.474005,,True,False,True
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",-0.565736,1,1,3101298,-0.401017,,False,False,True


In [118]:
X_train = train.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y_train = train['Survived']

In [119]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,3,-0.565736,1,0,-0.502445,True,False,True
1,2,1,0.663861,1,0,0.786845,False,False,False
2,3,3,-0.258337,0,0,-0.488854,False,False,True
3,4,1,0.433312,1,0,0.42073,False,False,True
4,5,3,0.433312,0,0,-0.486337,True,False,True


In [120]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [121]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",0.394887,0,0,330911,-0.490783,,True,True,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1.35551,1,0,363272,-0.507479,,False,False,True
2,894,2,"Myles, Mr. Thomas Francis",2.508257,0,0,240276,-0.453367,,True,True,False
3,895,3,"Wirz, Mr. Albert",-0.181487,0,0,315154,-0.474005,,True,False,True
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",-0.565736,1,1,3101298,-0.401017,,False,False,True


In [122]:
# test = test.drop(['Name', 'Ticket', 'Cabin'],axis=1)

In [123]:
# 그레이디언트 부스팅 

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state=123)
gbc.fit(X_train, y_train)


In [124]:
# 예측 수행
X_test = test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
predictions = gbc.predict(X_test)

# 결과 저장
submission_gbc_g = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})

submission_gbc_g.to_csv('submission_gbc_g.csv', index=False)

In [125]:
print(gbc.score(X_train, y_train))

0.8900112233445566


In [126]:
from sklearn.ensemble import RandomForestClassifier

# 모델 정의 및 훈련
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)

# 예측 수행

predictions_RF = RF.predict(X_test)

In [127]:
# 결과 저장
submission_RF = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions_RF
})

submission_RF.to_csv('submission_RF.csv', index=False)