In [1]:
!python --version

Python 3.10.9


In [2]:
from sklearn.model_selection import train_test_split

import pandas as pd

df = pd.read_csv('titanic.csv', encoding='utf-8', engine='python')

x_train, x_test, y_train, y_test = train_test_split(
    df.drop(['Survived'], axis=1),
    df['Survived'],
    test_size=0.3, shuffle=True, stratify=df['Survived']
)

# train 데이터 셋에 대한 df => Age에 결측값에 mean()으로 채워넣기 위한 빌드업
df = pd.concat([x_train, y_train], axis=1)
df_test = pd.concat([x_test, y_test], axis=1)
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
383,384,1,"Holverson, Mrs. Alexander Oskar (Mary Aline To...",female,35.0,1,0,113789,52.0,,S,1
137,138,1,"Futrelle, Mr. Jacques Heath",male,37.0,1,0,113803,53.1,C123,S,0
488,489,3,"Somerton, Mr. Francis William",male,30.0,0,0,A.5. 18509,8.05,,S,0
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,1
196,197,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q,0


In [3]:
from sklearn.preprocessing import LabelEncoder

import math
import warnings

warnings.filterwarnings("ignore")

## Sex column 처리
encoder = LabelEncoder()
df.Sex = encoder.fit_transform(df.Sex)

## Age column 결측값 채우기
cond_1 = df.loc[(df.SibSp == 0) & (df.Parch == 0)].index
cond_2 = df.loc[(df.Pclass == 3) | (df.Sex == "male") | (df.Survived == 0)].index
new_df = df.loc[cond_1 & cond_2]
df.Age.fillna(int(round(new_df.Age.mean(), 0)), inplace=True)
df.loc[df.Age % 1 != 0, "Age"] = df.loc[df.Age % 1 != 0, "Age"].apply(math.floor)
df.Age = df.Age.astype(int)

## Embarked column 처리 [결측값, 원핫인코딩]
df.Embarked.fillna("S", inplace=True)
df = pd.concat([df, pd.get_dummies(df.Embarked)], axis=1)
df.drop(['Embarked'], axis=1, inplace=True)

In [4]:
# test 데이터 처리

## Sex column 처리
encoder = LabelEncoder()
df_test.Sex = encoder.fit_transform(df_test.Sex)

## Age column 결측값 채우기
cond_1 = df.loc[(df.SibSp == 0) & (df.Parch == 0)].index
cond_2 = df.loc[(df.Pclass == 3) | (df.Sex == "male") | (df.Survived == 0)].index
new_df = df.loc[cond_1 & cond_2]

df_test.Age.fillna(int(round(new_df.Age.mean(), 0)), inplace=True)
df_test.loc[df_test.Age % 1 != 0, "Age"] = df_test.loc[df_test.Age % 1 != 0, "Age"].apply(math.floor)
df_test.Age = df_test.Age.astype(int)

## Embarked column 처리 [결측값, 원핫인코딩]
df_test.Embarked.fillna("S", inplace=True)
df_test = pd.concat([df_test, pd.get_dummies(df_test.Embarked)], axis=1)
df_test.drop(['Embarked'], axis=1, inplace=True)

## object column 4가지['PassengerId', 'Ticket', 'Name', 'Cabin'] 제거
df.drop(['PassengerId', 'Ticket', 'Name', 'Cabin'], axis=1, inplace=True)
df_test.drop(['PassengerId', 'Ticket', 'Name', 'Cabin'], axis=1, inplace=True)

x_train = df.drop(['Survived'], axis=1)
x_test = df_test.drop(['Survived'], axis=1)

In [5]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

temp_df = pd.DataFrame()
X = x_train.copy()
X.drop(['S'], axis=1, inplace=True)
temp_df['Feature'] = X.columns
temp_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

x_train.drop(['S'], axis=1, inplace=True)
x_test.drop(['S'], axis=1, inplace=True)

temp_df

Unnamed: 0,Feature,VIF
0,Pclass,4.985297
1,Sex,3.13361
2,Age,4.181721
3,SibSp,1.512639
4,Parch,1.622176
5,Fare,1.766971
6,C,1.352925
7,Q,1.180139


In [6]:
#@title 오버샘플링
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

smote = SMOTE(random_state=42) # SMOTE 객체 생성

# 그냥 전처리가 아니라 단순히 SMOTE 기법을 시도하는 것이기 때문에 그냥 다버리고 오버샘플링 한번 보자.
x_train_over, y_train_over = smote.fit_resample(x_train, y_train)

pd.DataFrame(y_train_over).Survived.value_counts() # SMOTE 를 활용한 오버샘플링 적용 된 것 확인

1    384
0    384
Name: Survived, dtype: int64

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'init' : [None, 1],
    'criterion' : ['friedman_mse', 'squared_error'],
    'loss' : ['deviance', 'exponential'],
    'n_estimators': [1000], 
    'learning_rate': [0.03, 0.01, 0.007, 0.005], 
    'max_depth': [8, 10, 12], 
    'max_leaf_nodes' : [2, 4, 6, 8],
    'subsample': [0.5, 0.7, 2, 3, 4],
    'min_samples_split' : [2, 3, 4],
    'min_samples_leaf' : [2, 3, 4],
    'max_features' : ['auto', 'sqrt', 'log2'],
    'n_iter_no_change' : [300, 400, 500, 600],
    'tol' : [0.05, 0.01, 0.005],
}


grid = GridSearchCV(GradientBoostingClassifier(), params, cv=3, n_jobs=12)
grid.fit(x_train_over, y_train_over)

print(grid.best_score_, '\n')
print(grid.best_params_, '\n')

0.80859375 

{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.03, 'loss': 'deviance', 'max_depth': 8, 'max_features': 'auto', 'max_leaf_nodes': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000, 'n_iter_no_change': 300, 'subsample': 0.5, 'tol': 0.05} 



0.80859375 

GradientBoostingClassifier(
    criterion = 'friedman_mse',
    init = None,
    learning_rate = 0.03,
    loss = 'deviance',
    max_depth = 8,
    max_features = 'auto',
    max_leaf_nodes = 2,
    min_samples_leaf = 2,
    min_samples_split = 2,
    n_estimators = 1000,
    n_iter_no_change = 300,
    subsample = 0.5,
    tol = 0.05
)