In [495]:
# Python : 3.9.18
# Numpy : 1.26.0
# Pandas : 2.1.1
# Matplotlib : 3.7.2
# Seaborn : 0.12.2
# Scikit-learn : 1.3.0
# Created: OCT. 23. 2023
# Author: D.W. SHIN

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [496]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 결측치 처리 부분

In [497]:
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
train_df['Age'].isnull().sum()

0

In [498]:
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
train_df['Embarked'].isnull().sum()

0

### 추가 컬럼

In [499]:
train_df['Age_Cat'] = pd.cut(train_df['Age'],
                             bins=[0, 3, 7, 15, 30, 60, 100],
                             include_lowest=True,
                             labels=['Baby', 'Children', 'Teenage', 'Young', 'Adult', 'Old'])

In [500]:
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

In [501]:
# VotingClassifier
# accuracy_score  : 0.8547
# precision_score : 0.8547
# recall_score    : 0.8547
# f1_score        : 0.8547

# train_df['IsAlone'] = 1
# train_df['IsAlone'].loc[train_df['FamilySize'] > 1] = 0

In [502]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

  train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)


Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.344168
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [503]:
train_df.loc[ train_df['Age'] <= 16, 'Age'] = 0
train_df.loc[(train_df['Age'] > 16) & (train_df['Age'] <= 32), 'Age'] = 1
train_df.loc[(train_df['Age'] > 32) & (train_df['Age'] <= 48), 'Age'] = 2
train_df.loc[(train_df['Age'] > 48) & (train_df['Age'] <= 64), 'Age'] = 3
train_df.loc[ train_df['Age'] > 64, 'Age'] = 4

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Cat,FamilySize,AgeBand
0,1,0,3,"Braund, Mr. Owen Harris",male,1.0,1,0,A/5 21171,7.25,,S,Young,2,"(16.336, 32.252]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2.0,1,0,PC 17599,71.2833,C85,C,Adult,2,"(32.252, 48.168]"
2,3,1,3,"Heikkinen, Miss. Laina",female,1.0,0,0,STON/O2. 3101282,7.925,,S,Young,1,"(16.336, 32.252]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2.0,1,0,113803,53.1,C123,S,Adult,2,"(32.252, 48.168]"
4,5,0,3,"Allen, Mr. William Henry",male,2.0,0,0,373450,8.05,,S,Adult,1,"(32.252, 48.168]"


### 레이블링 작업

In [504]:
le = LabelEncoder()
train_df['Sex'] = le.fit_transform(train_df['Sex'])
train_df['Embarked'] = le.fit_transform(train_df['Embarked'])
train_df['Age_Cat'] = le.fit_transform(train_df['Age_Cat'])

### 컬럼 삭제

In [505]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    int64   
 5   Age          891 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     891 non-null    int64   
 12  Age_Cat      891 non-null    int64   
 13  FamilySize   891 non-null    int64   
 14  AgeBand      891 non-null    category
dtypes: category(1), float64(2), int64(9), object(3)
memory usage: 98.7+ KB


In [506]:
train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'AgeBand'], axis=1, inplace=True)
train_df.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Cat,FamilySize
886,0,2,1,1.0,0,0,13.0,2,5,1
887,1,1,0,1.0,0,0,30.0,2,5,1
888,0,3,0,1.0,1,2,23.45,2,5,4
889,1,1,1,1.0,0,0,30.0,0,5,1
890,0,3,1,1.0,0,0,7.75,1,0,1


### 데이터셋 나누기

In [507]:
test_size = 0.2

x_train, x_test, y_train, y_test = train_test_split(train_df.drop(['Survived'], axis=1), train_df['Survived'], test_size=test_size, stratify=train_df['Survived'], random_state=11)

### 모델 선택 및 평가

#### 1. LogisticRegression

In [508]:
# STEP 1. 모델 객체 생성
model = LogisticRegression()

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8156
precision_score : 0.8156
recall_score    : 0.8156
f1_score        : 0.8156


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


##### 1.1 파라미터 최적화

In [509]:
model = LogisticRegression(n_jobs=-1)

params = {'penalty':['l2', 'l1', 'elasticnet', None],'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'] }

grid = GridSearchCV(model, param_grid=params, cv=3, refit=False)
grid.fit(x_train, y_train)

scores_df = pd.DataFrame(grid.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'penalty': 'l2', 'solver': 'lbfgs'}",0.787919,8,0.789916,0.78903,0.78481
1,"{'penalty': 'l2', 'solver': 'liblinear'}",0.799158,1,0.798319,0.793249,0.805907
2,"{'penalty': 'l2', 'solver': 'newton-cg'}",0.792138,4,0.789916,0.793249,0.793249
3,"{'penalty': 'l2', 'solver': 'newton-cholesky'}",0.792138,4,0.789916,0.793249,0.793249
4,"{'penalty': 'l2', 'solver': 'sag'}",0.693839,9,0.680672,0.704641,0.696203
5,"{'penalty': 'l2', 'solver': 'saga'}",0.692444,11,0.672269,0.71308,0.691983
6,"{'penalty': 'l1', 'solver': 'lbfgs'}",,14,,,
7,"{'penalty': 'l1', 'solver': 'liblinear'}",0.794945,2,0.794118,0.797468,0.793249
8,"{'penalty': 'l1', 'solver': 'newton-cg'}",,14,,,
9,"{'penalty': 'l1', 'solver': 'newton-cholesky'}",,14,,,


In [510]:
print('GridSearchCV 최적 파라미터:', grid.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid.best_score_))

GridSearchCV 최적 파라미터: {'penalty': 'l2', 'solver': 'liblinear'}
GridSearchCV 최고 정확도: 0.7992


##### 1.2 파라미터 적용

In [511]:
# STEP 1. 모델 객체 생성
model = LogisticRegression(penalty='l2', solver='liblinear')

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8212
precision_score : 0.8212
recall_score    : 0.8212
f1_score        : 0.8212


##### 1.3 교차검증

In [512]:
# 성능 지표는 정확도(accuracy) , 교차 검증 세트는 3개
scores = cross_val_score(model, x_train, y_train, cv=3)
print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))

교차 검증별 정확도: [0.7983 0.7932 0.8059]
평균 검증 정확도: 0.7992


#### 2. SGDClassifier

In [513]:
# STEP 1. 모델 객체 생성
model = SGDClassifier(random_state=123)

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.7989
precision_score : 0.7989
recall_score    : 0.7989
f1_score        : 0.7989


##### 2.1 파라미터 최적화

In [514]:
model = SGDClassifier(random_state=123, n_jobs=-1)

params = {'penalty':['l2', 'l1', 'elasticnet', None]}

grid = GridSearchCV(model, param_grid=params, cv=3, refit=True)
grid.fit(x_train, y_train)

# scores_df = pd.DataFrame(grid.cv_results_)
# scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]


In [515]:
print('GridSearchCV 최적 파라미터:', grid.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid.best_score_))

GridSearchCV 최적 파라미터: {'penalty': None}
GridSearchCV 최고 정확도: 0.7317


##### 2.2 파라미터 적용

In [516]:
# STEP 1. 모델 객체 생성
model = SGDClassifier(random_state=123)

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.7989
precision_score : 0.7989
recall_score    : 0.7989
f1_score        : 0.7989


##### 2.3 교차검증

In [517]:
# 성능 지표는 정확도(accuracy) , 교차 검증 세트는 3개
scores = cross_val_score(model, x_train, y_train, cv=3)
print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))

교차 검증별 정확도: [0.4202 0.4008 0.7764]
평균 검증 정확도: 0.5325


#### 3. RandomForestClassifier

In [518]:
# STEP 1. 모델 객체 생성
model = RandomForestClassifier(random_state=123)

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8324
precision_score : 0.8324
recall_score    : 0.8324
f1_score        : 0.8324


##### 3.1 파라미터 최적화

In [519]:
model = RandomForestClassifier(random_state=123, n_jobs=-1)

params = {
    'n_estimators':[10, 20, 50, 100, 200, 500, 1000],
    'max_depth' : [6, 8, 10, 12], 
    'min_samples_leaf' : [8, 12, 18],
    'min_samples_split' : [8, 16, 20]
}

grid = GridSearchCV(model, param_grid=params, cv=3, refit=False)
grid.fit(x_train , y_train)


In [520]:
print('GridSearchCV 최적 파라미터:', grid.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 50}
GridSearchCV 최고 정확도: 0.7992


##### 3.2 파라미터 적용

In [533]:
# STEP 1. 모델 객체 생성
model = RandomForestClassifier(random_state=123, n_estimators=50, max_depth=6, min_samples_leaf=8, min_samples_split=20)

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8715
precision_score : 0.8715
recall_score    : 0.8715
f1_score        : 0.8715


##### 3.3 교차검증

In [522]:
# 성능 지표는 정확도(accuracy) , 교차 검증 세트는 3개
scores = cross_val_score(model, x_train, y_train, cv=3)
print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))

교차 검증별 정확도: [0.7983 0.7932 0.8059]
평균 검증 정확도: 0.7992


#### 4. GradientBoostingClassifier

In [524]:
# STEP 1. 모델 객체 생성
model = GradientBoostingClassifier()

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8045
precision_score : 0.8045
recall_score    : 0.8045
f1_score        : 0.8045


##### 4.1 파라미터 최적화

In [527]:
model = GradientBoostingClassifier()

params = {
    'n_estimators':[10, 20, 50, 100, 200, 500, 1000],
    'learning_rate' : [ 0.05, 0.1]
}

grid = GridSearchCV(model, param_grid=params, cv=2, verbose=1, refit=False)
grid.fit(x_train , y_train)

Fitting 2 folds for each of 14 candidates, totalling 28 fits


In [528]:
print('GridSearchCV 최적 파라미터:', grid.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid.best_score_))

GridSearchCV 최적 파라미터: {'learning_rate': 0.1, 'n_estimators': 50}
GridSearchCV 최고 정확도: 0.8118


##### 4.2 파라미터 적용

In [530]:
# STEP 1. 모델 객체 생성
model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1)

# STEP 2. 모델 학습
model.fit(x_train, y_train)

# STEP 3. 모델 예측
y_pred = model.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8101
precision_score : 0.8101
recall_score    : 0.8101
f1_score        : 0.8101


##### 4.3 교차검증

In [531]:
# 성능 지표는 정확도(accuracy) , 교차 검증 세트는 3개
scores = cross_val_score(model, x_train, y_train, cv=3)
print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))

교차 검증별 정확도: [0.8319 0.8017 0.8017]
평균 검증 정확도: 0.8118


#### 5. VotingClassifier

In [532]:
# STEP 1. 모델 선택
lr_clf = LogisticRegression(penalty='l2', solver='liblinear')
rfc_clf = RandomForestClassifier(random_state=123, n_estimators=50, max_depth=6, min_samples_leaf=8, min_samples_split=20)
gbc_clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1)

# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기 
vo_clf = VotingClassifier(estimators=[('LR',lr_clf), ('RFC',rfc_clf), ('GBC',gbc_clf)], voting='soft')

# VotingClassifier 학습/예측/평가. 
vo_clf.fit(x_train, y_train)
y_pred = vo_clf.predict(x_test)

# SETP 4. 평가
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred, average='micro')))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred, average='micro')))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred, average='micro')))

accuracy_score  : 0.8547
precision_score : 0.8547
recall_score    : 0.8547
f1_score        : 0.8547


#### 결론
**RandomForestClassifier** 
- accuracy_score  : 0.8715
- precision_score : 0.8715
- recall_score    : 0.8715
- f1_score        : 0.8715
