In [None]:
# Python : 3.9.18
# Numpy : 1.26.0
# Pandas : 2.1.1
# Matplotlib : 3.7.2
# Seaborn : 0.12.2
# Scikit-learn : 1.3.0
# Created: OCT. 23. 2023
# Author: D.W. SHIN

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [None]:
# Read the training and test CSV files from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# Summarise columns, dtypes and non-null counts of the training frame.
train_df.info()

### 결측치 처리 부분

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in newer pandas and does not update the frame under Copy-on-Write.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

# Remaining missing ages (expected to be 0 after imputation).
train_df['Age'].isnull().sum()

In [None]:
# Impute missing embarkation ports with the most frequent value (first mode).
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in newer pandas and does not update the frame under Copy-on-Write.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Remaining missing values (expected to be 0 after imputation).
train_df['Embarked'].isnull().sum()

### 추가 컬럼

In [None]:
# Bucket Age into six named life-stage categories.
# Bin edges are right-inclusive; include_lowest=True also keeps Age == 0.
age_cat_bins = [0, 3, 7, 15, 30, 60, 100]
age_cat_labels = ['Baby', 'Children', 'Teenage', 'Young', 'Adult', 'Old']

train_df['Age_Cat'] = pd.cut(
    train_df['Age'],
    bins=age_cat_bins,
    labels=age_cat_labels,
    include_lowest=True,
)

In [None]:
# Family size = siblings/spouses + parents/children + 1 (the extra 1 is added
# on top of the two counts).
train_df['FamilySize'] = train_df['SibSp'].add(train_df['Parch']).add(1)

In [None]:
# NOTE(review): this cell contains no executable code. The scores below look
# like a VotingClassifier result recorded while the (now disabled) 'IsAlone'
# feature was being tried — confirm before relying on them.
# VotingClassifier
# accuracy_score  : 0.8547
# precision_score : 0.8547
# recall_score    : 0.8547
# f1_score        : 0.8547

# Disabled: flag passengers travelling alone (FamilySize == 1).
# train_df['IsAlone'] = 1
# train_df['IsAlone'].loc[train_df['FamilySize'] > 1] = 0

In [None]:
# Split Age into 5 equal-width intervals and show the mean survival per band.
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)

# AgeBand is categorical: pass observed=False explicitly so the result keeps
# one row per band and pandas does not warn about the changing default.
(
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False, observed=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)

In [None]:
# Replace Age by an ordinal code 0..4 using 16-year-wide ranges.
# The ranges are applied in ascending order, exactly like a chain of
# `.loc[...] = code` statements; codes already written (0..3) never fall into
# a later range because every later range starts above 16.
# NOTE: this overwrites Age in place, so re-running the cell on already coded
# values maps everything to 0.
age_code_ranges = [(-np.inf, 16), (16, 32), (32, 48), (48, 64), (64, np.inf)]
for code, (low, high) in enumerate(age_code_ranges):
    in_range = (train_df['Age'] > low) & (train_df['Age'] <= high)
    train_df.loc[in_range, 'Age'] = code

train_df.head()

### 레이블링 작업

In [None]:
# Integer-encode the categorical columns.
# A separate LabelEncoder is fitted and kept per column in `label_encoders`,
# so every learned mapping stays available (e.g. for inverse_transform or for
# encoding another frame the same way). Re-fitting one shared encoder would
# keep only the mapping of the last column.
label_encoders = {}
for col in ['Sex', 'Embarked', 'Age_Cat']:
    label_encoders[col] = LabelEncoder()
    train_df[col] = label_encoders[col].fit_transform(train_df[col])

# `le` stays bound to the last-fitted encoder (Age_Cat) for compatibility.
le = label_encoders['Age_Cat']

### 컬럼 삭제

In [None]:
# Summarise columns, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Drop the identifier/text columns and the AgeBand interval column.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'AgeBand']
train_df.drop(columns=unused_columns, inplace=True)

train_df.tail()

#### gcp에 소스 파일 만들기

In [None]:
# Stack the processed frame on top of a copy of itself, so every row appears
# twice in new_df (original index labels are kept and therefore repeat).
train2_df = train_df.copy()
new_df = pd.concat((train_df, train2_df), axis=0)

new_df.info()



In [None]:
# Preview the first rows of the combined frame.
new_df.head()

In [None]:
# Replace the repeated index labels left by the concat with a fresh
# 0..n-1 index; drop=True discards the old labels instead of adding a column.
new_df = new_df.reset_index(drop=True)

In [None]:
# Preview the first rows again after the index reset.
new_df.head()

In [None]:
# Write the combined frame to a comma-separated file without the index column;
# missing values are written as the literal text 'NaN'.
new_df.to_csv('titan_new_data3.csv', index=False, sep=',', na_rep='NaN')

In [None]:
# Read the exported CSV back in and summarise its columns and dtypes.
all_df = pd.read_csv('titan_new_data3.csv')

all_df.info()

In [None]:
# Count missing values per column in the re-loaded frame.
all_df.isna().sum()

### 데이터셋 나누기

In [None]:
# Hold out 20% of train_df for evaluation, stratified on the target so both
# splits keep the same Survived ratio; random_state fixes the shuffle.
test_size = 0.2

features = train_df.drop(columns=['Survived'])
target = train_df['Survived']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    stratify=target,
    random_state=11,
)

### 모델 선택 및 평가

#### 1. LogisticRegression

In [None]:
# STEP 1. Create the model object
model = LogisticRegression()

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 1.1 파라미터 최적화

In [None]:
# Grid-search penalty/solver for logistic regression (3-fold CV, accuracy).
# n_jobs is set on the search rather than on the estimator: for a binary
# target the estimator-level n_jobs has no effect (and warns with liblinear).
model = LogisticRegression()

# Only solver/penalty pairs that LogisticRegression accepts are listed; a full
# cross product would include invalid pairs whose fits fail and score NaN.
# 'elasticnet' is saga-only and needs an explicit l1_ratio.
params = [
    {'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'], 'penalty': ['l2', None]},
    {'solver': ['liblinear'], 'penalty': ['l2', 'l1']},
    {'solver': ['saga'], 'penalty': ['l2', 'l1', None]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
]

# refit=False: only the CV scores are needed here, no final refit.
grid = GridSearchCV(model, param_grid=params, cv=3, refit=False, n_jobs=-1)
grid.fit(x_train, y_train)

# Per-candidate results: parameters, mean score, rank and per-fold scores.
scores_df = pd.DataFrame(grid.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

In [None]:
# Report the best parameter combination and its mean cross-validated score.
best_params, best_score = grid.best_params_, grid.best_score_
print('GridSearchCV 최적 파라미터:', best_params)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(best_score))

##### 1.2 파라미터 적용

In [None]:
# STEP 1. Create the model object with the chosen penalty/solver
model = LogisticRegression(penalty='l2', solver='liblinear')

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 1.3 교차검증

In [None]:
# Metric is accuracy (the classifier's default score); 3 cross-validation folds.
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=3)
print('교차 검증별 정확도:', scores.round(4))
print('평균 검증 정확도:', scores.mean().round(4))

#### 2. SGDClassifier

In [None]:
# STEP 1. Create the model object (seeded for reproducible shuffling)
model = SGDClassifier(random_state=123)

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 2.1 파라미터 최적화

In [None]:
# Grid-search the regularisation penalty of an SGD classifier (3-fold CV).
model = SGDClassifier(n_jobs=-1, random_state=123)

params = dict(penalty=['l2', 'l1', 'elasticnet', None])

# refit=True: after the search the best setting is re-fitted on all of x_train.
grid = GridSearchCV(estimator=model, param_grid=params, refit=True, cv=3)
grid.fit(x_train, y_train)

# scores_df = pd.DataFrame(grid.cv_results_)
# scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]


In [None]:
# Report the best parameter combination and its mean cross-validated score.
best_params, best_score = grid.best_params_, grid.best_score_
print('GridSearchCV 최적 파라미터:', best_params)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(best_score))

##### 2.2 파라미터 적용

In [None]:
# STEP 1. Create the model object
# NOTE(review): the constructor arguments are the same as in the baseline SGD
# cell, i.e. no grid-search result is passed in and the default penalty is
# used — confirm this matches the search outcome.
model = SGDClassifier(random_state=123)

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 2.3 교차검증

In [None]:
# Metric is accuracy (the classifier's default score); 3 cross-validation folds.
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=3)
print('교차 검증별 정확도:', scores.round(4))
print('평균 검증 정확도:', scores.mean().round(4))

#### 3. RandomForestClassifier

In [None]:
# STEP 1. Create the model object (seeded for reproducible trees)
model = RandomForestClassifier(random_state=123)

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 3.1 파라미터 최적화

In [None]:
# Grid-search forest size and tree-shape limits (3-fold CV).
# 7 * 4 * 3 * 3 = 252 candidates, each fitted 3 times.
model = RandomForestClassifier(n_jobs=-1, random_state=123)

params = dict(
    n_estimators=[10, 20, 50, 100, 200, 500, 1000],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# refit=False: only the CV scores are needed here, no final refit.
grid = GridSearchCV(estimator=model, param_grid=params, refit=False, cv=3)
grid.fit(x_train, y_train)


In [None]:
# Report the best parameter combination and its mean cross-validated score.
best_params, best_score = grid.best_params_, grid.best_score_
print('GridSearchCV 최적 파라미터:', best_params)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(best_score))

##### 3.2 파라미터 적용

In [None]:
# STEP 1. Create the model object with the chosen hyper-parameters
model = RandomForestClassifier(random_state=123, n_estimators=50, max_depth=6, min_samples_leaf=8, min_samples_split=20)

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 3.3 교차검증

In [None]:
# Metric is accuracy (the classifier's default score); 3 cross-validation folds.
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=3)
print('교차 검증별 정확도:', scores.round(4))
print('평균 검증 정확도:', scores.mean().round(4))

#### 4. GradientBoostingClassifier

In [None]:
# STEP 1. Create the model object
# Seeded like the other estimators in this notebook so reruns give the same
# trees (the seed drives the per-split feature permutation).
model = GradientBoostingClassifier(random_state=123)

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 4.1 파라미터 최적화

In [None]:
# Grid-search the number of boosting stages and the learning rate.
# The estimator is seeded (like the other estimators in this notebook) so the
# search result is reproducible across runs.
model = GradientBoostingClassifier(random_state=123)

params = {
    'n_estimators':[10, 20, 50, 100, 200, 500, 1000],
    'learning_rate' : [ 0.05, 0.1]
}

# 2-fold CV with progress output; refit=False because only scores are needed.
grid = GridSearchCV(model, param_grid=params, cv=2, verbose=1, refit=False)
grid.fit(x_train , y_train)

In [None]:
# Report the best parameter combination and its mean cross-validated score.
best_params, best_score = grid.best_params_, grid.best_score_
print('GridSearchCV 최적 파라미터:', best_params)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(best_score))

##### 4.2 파라미터 적용

In [None]:
# STEP 1. Create the model object with the chosen hyper-parameters
# Seeded like the other estimators in this notebook so reruns give the same
# trees (the seed drives the per-split feature permutation).
model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=123)

# STEP 2. Train the model
model.fit(x_train, y_train)

# STEP 3. Predict on the held-out split
y_pred = model.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

##### 4.3 교차검증

In [None]:
# Metric is accuracy (the classifier's default score); 3 cross-validation folds.
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=3)
print('교차 검증별 정확도:', scores.round(4))
print('평균 검증 정확도:', scores.mean().round(4))

#### 5. VotingClassifier

In [None]:
# STEP 1. Select the base models (same settings as the tuned cells above)
lr_clf = LogisticRegression(penalty='l2', solver='liblinear')
rfc_clf = RandomForestClassifier(random_state=123, n_estimators=50, max_depth=6, min_samples_leaf=8, min_samples_split=20)
# Seeded like rfc_clf so the ensemble is reproducible across runs.
gbc_clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=123)

# Combine the individual models into a soft-voting ensemble
# (predictions come from the averaged class probabilities).
vo_clf = VotingClassifier(estimators=[('LR',lr_clf), ('RFC',rfc_clf), ('GBC',gbc_clf)], voting='soft')

# Fit the VotingClassifier and predict on the held-out split.
vo_clf.fit(x_train, y_train)
y_pred = vo_clf.predict(x_test)

# STEP 4. Evaluate
# precision/recall/F1 use the default average='binary' (score of the positive
# class, pos_label=1 — assumes Survived is coded 0/1). With average='micro'
# all three reduce to accuracy on a single-label target, so they would carry
# no extra information.
print('accuracy_score  : {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('precision_score : {0:.4f}'.format(precision_score(y_test, y_pred)))
print('recall_score    : {0:.4f}'.format(recall_score(y_test, y_pred)))
print('f1_score        : {0:.4f}'.format(f1_score(y_test, y_pred)))

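#### 결론
**RandomForestClassifier**
- accuracy_score  : 0.8715
- precision_score : 0.8715
- recall_score    : 0.8715
- f1_score        : 0.8715

Note: the precision, recall and F1 values above were computed with `average='micro'`, which on a single-label (binary) target is always equal to accuracy — this is why all four numbers are identical.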
