# Template for Machine Learning using scikit-learn, XGBoost, LightGBM, TensorFlow

Copyright (c) 2022, Hyunsoek Choi, All rights reserved

In [1]:
# library
import numpy as np
import pandas as pd

### Data Load

In [2]:
# data load from csv files
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

### Pre-processing

#### Missing Value

In [3]:
pd.DataFrame({'Train': df_train.isnull().sum(), 'Test': df_test.isnull().sum()})

Unnamed: 0,Train,Test
Age,177,86.0
Cabin,687,327.0
Embarked,2,0.0
Fare,0,1.0
Name,0,0.0
Parch,0,0.0
PassengerId,0,0.0
Pclass,0,0.0
Sex,0,0.0
SibSp,0,0.0


* Age

Age 결측치, Name TItle 중앙값으로 채우기

In [4]:
# Name에서 TItle 추출
df_train['Title'] = df_train['Name'].str.extract(' (\S+)\.')
df_test['Title'] = df_test['Name'].str.extract(' (\S+)\.')
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr


In [5]:
df_train['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [6]:
pd.crosstab(df_train['Title'], df_train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [7]:
# 프랑스어 표현 영어로 대체 및 소수 명칭은 other로 
other_list = ['Capt', 'Col', 'Countess', 'Don','Dona', 'Jonkheer', 'Lady','Major', 'Rev', 'Sir']

# Train
df_train['Title'].replace('Mlle', 'Miss', inplace=True) # 프랑스어
df_train['Title'].replace('Mme', 'Mrs', inplace=True) # 프랑스어
df_train['Title'].replace('Ms', 'Miss', inplace=True) # 영국식 호칭
df_train['Title'].replace(other_list, 'Other', inplace=True)

# Test
df_test['Title'].replace('Mlle', 'Miss', inplace=True) # 프랑스어
df_test['Title'].replace('Mme', 'Mrs', inplace=True) # 프랑스어
df_test['Title'].replace('Ms', 'Miss', inplace=True) # 영국식 호칭
df_test['Title'].replace(other_list, 'Other', inplace=True)

In [8]:
pd.crosstab(df_train['Title'], df_train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Dr,1,6
Master,0,40
Miss,185,0
Mr,0,517
Mrs,126,0
Other,2,14


In [9]:
# 학습데이터 결측치 처리
age_dr = df_train[df_train['Title']=='Dr']['Age'].median()
age_master = df_train[df_train['Title']=='Master']['Age'].median()
age_miss = df_train[df_train['Title']=='Miss']['Age'].median()
age_mr = df_train[df_train['Title']=='Mr']['Age'].median()
age_mrs = df_train[df_train['Title']=='Mrs']['Age'].median()
age_other = df_train[df_train['Title']=='Other']['Age'].median()

df_train.loc[(df_train['Title']=='Dr') & (df_train['Age'].isnull()), 'Age'] = age_dr
df_train.loc[(df_train['Title']=='Master') & (df_train['Age'].isnull()), 'Age'] = age_master
df_train.loc[(df_train['Title']=='Miss') & (df_train['Age'].isnull()), 'Age'] = age_miss
df_train.loc[(df_train['Title']=='Mr') & (df_train['Age'].isnull()), 'Age'] = age_mr
df_train.loc[(df_train['Title']=='Mrs') & (df_train['Age'].isnull()), 'Age'] = age_mrs
df_train.loc[(df_train['Title']=='Other') & (df_train['Age'].isnull()), 'Age'] = age_other

# 테스트데이터 결측치 처리 (학습 데이터 중앙값으로 대체)
df_test.loc[(df_test['Title']=='Dr') & (df_test['Age'].isnull()), 'Age'] = age_dr
df_test.loc[(df_test['Title']=='Master') & (df_test['Age'].isnull()), 'Age'] = age_master
df_test.loc[(df_test['Title']=='Miss') & (df_test['Age'].isnull()), 'Age'] = age_miss
df_test.loc[(df_test['Title']=='Mr') & (df_test['Age'].isnull()), 'Age'] = age_mr
df_test.loc[(df_test['Title']=='Mrs') & (df_test['Age'].isnull()), 'Age'] = age_mrs
df_test.loc[(df_test['Title']=='Other') & (df_test['Age'].isnull()), 'Age'] = age_other

* Fare

1개의 NaN, Fare의 최빈값으로 대체

In [10]:
df_test.loc[df_test['Fare'].isnull(), 'Fare'] = df_train['Fare'].mode()[0]

* Embark

최빈값으로 대체

In [11]:
df_train.loc[df_train['Embarked'].isnull(), 'Embarked'] = df_train['Embarked'].mode()[0]

#### Discretization and Feature Engineering

In [12]:
# Sex
df_train['Gender'] = df_train['Sex'].map({'male': 0, 'female':1})
df_test['Gender'] = df_test['Sex'].map({'male':0, 'female':1})

# SibSp + Parch = Family
df_train['Family'] = df_train['SibSp'] + df_train['Parch']
df_test['Family'] = df_test['SibSp'] + df_test['Parch']

# Embark
df_train['Boadrind'] = df_train['Embarked'].map({'S':0, 'C':1, 'Q':2})
df_test['Boadrind'] = df_test['Embarked'].map({'S':0, 'C':1, 'Q':2})

#### Feature Selection

In [13]:
cols = ['Pclass', 'Age', 'Fare', 'Gender', 'Family', 'Boadrind']

x_train = df_train[cols]
y_train = df_train['Survived']

x_test = df_test[cols]

In [14]:
x_train.shape, y_train.shape

((891, 6), (891,))

In [15]:
x_test.shape

(418, 6)

In [16]:
## 데이터 저장
# titanic_train = pd.concat([x_train, y_train], axis=1)
# titanic_train.to_pickle('titanic_train.pkl')

In [17]:
## 데이터 로드
# titanic_train = pd.read_pickle('titanic_train.pkl')
x_train = titanic_train.loc[:, 'Pclass':'Boadrind']
y_train = titanic_train['Survived']

NameError: name 'titanic_train' is not defined

### Machine Learning

In [None]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0) # model define
model.fit(x_train, y_train) # model fitting

y_pred = model.predict(x_train) # inference

# accuracy
lr_score = model.score(x_train, y_train)
lr_score

In [None]:
## Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

model = GaussianNB() # model define
model.fit(x_train, y_train) # model fitting

y_pred = model.predict(x_train) # inference

# accuracy
nb_score = model.score(x_train, y_train)
nb_score

In [None]:
## K-nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3) # define
model.fit(x_train, y_train) # fitting

y_pred = model.predict(x_train) # inference

# accuracy
knn_score = model.score(x_train, y_train)
knn_score

In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=3)
model.fit(x_train, y_train)

# inference
y_pred = model.predict(x_train)

# accuracy
dt_score = model.score(x_train, y_train)
dt_score

In [None]:
# decision tree visualization
import matplotlib.pyplot as plt
from sklearn import tree

plt.figure(figsize=(15,9))
annots = tree.plot_tree(model, class_names=['0', '1'], feature_names=x_train.columns, filled=True, rounded=True)

In [None]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10, max_depth=3) # define
model.fit(x_train, y_train) # fitting

y_pred = model.predict(x_train)

# accuracy
rf_score = model.score(x_train, y_train)
rf_score

In [None]:
## Support Vector Machine
from sklearn.svm import SVC

model = SVC(kernel='rbf', gamma='auto') # define
model.fit(x_train, y_train) # fitting

y_pred = model.predict(x_train)

# accuracy
svm_score = model.score(x_train, y_train)
svm_score

In [None]:
## Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100)
model.fit(x_train, y_train)

y_pred = model.predict(x_train)

# accuracy
gb_score = model.score(x_train, y_train)
gb_score

In [None]:
## MLP by scikit-learn
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(max_iter=1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_train)

# accuracy
mlp_score = model.score(x_train, y_train)
mlp_score

In [None]:
## XGBoost
# (참고) XGBoost 개념 정리 https://webnautes.tistory.com/1643
from xgboost import XGBClassifier, plot_importance

model = XGBClassifier(n_estimators=400)
model.fit(x_train, y_train)

y_pred = model.predict(x_train)

# acuracy
xgb_score = model.score(x_train, y_train)
xgb_score

In [None]:
plot_importance(model)

In [None]:
## LGBM (LightGBM)
from lightgbm import LGBMClassifier

model = LGBMClassifier(n_estimators=400)
model.fit(x_train, y_train)

y_pred = model.predict(x_train)

# accuracy
lgbm_score = model.score(x_train, y_train)
lgbm_score

In [None]:
models = pd.DataFrame(
    data = [
        ['Logistic Regression', lr_score],
        ['Gaussian Naive Bayes', nb_score],
        ['K-nearest Neighbors', knn_score],
        ['Decision Tree', dt_score],
        ['Random Forest', rf_score],
        ['Support Vector Machine', svm_score],
        ['Gradient Boosting', gb_score],
        ['MLP', mlp_score],
        ['XGBoost', xgb_score],
        ['LightGBM', lgbm_score]
    ],
    columns = ['Model', 'Score']
)

models.set_index('Model', inplace=True)
models.sort_values('Score', ascending=False)

In [None]:
from tensorflow.keras.utils import to_categorical

y_train_onehot = to_categorical(y_train)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
m = Sequential()
m.add(Dense(12, input_shape=(6,), activation='relu'))
m.add(Dense(2, activation='softmax'))

m.compile(loss='categorical_crossentropy', metrics='acc')

In [None]:
hist = m.fit(x_train, y_train_onehot,
             epochs=100,
             validation_split=0.2,
             verbose=0)

In [None]:
pd.DataFrame(hist.history)

In [None]:
pd.DataFrame(hist.history).plot()

In [None]:
pd.DataFrame(hist.history)[['acc', 'val_acc']].plot()

In [None]:
pd.DataFrame(hist.history)[['loss', 'val_loss']].plot()

# Submission

In [18]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10, max_depth=3) # define
model.fit(x_train, y_train) # fitting

y_pred = model.predict(x_test)

In [36]:
# XGBoost 

from xgboost import XGBClassifier, plot_importance

model = XGBClassifier(n_estimators=400)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
from sklearn.model_selection import GridSearchCV

xgb_param_grid={
    'n_estimators' : [100,200,300,400,500],
    'learning_rate' : [0.01,0.05,0.1,0.15],
    'max_depth' : [3,5,7,10,15],
    'gamma' : [0,1,2,3],
    'colsample_bytree' : [0.8,0.9],
    
}

#score종류는 acc,f1,f1_micro,f1_macro등 원하는걸로 설정)
#여기서 설정 파라미터의 갯수(총 4000개의 조합이므로 4000번의 학습이 돌아감)
xgb_grid=GridSearchCV(model, param_grid = xgb_param_grid, scoring="f1_macro", n_jobs=-1, verbose = 2)
xgb_grid.fit(x_train, y_train)

#best f1_macro 수치와 best parameter확인
print("best f1_macro : {0: .4f}".format(xgb_grid.best_score_))
print("best param : ",xgb_grid.best_params_)

#dataframe으로 랭킹순보기
result_df = pd.DataFrame(xgb_grid.cv_results_)
result_df.sort_values(by=['rank_test_score'],inplace=True)

#plot
result_df[['params','mean_test_score','rank_test_score']].head(10)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


In [22]:
#LGBM
from lightgbm import LGBMClassifier
model = LGBMClassifier(n_estimators=400)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [25]:
## MLP by scikit-learn
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(max_iter=1000)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [None]:
# Submission
submission = pd.DataFrame({"PassengerId" : df_test['PassengerId'], "Survived":rf_pred})

submission

In [31]:
submission.to_csv("ML_Template_GB_submission.csv", index = False)