## Basic Settings

In [86]:
# Connecting Google Drive: mount it at /content/drive.
# NOTE(review): the data paths used by later cells are relative ('drive/My Drive/...'),
# which presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [87]:
import pandas as pd

# Ignore warnings -> they are easy to confuse with errors.
# Note that the 'ignore' filter hides every warning category for all later cells.
import warnings
warnings.filterwarnings('ignore')

## Train Data Preprocessing

In [88]:
# Load the train data from the mounted Drive folder
train = pd.read_csv("drive/My Drive/titanic_data/train.csv")

In [89]:
# Preprocessing function
def preprocessingData(df):
  """Return a cleaned copy of a passenger DataFrame; the input is not modified.

  Steps:
    1) Missing values: 'Age' -> column mean, 'Cabin' -> column dropped,
       'Embarked' -> most frequent value.
    2) Categorical data: 'Sex' and 'Embarked' are one-hot encoded and the
       original columns are removed.
    3) The 'Name' and 'Ticket' columns are dropped.

  Args:
    df: DataFrame containing the 'Age', 'Cabin', 'Embarked', 'Sex', 'Name'
      and 'Ticket' columns. Any other columns are passed through unchanged.

  Returns:
    A new DataFrame with the remaining original columns (in their original
    order) followed by the one-hot columns produced by pd.get_dummies.

  Raises:
    KeyError: if one of the required columns is missing.
  """
  # Work on a copy so the column assignments below never write into the
  # caller's frame.
  df = df.copy()

  # 1) Missing values
  df['Age'] = df['Age'].fillna(df['Age'].mean())
  embarked_modes = df['Embarked'].mode()
  # mode() is empty when the column holds no non-null value; nothing to fill with then
  if not embarked_modes.empty:
    df['Embarked'] = df['Embarked'].fillna(embarked_modes[0])

  # 2) Categorical data: append the one-hot columns after the existing ones
  df_for_encoding = pd.get_dummies(df[['Sex', 'Embarked']])
  df = pd.concat([df, df_for_encoding], axis=1)

  # 3) Drop the encoded source columns and the unused columns in one pass
  return df.drop(['Cabin', 'Sex', 'Embarked', 'Name', 'Ticket'], axis=1)

In [90]:
# Preprocess the training data and preview the first rows.
# `train` is rebound to the processed frame, so re-running this cell alone passes
# already-processed data (no 'Cabin' column any more) back into preprocessingData.
train = preprocessingData(train)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,False,True,False,False,True
1,2,1,1,38.0,1,0,71.2833,True,False,True,False,False
2,3,1,3,26.0,0,0,7.925,True,False,False,False,True
3,4,1,1,35.0,1,0,53.1,True,False,False,False,True
4,5,0,3,35.0,0,0,8.05,False,True,False,False,True


## Test Data Preprocessing

In [91]:
# Load the test data from the mounted Drive folder
test = pd.read_csv("drive/My Drive/titanic_data/test.csv")

In [92]:
# Inspect the test data (first row only)
test.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q


In [93]:
# Compare shapes: the preprocessed train frame vs. the still-raw test frame
train.shape, test.shape

((891, 12), (418, 11))

In [94]:
# Check missing values: count of nulls per column in the raw test data
test.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [95]:
# Preprocess exactly like the training data, then check missing values again.
# preprocessingData does not touch 'Fare', so nulls in that column remain here.
test = preprocessingData(test)
test.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Age,0
SibSp,0
Parch,0
Fare,1
Sex_female,0
Sex_male,0
Embarked_C,0
Embarked_Q,0


In [96]:
# Replace the missing 'Fare' with the median instead of dropping the row ->
# rows of the test data must never be deleted.
fare_median = test['Fare'].median()
test['Fare'] = test['Fare'].fillna(fare_median)

In [97]:
# Check missing values once more after filling 'Fare'
test.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Age,0
SibSp,0
Parch,0
Fare,0
Sex_female,0
Sex_male,0
Embarked_C,0
Embarked_Q,0


## Base Model

In [98]:
# scikit_learn 에서 randomforest classifier 불러오기
from sklearn.ensemble import RandomForestClassifier

In [99]:
# T: the Survived labels; X: every column except Survived and PassengerId
X = train.drop(columns=['Survived', 'PassengerId'])
T = train['Survived']

In [100]:
# Same for the test data: keep every column except PassengerId
X_test = test.drop(columns=['PassengerId'])

In [101]:
# Train the baseline model.
# Two variants were considered:
#   1) all-default hyperparameters, i.e. RandomForestClassifier() (not used)
#   2) explicit hyperparameters (used below)
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1).fit(X, T)

# Feed X_test to the fitted model to get the predicted labels
y = model.predict(X_test)

In [102]:
# Display the labels predicted for the test data
y

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [103]:
# Turn the predictions (y) into a DataFrame indexed by PassengerId
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y}).set_index("PassengerId")

# Save as csv (the PassengerId index is written as the first column)
output.to_csv('drive/My Drive/Savings/Submission2.csv')

In [104]:
# Display the submission frame that was just saved
output

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


## Splitting the Training Data

In [105]:
# Split the Training Data into Training Data and Validation Data
from sklearn.model_selection import train_test_split

T = train["Survived"]
X = train.drop(columns=['Survived', 'PassengerId'])

# Hold out 25% of the rows for validation; the seed is fixed at 0
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, T, test_size=0.25, random_state=0
)

## Models

In [106]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Build the GNB model and fit it on X_train, Y_train
gaussian = GaussianNB().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
Y_pred = gaussian.predict(X_validation)

# Accuracy in percent (two decimals), checked against the validation labels
accuracy_gaussian = round(100 * accuracy_score(Y_pred, Y_validation), 2)
print(accuracy_gaussian)

78.92


In [107]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Build the Logistic Regression model and fit it on X_train, Y_train
logreg = LogisticRegression().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
Y_pred = logreg.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_logreg = round(100 * accuracy_score(Y_pred, Y_validation), 2)
print(accuracy_logreg)

80.27


In [108]:
# SVM: Support Vector Machines
from sklearn.svm import SVC

# Build the SVC model and fit it on X_train, Y_train
svc = SVC().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = svc.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_svm = round(100 * accuracy_score(y_pred, Y_validation), 2)
print(accuracy_svm)

72.2


In [109]:
# Linear SVC
from sklearn.svm import LinearSVC

# Fit the LinearSVC model on X_train, Y_train.
# random_state is pinned so the score does not vary between runs when the
# solver shuffles the data (whether it does depends on the `dual` setting).
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = linear_svc.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_linear_svc = round(accuracy_score(y_pred, Y_validation) * 100, 2)
print(accuracy_linear_svc)

78.92


In [110]:
# Perceptron
from sklearn.linear_model import Perceptron

# Build the Perceptron model and fit it on X_train, Y_train
perceptron = Perceptron().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = perceptron.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_perceptron = round(100 * accuracy_score(y_pred, Y_validation), 2)
print(accuracy_perceptron)

72.65


In [111]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Fit the DecisionTreeClassifier on X_train, Y_train.
# random_state is pinned (same seed as the train/validation split) so the tree,
# and therefore the score compared against the other models, is reproducible.
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = decisiontree.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_decisiontree = round(accuracy_score(y_pred, Y_validation) * 100, 2)
print(accuracy_decisiontree)

75.34


In [112]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fit the RandomForestClassifier on X_train, Y_train.
# random_state is pinned (same seed as the train/validation split) so the forest,
# and therefore the score compared against the other models, is reproducible.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = randomforest.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_randomforest = round(accuracy_score(y_pred, Y_validation) * 100, 2)
print(accuracy_randomforest)

81.61


In [113]:
# kNN: k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Build the knn model and fit it on X_train, Y_train
knn = KNeighborsClassifier().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = knn.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_knn = round(100 * accuracy_score(y_pred, Y_validation), 2)
print(accuracy_knn)

73.54


In [114]:
# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier

# Fit the SGDClassifier on X_train, Y_train.
# random_state is pinned (same seed as the train/validation split) so the data
# shuffling, and therefore the score, is reproducible between runs.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = sgd.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_sgd = round(accuracy_score(y_pred, Y_validation) * 100, 2)
print(accuracy_sgd)

78.48


In [115]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

# Fit the GradientBoostingClassifier on X_train, Y_train.
# random_state is pinned (same seed as the train/validation split) so the
# ensemble, and therefore the score, is reproducible between runs.
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = gbc.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_gbc = round(accuracy_score(y_pred, Y_validation) * 100, 2)
print(accuracy_gbc)

85.2


In [116]:
# XGBoost Classifier
from xgboost import XGBClassifier

# Build the XGBClassifier model and fit it on X_train, Y_train
xgb = XGBClassifier().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = xgb.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_xgb = round(100 * accuracy_score(y_pred, Y_validation), 2)
print(accuracy_xgb)

82.96


In [117]:
# LightGBM Classifier
from lightgbm import LGBMClassifier

# Build the LGBMClassifier model and fit it on X_train, Y_train
lgbc = LGBMClassifier().fit(X_train, Y_train)

# Predict labels for the validation data with the fitted model
y_pred = lgbc.predict(X_validation)

# Accuracy in percent (two decimals)
accuracy_lgbc = round(100 * accuracy_score(y_pred, Y_validation), 2)
print(accuracy_lgbc)

[LightGBM] [Info] Number of positive: 258, number of negative: 410
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 201
[LightGBM] [Info] Number of data points in the train set: 668, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.386228 -> initscore=-0.463198
[LightGBM] [Info] Start training from score -0.463198
85.65


In [118]:
# Store each model's name and score in the 'models' DataFrame.
# Every name sits next to its own score so the two cannot drift out of step.
models = pd.DataFrame(
    [
        ('Support Vector Machines', accuracy_svm),
        ('KNN', accuracy_knn),
        ('Logistic Regression', accuracy_logreg),
        ('Random Forest', accuracy_randomforest),
        ('Naive Bayes', accuracy_gaussian),
        ('Perceptron', accuracy_perceptron),
        ('Linear SVC', accuracy_linear_svc),
        ('Decision Tree', accuracy_decisiontree),
        ('Stochastic Gradient Descent', accuracy_sgd),
        ('Gradient Boosting Classifier', accuracy_gbc),
        ('XGBClassifier', accuracy_xgb),
        ('LGBMClassifier', accuracy_lgbc),
    ],
    columns=['Model', 'Score'],
)
# Show the rows ordered from the highest score to the lowest
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
11,LGBMClassifier,85.65
9,Gradient Boosting Classifier,85.2
10,XGBClassifier,82.96
3,Random Forest,81.61
2,Logistic Regression,80.27
4,Naive Bayes,78.92
6,Linear SVC,78.92
8,Stochastic Gradient Descent,78.48
7,Decision Tree,75.34
1,KNN,73.54


In [119]:
# Predict on the test data with the already-fitted lgbc model
# (PassengerId is kept aside for the submission and not used as a feature)
ids = test['PassengerId']
predictions = lgbc.predict(test.drop(columns='PassengerId'))

In [120]:
# Pair each PassengerId with its predicted label and save the result as csv
# (index=False: the row index is not written to the file)
output = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})
output.to_csv('drive/My Drive/Savings/LastSubmission.csv', index=False)

## Train Data 전체 학습시키기

In [121]:
# Use the whole Train Data (no validation split) as features and labels
X = train.drop(columns=['Survived', 'PassengerId'])
T = train['Survived']
X_test = test.drop(columns=['PassengerId'])

In [122]:
# Fit a fresh lgbc model on the whole train data
lgbc = LGBMClassifier().fit(X, T)

# Predict the labels of the test data
y_pred = lgbc.predict(X_test)

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 227
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [123]:
# Save the prediction results: one row per PassengerId with its predicted label
# (index=False: the row index is not written to the file)
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})
output.to_csv('drive/My Drive/Savings/ReallyLastSubmission.csv', index=False)