In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [93]:
# data load
base_path = 'titanic_dataset/'

train = pd.read_csv(base_path + 'train.csv')
test = pd.read_csv(base_path + 'test.csv')
submission = pd.read_csv(base_path + 'gender_submission.csv')

# 결측치 처리

In [94]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [95]:
train[train.isnull().any(axis=1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [96]:
# Null in Embarked
train[train.Embarked.isnull()]

# train.Embarked.value_counts() --> "S"
# train.loc[(train.Pclass == 1) & (train.Sex == 'female'),'Embarked'].value_counts() --> "S"

train.loc[train.Embarked.isnull(), 'Embarked'] = "S"

In [97]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [98]:
# missing value handling
train.Cabin.value_counts() # --> drop
# drop columns : PassengerId, Name, Ticket, Cabin
train = train.drop(columns=['PassengerId','Name','Ticket','Cabin'])
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [99]:
# Null in 'Age'
train = train.fillna(train.Age.mean())
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


# Feature Engineering

In [100]:
# Ordinal Encoding -> ordinal feature를 변환 (ex/ 학력, 선호도~)
# One-hot Encoding -> Nominal feature를 변환 (ex/ 성별, 부서, 출신학교~)

train_OHE = pd.get_dummies(train,columns=['Sex','Embarked']) # 'Sex','Embarked' columns -> one-hot Encoding
train_OHE

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,0,3,29.699118,1,2,23.4500,1,0,0,0,1
889,1,1,26.000000,0,0,30.0000,0,1,1,0,0


In [101]:
# Normalization --> min-max scaling

X = train_OHE.drop(columns='Survived') # input matrix
y = train_OHE.Survived                 # target vector

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# scaler.fit()
# scaler.transform()
# X.Age = scaler.fit_transform(X.Age)
# X.Fare = scaler.fit_transform(X.Fare)
temp = scaler.fit_transform(X.loc[:,['Age','Fare']])
X['Age'] = temp[:,0]
X['Fare'] = temp[:,1]
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0.271174,1,0,0.014151,0,1,0,0,1
1,1,0.472229,1,0,0.139136,1,0,1,0,0
2,3,0.321438,0,0,0.015469,1,0,0,0,1
3,1,0.434531,1,0,0.103644,1,0,0,0,1
4,3,0.434531,0,0,0.015713,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,0.334004,0,0,0.025374,0,1,0,0,1
887,1,0.233476,0,0,0.058556,1,0,0,0,1
888,3,0.367921,1,2,0.045771,1,0,0,0,1
889,1,0.321438,0,0,0.058556,0,1,1,0,0


# Training

In [140]:
# sklearn model load
# 1. Linear Classifier
from sklearn.linear_model import SGDClassifier
# 2. Logistic Regression
from sklearn.linear_model import LogisticRegression
# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier
# 4. Random Forest
from sklearn.ensemble import RandomForestClassifier

# 평가 지표
from sklearn.metrics import accuracy_score

In [148]:
clf = SGDClassifier() # classifier
clf2 = LogisticRegression()
clf3 = DecisionTreeClassifier()
clf4 = RandomForestClassifier()

clf.fit(X,y)
clf2.fit(X,y)
clf3.fit(X,y)
clf4.fit(X,y)

pred = clf.predict(X)
pred2 = clf2.predict(X)
pred3 = clf3.predict(X)
pred4 = clf4.predict(X)

In [149]:
print(" 1. Linear Classifier , Accuracy for Training : %.4f" % accuracy_score(y, pred))
print(" 2. Logistic Regression , Accuracy for Training : %.4f" % accuracy_score(y, pred2))
print(" 3. Decision Tree , Accuracy for Training : %.4f" % accuracy_score(y, pred3))
print(" 4. Random Forest , Accuracy for Training : %.4f" % accuracy_score(y, pred4))

 1. Linear Classifier , Accuracy for Training : 0.7800
 2. Logistic Regression , Accuracy for Training : 0.8013
 3. Decision Tree , Accuracy for Training : 0.9820
 4. Random Forest , Accuracy for Training : 0.9820


# test

In [105]:
# test data에 같은 feature engineering을 적용
test = test.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [106]:
test.loc[test['Age'] != test['Age'],'Age'] = test.Age.mean()

In [107]:
test.loc[test['Fare'] != test['Fare'],'Fare'] = test.Fare.mean()

In [108]:
# Categorical feature encoding
test_OHE = pd.get_dummies(test,columns=['Sex','Embarked'])
# Normalization
temp = scaler.transform(test_OHE.loc[:,['Age','Fare']])
test_OHE.Age = temp[:,0]
test_OHE.Fare = temp[:,1]

test_OHE

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0.428248,0,0,0.015282,0,1,0,1,0
1,3,0.585323,1,0,0.013663,1,0,0,0,1
2,2,0.773813,0,0,0.018909,0,1,0,1,0
3,3,0.334004,0,0,0.016908,0,1,0,0,1
4,3,0.271174,1,1,0.023984,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,3,0.375127,0,0,0.015713,0,1,0,0,1
414,1,0.484795,0,0,0.212559,1,0,1,0,0
415,3,0.478512,0,0,0.014151,0,1,0,0,1
416,3,0.375127,0,0,0.015713,0,1,0,0,1


In [109]:
# prediction
result = clf.predict(test_OHE)
result2 = clf2.predict(test_OHE)
result3 = clf3.predict(test_OHE)
result4= clf4.predict(test_OHE)

In [110]:
# 결과 파일인 submission.csv 생성
submission['Survived'] = result4

In [111]:
submission.to_csv(base_path + "submission.csv", index=False)