In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib import font_manager, rc 

font_name = font_manager.FontProperties(fname="C:/Windows/Fonts/MALGUN.TTF").get_name()
rc('font', family=font_name)

plt.rcParams['figure.figsize'] = (10,6) # 그림 그릴 배경 사이즈 비율
%matplotlib inline
plt.style.use('ggplot')
mpl.rcParams['axes.unicode_minus'] = False

In [2]:
train = pd.read_csv('./data/titanic_train.csv')
test = pd.read_csv('./data/titanic_test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## EDA

In [6]:
# train set check
for col in train.columns:
    msg = 'column: {:<10}\t Percent of NaN value: {:.2f}%'.format(col, 100 * (train[col].isnull().sum() / train[col].shape[0]))
    # 첫 번째에 컬럼명, 두 번째에 '컬럼 결측치 갯수 / 전체 row 수'
    print(msg)

column: PassengerId	 Percent of NaN value: 0.00%
column: Survived  	 Percent of NaN value: 0.00%
column: Pclass    	 Percent of NaN value: 0.00%
column: Name      	 Percent of NaN value: 0.00%
column: Sex       	 Percent of NaN value: 0.00%
column: Age       	 Percent of NaN value: 19.87%
column: SibSp     	 Percent of NaN value: 0.00%
column: Parch     	 Percent of NaN value: 0.00%
column: Ticket    	 Percent of NaN value: 0.00%
column: Fare      	 Percent of NaN value: 0.00%
column: Cabin     	 Percent of NaN value: 77.10%
column: Embarked  	 Percent of NaN value: 0.22%


In [7]:
# test set check
for col in test.columns:
    msg = 'column: {:<10}\t Percent of NaN value: {:.2f}%'.format(col, 100 * (test[col].isnull().sum() / test[col].shape[0]))
    # 첫 번째에 컬럼명, 두 번째에 '컬럼 결측치 갯수 / 전체 row 수'
    print(msg)

column: PassengerId	 Percent of NaN value: 0.00%
column: Pclass    	 Percent of NaN value: 0.00%
column: Name      	 Percent of NaN value: 0.00%
column: Sex       	 Percent of NaN value: 0.00%
column: Age       	 Percent of NaN value: 20.57%
column: SibSp     	 Percent of NaN value: 0.00%
column: Parch     	 Percent of NaN value: 0.00%
column: Ticket    	 Percent of NaN value: 0.00%
column: Fare      	 Percent of NaN value: 0.24%
column: Cabin     	 Percent of NaN value: 78.23%
column: Embarked  	 Percent of NaN value: 0.00%


In [8]:
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1 # 자기자신 포함
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1 # 자기자신 포함

In [9]:
test.loc[test.Fare.isnull(), 'Fare'] = test['Fare'].mean()
train['Fare'] = train['Fare'].map(lambda i: np.log(i) if i > 0 else 0)
test['Fare'] = test['Fare'].map(lambda i: np.log(i) if i > 0 else 0)

In [12]:
test.loc[test.Fare.isnull(), 'Fare'] = test['Fare'].mean()

In [15]:
train['Embarked'].fillna('S', inplace = True)

In [14]:
train['Initial'] = train.Name.str.extract('([A-Za-z]+)\.') # .으로 끝나기 전까지의 앞 글자만 따기
test['Initial'] = test.Name.str.extract('([A-Za-z]+)\.')
train['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)

test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr', 'Mr'],inplace=True)

In [18]:
train.loc[(train.Age.isnull())&(train.Initial=='Mr'),'Age'] = 33
train.loc[(train.Age.isnull())&(train.Initial=='Mrs'),'Age'] = 36
train.loc[(train.Age.isnull())&(train.Initial=='Master'),'Age'] = 5
train.loc[(train.Age.isnull())&(train.Initial=='Miss'),'Age'] = 22
train.loc[(train.Age.isnull())&(train.Initial=='Other'),'Age'] = 46

test.loc[(test.Age.isnull())&(test.Initial=='Mr'),'Age'] = 33
test.loc[(test.Age.isnull())&(test.Initial=='Mrs'),'Age'] = 36
test.loc[(test.Age.isnull())&(test.Initial=='Master'),'Age'] = 5
test.loc[(test.Age.isnull())&(test.Initial=='Miss'),'Age'] = 22
test.loc[(test.Age.isnull())&(test.Initial=='Other'),'Age'] = 46

In [20]:
def category_age(x):
    if x < 10:
        return 0
    elif x < 20:
        return 1
    elif x < 30:
        return 2
    elif x < 40:
        return 3
    elif x < 50:
        return 4
    elif x < 60:
        return 5
    elif x < 70:
        return 6
    else:
        return 7    
    
train['Age_cat'] = train['Age'].apply(category_age)
test['Age_cat'] = test['Age'].apply(category_age)

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
X = train.drop(['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis = 1)
y = train['Survived']

In [39]:
# 수치형으로
col = ['Sex', 'FamilySize', 'Initial', 'Embarked']
from sklearn.preprocessing import LabelEncoder
en = LabelEncoder()
for i in col:
    X[i] = en.fit_transform(X[i])
y = en.fit_transform(y)
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Initial,Age_cat
0,3,1,22.0,1.981001,2,1,2,2
1,1,0,38.0,4.266662,0,1,3,3
2,3,0,26.0,2.070022,2,0,1,2
3,1,0,35.0,3.972177,2,1,3,3
4,3,1,35.0,2.085672,2,0,2,3


In [49]:
XX = test.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis = 1)
# 수치형으로
col = ['Sex', 'FamilySize', 'Initial', 'Embarked']
from sklearn.preprocessing import LabelEncoder
en = LabelEncoder()
for i in col:
    XX[i] = en.fit_transform(XX[i])
XX.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Initial,Age_cat
0,3,1,34.5,2.05786,1,0,2,3
1,3,0,47.0,1.94591,2,1,3,4
2,2,1,62.0,2.270836,1,0,2,6
3,3,1,27.0,2.159003,2,0,2,2
4,3,0,22.0,2.508582,2,2,3,2


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)
result = []
for i in range(1, 10):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print('K == {}, accuracy == {:.2f}'.format(i, (pred == y_test).sum()/(len(pred))))
    print('accuracy of training set: {:.2f}'.format(model.score(X_train, y_train)))
    print('accuracy of training set: {:.2f}'.format(model.score(X_test, y_test)))
    result.append(pred)

K == 1, accuracy == 0.71
accuracy of training set: 0.98
accuracy of training set: 0.71
K == 2, accuracy == 0.67
accuracy of training set: 0.86
accuracy of training set: 0.67
K == 3, accuracy == 0.70
accuracy of training set: 0.87
accuracy of training set: 0.70
K == 4, accuracy == 0.70
accuracy of training set: 0.82
accuracy of training set: 0.70
K == 5, accuracy == 0.72
accuracy of training set: 0.83
accuracy of training set: 0.72
K == 6, accuracy == 0.70
accuracy of training set: 0.81
accuracy of training set: 0.70
K == 7, accuracy == 0.72
accuracy of training set: 0.81
accuracy of training set: 0.72
K == 8, accuracy == 0.72
accuracy of training set: 0.79
accuracy of training set: 0.72
K == 9, accuracy == 0.71
accuracy of training set: 0.80
accuracy of training set: 0.71


In [52]:
# k는 5가 좋아 보이네요
# 5로 가겠습니다.
model = KNeighborsClassifier(n_neighbors = 5)
model.fit(X_train, y_train)
pred = model.predict(XX)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [53]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Initial,Age_cat
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,2.057860,,Q,1,Mr,3
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,1.945910,,S,2,Mrs,4
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,2.270836,,Q,1,Mr,6
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,2.159003,,S,1,Mr,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,2.508582,,S,3,Mrs,2
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,2.221917,,S,1,Mr,1
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,2.031983,,Q,1,Miss,3
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,3.367296,,S,3,Mr,2
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,1.978128,,C,1,Mrs,1
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,3.184284,,S,3,Mr,2


In [64]:
# 캐글에 제출
titanic_result = pd.read_csv('./data/titanic_result.csv')

In [65]:
titanic_result['Survived'] = pred

In [67]:
titanic_result.to_csv('my_titanic_result.csv', index = False)

![image.png](attachment:image.png)

![image.png](attachment:image.png)