In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [25]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [26]:
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
       , axis = 'columns', inplace=True)
# 데이터 프레임에서 필요없는 부분을 제거해준다. 
# inplace = True (저장함)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [27]:
target = df.Survived
inputs = df.drop('Survived', axis = 'columns')
# target = Survived, inputs = 나머지로 구성해준다.

In [28]:
dummies = pd.get_dummies(inputs.Sex)
# Sex columns의 dummies를 만들어준다. male, female을 숫자로 나타냄.
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [29]:
inputs = pd.concat([inputs, dummies], axis = 'columns')
# concat: 두 개의 배열을 하나의 배열로 만들어주는 역활도 하는 함수
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [30]:
inputs.drop('Sex', axis='columns', inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [31]:
inputs.head(10)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1
5,3,,8.4583,0,1
6,1,54.0,51.8625,0,1
7,3,2.0,21.075,0,1
8,3,27.0,11.1333,1,0
9,2,14.0,30.0708,1,0


In [32]:
inputs.columns[inputs.isna().any()]
# isna().any(): 데이터프래임에 NaN이 있는지 알려준다.
print(inputs.isna().any())

Pclass    False
Age        True
Fare      False
female    False
male      False
dtype: bool


In [34]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
# fillna: 데이터프래임에 NaN을 특정값으로 채워준다.
inputs.head(10)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1
5,3,29.699118,8.4583,0,1
6,1,54.0,51.8625,0,1
7,3,2.0,21.075,0,1
8,3,27.0,11.1333,1,0
9,2,14.0,30.0708,1,0


In [48]:
X = inputs; y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [49]:
len(X_train)

712

In [50]:
len(X_test)

179

In [51]:
len(inputs)

891

In [52]:
model = GaussianNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.770949720670391

In [53]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
52,1,49.0,76.7292,1,0
794,3,25.0,7.8958,0,1
259,2,50.0,26.0,1,0
451,3,29.699118,19.9667,0,1
95,3,29.699118,8.05,0,1
416,2,34.0,32.5,1,0
561,3,40.0,7.8958,0,1
795,2,39.0,13.0,0,1
70,2,32.0,10.5,0,1
814,3,30.5,8.05,0,1


In [54]:
y_test[:10]

52     1
794    0
259    1
451    0
95     0
416    1
561    0
795    0
70     0
814    0
Name: Survived, dtype: int64

In [55]:
model.predict(X_test[:10])
# model 예측

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [56]:
model.predict_proba(X_test[:10])
# model의 확률적 예측

array([[0.00204152, 0.99795848],
       [0.99041493, 0.00958507],
       [0.04801992, 0.95198008],
       [0.99142667, 0.00857333],
       [0.99114112, 0.00885888],
       [0.04043746, 0.95956254],
       [0.99206324, 0.00793676],
       [0.98378195, 0.01621805],
       [0.98234972, 0.01765028],
       [0.99124263, 0.00875737]])