# Naive Bayes (Predicting survival from titanic crash)

In [1]:
# import libraries
import pandas as pd 

In [2]:
df = pd.read_csv('tested.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)

In [6]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,34.5,7.8292
1,1,3,female,47.0,7.0
2,0,2,male,62.0,9.6875
3,0,3,male,27.0,8.6625
4,1,3,female,22.0,12.2875


In [7]:
inputs = df.drop('Survived',axis='columns')
target = df.Survived

In [8]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,0,1


In [9]:
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,34.5,7.8292,0,1
1,3,female,47.0,7.0,1,0
2,2,male,62.0,9.6875,0,1


In [10]:
inputs.drop(['Sex','male'],axis='columns',inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female
0,3,34.5,7.8292,0
1,3,47.0,7.0,1
2,2,62.0,9.6875,0


In [11]:
inputs.columns[inputs.isna().any()]

Index(['Age', 'Fare'], dtype='object')

In [12]:
inputs.Age[:10]

0    34.5
1    47.0
2    62.0
3    27.0
4    22.0
5    14.0
6    30.0
7    26.0
8    18.0
9    21.0
Name: Age, dtype: float64

In [13]:
inputs.Age = inputs.fillna(inputs.Age.mean())

In [19]:
inputs.Fare = inputs.fillna(inputs.Fare.mean())

In [20]:
inputs.head(6)

Unnamed: 0,Pclass,Age,Fare,female
0,3,3.0,3.0,0
1,3,3.0,3.0,1
2,2,2.0,2.0,0
3,3,3.0,3.0,0
4,3,3.0,3.0,1
5,3,3.0,3.0,0


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.2)

In [22]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [23]:
model.fit(X_train,y_train)

GaussianNB()

In [24]:
model.score(X_test,y_test)

1.0

In [25]:
X_test[0:10]

Unnamed: 0,Pclass,Age,Fare,female
409,3,3.0,3.0,1
5,3,3.0,3.0,0
33,3,3.0,3.0,1
288,3,3.0,3.0,0
267,3,3.0,3.0,0
391,1,1.0,1.0,1
106,3,3.0,3.0,0
133,3,3.0,3.0,0
154,3,3.0,3.0,0
377,2,2.0,2.0,0


In [26]:
y_test[0:10]

409    1
5      0
33     1
288    0
267    0
391    1
106    0
133    0
154    0
377    0
Name: Survived, dtype: int64

In [27]:
model.predict(X_test[:10])

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [28]:
model.predict_proba(X_test[:10])

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [29]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train,y_train,cv=5)

array([1., 1., 1., 1., 1.])