In [2]:
import pandas as pd

df = pd.read_csv("titanic.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [4]:
# remove unnecessary coulumns
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)

df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [5]:
# target that we wanna predict
target = df.Survived

target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [8]:
# features that we use to predict
features = df.drop('Survived',axis='columns')

features.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [9]:
# Sex column has text data, so we convert it with integers
dummies = pd.get_dummies(features.Sex)

dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [10]:
features = pd.concat([features, dummies], axis='columns')

features.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [11]:
features.drop('Sex', axis='columns', inplace=True)

features.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [12]:
features.columns[features.isna().any()]

Index(['Age'], dtype='object')

In [13]:
features.Age.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

We have NaN (Not a Number) in Age column

In [14]:
features.describe()

Unnamed: 0,Pclass,Age,Fare,female,male
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,32.204208,0.352413,0.647587
std,0.836071,14.526497,49.693429,0.47799,0.47799
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,7.9104,0.0,0.0
50%,3.0,28.0,14.4542,0.0,1.0
75%,3.0,38.0,31.0,1.0,1.0
max,3.0,80.0,512.3292,1.0,1.0


Mean for Age column is **29.699118**

In [16]:
features = features.fillna(features.Age.mean())

features.head(6)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1
5,3,29.699118,8.4583,0,1


In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2) # 20% for testing

In [18]:
len(X_train)

712

In [19]:
len(X_test)

179

712 + 179 = 891

In [22]:
len(features)

891

In [51]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

In [52]:
model.fit(X_train, y_train)

In [53]:
model.score(X_test, y_test)

0.7988826815642458

In [55]:
y_test[:10]

714    0
328    1
826    0
142    1
368    1
299    1
202    0
666    0
59     0
224    1
Name: Survived, dtype: int64

In [59]:
model.predict(X_test[:10])

array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0], dtype=int64)

In [60]:
# probability
model.predict_proba(X_test[:10])

array([[9.78320919e-01, 2.16790810e-02],
       [5.80530736e-02, 9.41946926e-01],
       [9.79574248e-01, 2.04257515e-02],
       [5.21131143e-02, 9.47886886e-01],
       [5.69818841e-02, 9.43018116e-01],
       [1.05536393e-11, 1.00000000e+00],
       [9.89827670e-01, 1.01723299e-02],
       [9.75157077e-01, 2.48429225e-02],
       [9.74372547e-01, 2.56274529e-02],
       [5.94414582e-01, 4.05585418e-01]])

In [61]:
model.predict_log_proba(X_test[:10])

array([[-2.19175247e-02, -3.83140749e+00],
       [-2.84639763e+00, -5.98063474e-02],
       [-2.06372421e-02, -3.89095884e+00],
       [-2.95433865e+00, -5.35201027e-02],
       [-2.86502189e+00, -5.86697856e-02],
       [-2.52745504e+01, -1.05551123e-11],
       [-1.02244216e-02, -4.58808400e+00],
       [-2.51567158e-02, -3.69518238e+00],
       [-2.59615566e-02, -3.66409112e+00],
       [-5.20178254e-01, -9.02423778e-01]])

In [62]:
model.predict_joint_log_proba(X_test[:10])

array([[-10.59891577, -14.40840574],
       [-14.79209028, -12.005499  ],
       [ -9.61767972, -13.48800132],
       [-14.96300254, -12.06218399],
       [-14.90304433, -12.09669223],
       [-43.17915299, -17.90460264],
       [ -9.20041501, -13.77827459],
       [ -9.23943799, -12.90946365],
       [-10.61415837, -14.25228793],
       [-13.39801851, -13.78026404]])