In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("C:/Users/hp/OneDrive/Documents/Machine Learning Practice/Datasets/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [3]:
df['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [4]:
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis='columns', inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [5]:
target = df['Survived']
inputs = df.drop("Survived", axis='columns')

In [6]:
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [7]:
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


### One Hot Encoding as Feature Encoding

In [9]:
dummies = pd.get_dummies(inputs['Sex'])
dummies.head()

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [10]:
inputs = pd.concat([inputs, dummies], axis='columns')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,False,True
1,1,female,38.0,71.2833,True,False
2,3,female,26.0,7.925,True,False
3,1,female,35.0,53.1,True,False
4,3,male,35.0,8.05,False,True


In [11]:
inputs.drop("Sex", axis='columns', inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.925,True,False
3,1,35.0,53.1,True,False
4,3,35.0,8.05,False,True


In [17]:
inputs.isna().any()

Pclass    False
Age        True
Fare      False
female    False
male      False
dtype: bool

In [18]:
inputs.isna().sum()

Pclass      0
Age       177
Fare        0
female      0
male        0
dtype: int64

In [19]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [20]:
inputs['Age'].head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

#### If dataset has outliers then we replace our 'NULL' value by their 'median()' otherwise replace with 'mean()'.

In [21]:
inputs['Age'] = inputs['Age'].fillna(inputs['Age'].mean())
inputs.head(10)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.925,True,False
3,1,35.0,53.1,True,False
4,3,35.0,8.05,False,True
5,3,29.699118,8.4583,False,True
6,1,54.0,51.8625,False,True
7,3,2.0,21.075,False,True
8,3,27.0,11.1333,True,False
9,2,14.0,30.0708,True,False


In [23]:
inputs['Age'].mean()

29.69911764705882

In [24]:
inputs.shape

(891, 5)

#### Split the dataset

In [25]:
from sklearn.model_selection import train_test_split

In [73]:
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)

In [74]:
len(X_train)

712

In [75]:
len(X_test)

179

## Naive Bayes Model

In [76]:
from sklearn.naive_bayes import GaussianNB

In [77]:
model = GaussianNB()

In [78]:
model.fit(X_train, y_train)

#### Accuracy

In [79]:
model.score(X_test, y_test)

0.8044692737430168

In [80]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
197,3,42.0,8.4042,False,True
89,3,24.0,8.05,False,True
468,3,29.699118,7.725,False,True
855,3,18.0,9.35,True,False
670,2,40.0,39.0,True,False
844,3,17.0,8.6625,False,True
684,2,60.0,39.0,False,True
152,3,55.5,8.05,False,True
437,2,24.0,18.75,True,False
150,2,51.0,12.525,False,True


In [81]:
y_test[:10]

197    0
89     0
468    0
855    1
670    1
844    0
684    0
152    0
437    1
150    0
Name: Survived, dtype: int64

In [82]:
model.predict(X_test[:10])

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0], dtype=int64)

In [83]:
model.predict_proba(X_test[:10])

array([[0.98786394, 0.01213606],
       [0.98642553, 0.01357447],
       [0.98733995, 0.01266005],
       [0.04290529, 0.95709471],
       [0.02032049, 0.97967951],
       [0.98453478, 0.01546522],
       [0.96003462, 0.03996538],
       [0.98609427, 0.01390573],
       [0.02230119, 0.97769881],
       [0.97216929, 0.02783071]])