In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [50]:
# Fetch the dataset archive with the Kaggle CLI (assumes the CLI is installed and authenticated).
# The CLI skips the download when a newer local copy exists unless --force is passed.
! kaggle datasets download -d brendan45774/test-file

Dataset URL: https://www.kaggle.com/datasets/brendan45774/test-file
License(s): CC0-1.0
test-file.zip: Skipping, found more recently modified local copy (use --force to force download)


In [51]:
# Read the CSV straight from the downloaded archive; pandas infers zip
# compression from the file extension. The path assumes the archive sits in /content.
df = pd.read_csv('/content/test-file.zip')
# Preview the first three rows.
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [52]:
# (rows, columns) of the raw frame.
df.shape

(418, 12)

In [53]:
# Count missing values per column in the raw frame.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [54]:
# Keep only the label and the modelling features. Selecting the wanted columns
# (instead of dropping the others in place) yields the same five columns but
# lets this cell be re-run without raising a KeyError on already-removed names.
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']].copy()
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,34.5,7.8292
1,1,3,female,47.0,7.0
2,0,2,male,62.0,9.6875
3,0,3,male,27.0,8.6625
4,1,3,female,22.0,12.2875


In [55]:
# Missing values that remain in the retained columns.
df.isna().sum()

Survived     0
Pclass       0
Sex          0
Age         86
Fare         1
dtype: int64

In [56]:
# Impute each numeric column with its own mean. A frame-wide
# fillna(df['Age'].mean()) would also write the Age mean into the missing Fare value.
df = df.fillna({'Age': df['Age'].mean(), 'Fare': df['Fare'].mean()})

In [57]:
# Verify that imputation left no missing values behind.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

In [58]:
# Print dtypes and non-null counts for the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   Fare      418 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 16.5+ KB


In [59]:
# Separate the label column from the predictor columns.
target = df['Survived']
inputs = df.drop(columns='Survived')

In [60]:
# One-hot encode Sex, then cast the indicator columns to integers.
dummies = pd.get_dummies(inputs['Sex'])
x = dummies.astype(int)
x.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,0,1
3,0,1
4,1,0


In [61]:
# Append the indicator columns alongside the existing features.
inputs = pd.concat([inputs, x], axis='columns')
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,34.5,7.8292,0,1
1,3,female,47.0,7.0,1,0
2,2,male,62.0,9.6875,0,1
3,3,male,27.0,8.6625,0,1
4,3,female,22.0,12.2875,1,0


In [62]:
# Remove the raw string column and one of the two indicators;
# `female` alone is enough to encode the two Sex values.
inputs.drop(columns=['Sex', 'male'], inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,34.5,7.8292,0
1,3,47.0,7.0,1
2,2,62.0,9.6875,0
3,3,27.0,8.6625,0
4,3,22.0,12.2875,1


In [63]:
# Names of feature columns that still hold a missing value.
inputs.columns[inputs.isnull().any(axis=0)]

Index([], dtype='object')

In [65]:
# First ten Age values.
inputs['Age'].head(10)

0    34.5
1    47.0
2    62.0
3    27.0
4    22.0
5    14.0
6    30.0
7    26.0
8    18.0
9    21.0
Name: Age, dtype: float64

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so the split,
# and therefore the reported score, is identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)

In [67]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier constructed with its default settings.
model = GaussianNB()

In [68]:
# Fit the classifier on the training split only.
model.fit(x_train, y_train)

In [70]:
# Accuracy on the held-out split, expressed as a percentage.
# NOTE(review): a perfect score is suspicious. In the rows displayed earlier in
# this notebook, Survived equals the `female` flag, so the label may be fully
# determined by Sex; confirm with pd.crosstab(df['Sex'], df['Survived']).
100 * model.score(x_test, y_test)

100.0

In [71]:
# Predicted labels for the first ten held-out rows.
model.predict(x_test.head(10))

array([1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

In [72]:
# True labels for the same ten held-out rows, for side-by-side comparison.
y_test.head(10)

37     1
277    0
383    1
142    0
152    0
160    1
175    1
219    0
176    1
355    0
Name: Survived, dtype: int64

In [74]:
# Same ten rows as the earlier predict cell, selected by explicit position.
model.predict(x_test.iloc[0:10])

array([1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

In [75]:
# Class probabilities for the first ten held-out rows (one column per class).
model.predict_proba(x_test.head(10))

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])