**Libraries used : Pandas & Sci-kit Learn**

In [50]:
import pandas as pd

In [51]:
df=pd.read_csv(r"C:\Users\Deepti Anand\Desktop\ML\Titanic Dataset\train.csv" , encoding='unicode_escape')

**exploring data** 

In [52]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


**Dropping redundant columns**

In [53]:
df.drop(['PassengerId',	'Name' ,'SibSp','Ticket','Cabin','Embarked'] , axis='columns' , inplace =True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare
0,0,3,male,22.0,0,7.25
1,1,1,female,38.0,0,71.2833
2,1,3,female,26.0,0,7.925
3,1,1,female,35.0,0,53.1
4,0,3,male,35.0,0,8.05


**Preparing Data**

In [54]:
target = df.Survived
inputs = df.drop('Survived' , axis='columns')

In [55]:
target

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [56]:
inputs

Unnamed: 0,Pclass,Sex,Age,Parch,Fare
0,3,male,22.0,0,7.2500
1,1,female,38.0,0,71.2833
2,3,female,26.0,0,7.9250
3,1,female,35.0,0,53.1000
4,3,male,35.0,0,8.0500
...,...,...,...,...,...
886,2,male,27.0,0,13.0000
887,1,female,19.0,0,30.0000
888,3,female,,2,23.4500
889,1,male,26.0,0,30.0000


**Creating Dummy variables . Since ,in ML , machine do not recognize text .**

In [57]:
dummies = pd.get_dummies(inputs.Sex)
dummies.head(3)

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False


**Concatenating inputs dataframe and dummies**

In [58]:
inputs=pd.concat([inputs,dummies] , axis = 'columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,female,male
0,3,male,22.0,0,7.25,False,True
1,1,female,38.0,0,71.2833,True,False
2,3,female,26.0,0,7.925,True,False


In [59]:
inputs.drop(['Sex'] , axis ='columns' , inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Parch,Fare,female,male
0,3,22.0,0,7.25,False,True
1,1,38.0,0,71.2833,True,False
2,3,26.0,0,7.925,True,False


**Checking for null values.**

In [60]:
inputs.columns[inputs.isna().any()]
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

**Filling null values with mean value.**

In [61]:
inputs.Age=inputs.Age.fillna(inputs.Age.mean())
inputs.head(6)

Unnamed: 0,Pclass,Age,Parch,Fare,female,male
0,3,22.0,0,7.25,False,True
1,1,38.0,0,71.2833,True,False
2,3,26.0,0,7.925,True,False
3,1,35.0,0,53.1,True,False
4,3,35.0,0,8.05,False,True
5,3,29.699118,0,8.4583,False,True


In [62]:
inputs.columns[inputs.isna().any()]
inputs.Age[:10]

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: Age, dtype: float64

**Training and testing the model.**

In [63]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(inputs,target,test_size=0.2)

In [64]:
len(x_train)

712

In [65]:
len(x_test)

179

**creating Naive_Bayes model and checking for it's score.**

In [66]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [67]:
model.fit(x_train,y_train)

In [68]:
model.score(x_test,y_test)

0.770949720670391

In [69]:
model.score(x_train,y_train)

0.7851123595505618

In [70]:
x_test[:10]

Unnamed: 0,Pclass,Age,Parch,Fare,female,male
449,1,52.0,0,30.5,False,True
90,3,29.0,0,8.05,False,True
398,2,23.0,0,10.5,False,True
702,3,18.0,1,14.4542,True,False
665,2,32.0,0,73.5,False,True
291,1,19.0,0,91.0792,True,False
839,1,29.699118,0,29.7,False,True
227,3,20.5,0,7.25,False,True
45,3,29.699118,0,8.05,False,True
489,3,9.0,1,15.9,False,True


In [71]:
y_test[:10]

449    1
90     0
398    0
702    0
665    0
291    1
839    1
227    0
45     0
489    1
Name: Survived, dtype: int64

In [72]:
model.predict(x_test[:10])

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 0], dtype=int64)

In [73]:
model.predict_proba(x_test[:10])

array([[9.36607983e-01, 6.33920172e-02],
       [9.91544051e-01, 8.45594854e-03],
       [9.80352179e-01, 1.96478209e-02],
       [5.21107682e-02, 9.47889232e-01],
       [9.30765947e-01, 6.92340534e-02],
       [7.70703535e-04, 9.99229296e-01],
       [9.35156179e-01, 6.48438206e-02],
       [9.90333570e-01, 9.66642956e-03],
       [9.91609828e-01, 8.39017202e-03],
       [9.84667641e-01, 1.53323590e-02]])

**Importing different classifiers from sci-kit library.**

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

**Creating Logistic_Regression model.**

In [77]:
lr = LogisticRegression()
lr.fit(x_train , y_train)
lr.score(x_test, y_test)

0.7932960893854749

**Creating Support_Vector_Model(SVM).**

In [78]:
svm = SVC()
svm.fit(x_train , y_train)
svm.score(x_test, y_test)

0.6983240223463687

**Creating Random_Forest_Model.**

In [79]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(x_train , y_train)
rf.score(x_test, y_test)

0.8491620111731844

**Creating K_Neighbor model.**

In [80]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train , y_train)
knn.score(x_test, y_test)

0.7150837988826816

In [89]:
data= {'Model': ['Naive_Bayes','Logistic_Regression', 'SVC','Random_Forest','K_Neighbor'],
       'Accuracy_Score':[model.score(x_test,y_test) ,lr.score(x_test, y_test) ,svm.score(x_test, y_test),rf.score(x_test, y_test),knn.score(x_test, y_test)]}

**Different Models and their Accuracy score.**

In [91]:
df = pd.DataFrame(data)
print(df)

                 Model  Accuracy_Score
0          Naive_Bayes        0.770950
1  Logistic_Regression        0.793296
2                  SVC        0.698324
3        Random_Forest        0.849162
4           K_Neighbor        0.715084
