## Build Various Classification Models and Suggest the best Model based on Accuracy of Results:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [66]:
titanic = pd.read_csv('titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [67]:
titanic.shape

(891, 12)

In [68]:
titanic.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208,0.383838
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429,0.486592
min,1.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104,0.0
50%,446.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,668.5,3.0,38.0,1.0,0.0,31.0,1.0
max,891.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [69]:
# Finding values and there frequency in Survived column
titanic['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [70]:
# Finding the number relation between the sex and the survival of the passengers
titanic.groupby('Sex')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [71]:
# Finding the relation of Sex and the Pclass of the passengers
titanic.pivot_table('Survived', index='Sex', columns='Pclass')

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [72]:
age = pd.cut(titanic['Age'], [0, 18, 80])
titanic.pivot_table('Survived', ['Sex', age], 'Pclass')

Unnamed: 0_level_0,Pclass,1,2,3
Sex,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [73]:
# Finding the total Nan values in each column
titanic.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Survived         0
dtype: int64

In [74]:
# Drop the columns
titanic = titanic.drop(labels='Cabin',axis=1)
titanic = titanic.drop(labels='Name',axis=1)
titanic = titanic.drop(labels='Ticket',axis=1)

#Remove the rows with missing values
titanic = titanic.dropna(subset =['Embarked', 'Age'])

In [75]:
titanic

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,male,22.0,1,0,7.2500,S,0
1,2,1,female,38.0,1,0,71.2833,C,1
2,3,3,female,26.0,0,0,7.9250,S,1
3,4,1,female,35.0,1,0,53.1000,S,1
4,5,3,male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
885,886,3,female,39.0,0,5,29.1250,Q,0
886,887,2,male,27.0,0,0,13.0000,S,0
887,888,1,female,19.0,0,0,30.0000,S,1
889,890,1,male,26.0,0,0,30.0000,C,1


In [76]:
titanic.shape

(712, 9)

In [77]:
titanic.dtypes

PassengerId      int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
Survived         int64
dtype: object

In [78]:
#Print the unique values in the columns
print(titanic['Sex'].unique())
print(titanic['Embarked'].unique())

['male' 'female']
['S' 'C' 'Q']


In [79]:
#Encoding categorical data values (Transforming object data types to integers)
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

#Encode sex column
titanic.iloc[:,2]= labelencoder.fit_transform(titanic.iloc[:,2].values)
#print(labelencoder.fit_transform(titanic.iloc[:,2].values))

#Encode embarked
titanic.iloc[:,7]= labelencoder.fit_transform(titanic.iloc[:,7].values)
#print(labelencoder.fit_transform(titanic.iloc[:,7].values))

#Print the NEW unique values in the columns
print(titanic['Sex'].unique())
print(titanic['Embarked'].unique())

[1 0]
[2 0 1]


In [80]:
titanic

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,1,22.0,1,0,7.2500,2,0
1,2,1,0,38.0,1,0,71.2833,0,1
2,3,3,0,26.0,0,0,7.9250,2,1
3,4,1,0,35.0,1,0,53.1000,2,1
4,5,3,1,35.0,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...,...
885,886,3,0,39.0,0,5,29.1250,1,0
886,887,2,1,27.0,0,0,13.0000,2,0
887,888,1,0,19.0,0,0,30.0000,2,1
889,890,1,1,26.0,0,0,30.0000,0,1


In [81]:
#Split the data into independent 'X' and dependent 'Y' variables
X = titanic.iloc[:, 0:8].values 
Y = titanic.iloc[:, 8].values 

In [82]:
# Split the dataset into 80% Training set and 20% Testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [83]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [107]:
#Create a function within many Machine Learning Models
def Classification_models(X_train,Y_train):  
      #Using Logistic Regression Algorithm to the Training Set
      from sklearn.linear_model import LogisticRegression
      log = LogisticRegression(random_state = 0)
      log.fit(X_train, Y_train)
  
      #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm
      from sklearn.neighbors import KNeighborsClassifier
      knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
      knn.fit(X_train, Y_train)

      #Using GaussianNB method of naïve_bayes class to use Naïve Bayes Algorithm
      from sklearn.naive_bayes import GaussianNB
      gauss = GaussianNB()
      gauss.fit(X_train, Y_train)

      #Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm
      from sklearn.tree import DecisionTreeClassifier
      tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
      tree.fit(X_train, Y_train)

      #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
      from sklearn.ensemble import RandomForestClassifier
      forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
      forest.fit(X_train, Y_train)
        
  
      #print model accuracy on the training data.
      print('1.Logistic Regression Training Accuracy:', log.score(X_train, Y_train))
      print('2.K Nearest Neighbor Training Accuracy:', knn.score(X_train, Y_train))
      print('3.Gaussian Naive Bayes Training Accuracy:', gauss.score(X_train, Y_train))
      print('4.Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
      print('5.Random Forest Classifier Training Accuracy:', forest.score(X_train, Y_train))
      
      return log, knn, gauss, tree, forest
      

In [102]:
model = Classification_models(X_train,Y_train)

1.Logistic Regression Training Accuracy: 0.7926186291739895
2.K Nearest Neighbor Training Accuracy: 0.8576449912126538
3.Gaussian Naive Bayes Training Accuracy: 0.804920913884007
4.Decision Tree Classifier Training Accuracy: 1.0
5.Random Forest Classifier Training Accuracy: 0.9929701230228472


The best model is the Decision Tree Classifier because its accuracy is 100 % here