In [44]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [45]:
# Read the dataset into python environment

tit = pd.read_csv('/content/titanic_dataset .csv')

In [46]:
# Understand no.of rows and no.of columns

tit.shape

(891, 12)

In [47]:
# Datatypes

tit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [48]:
# Shows head part
tit.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [49]:
# statistics

tit.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [50]:
# Checking Null values

tit.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Cleaning Data

In [51]:
# Cabin column contains 687 null values so it will be remove
tit = tit.drop('Cabin', axis=1,)

In [52]:
# Remove unwanted columns
tit = tit.drop('Name', axis=1,)
tit = tit.drop('Ticket', axis=1,)


In [53]:
#there few Nullvalues in Age. so replacing Age with mean of the column
tit["Age"] = tit["Age"].fillna(tit["Age"].mean())

In [54]:
# Catagorical values of Embarked
tit["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [55]:
#Only 2 Null values in Embarked so these 2 columns filled with 's'
tit["Embarked"] = tit["Embarked"].fillna("S")

In [56]:
#Check the Null values
tit.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

 **Replacing the categorical values into numerical value**

In [57]:
tit.Embarked.replace(['S', 'C', 'Q'], [1, 2, 3], inplace=True)

In [58]:
tit.Sex.replace(['male', 'female'], [1,0], inplace=True)

In [59]:
tit.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,1
1,2,1,1,0,38.0,1,0,71.2833,2
2,3,1,3,0,26.0,0,0,7.925,1


# Dividing data into features and labels

In [60]:
feature_columns = ['PassengerId','Pclass','Sex', 'Age', 'Parch','Fare','Embarked']
X = tit[feature_columns].values
y = tit['Survived'].values

# Spliting dataset into training set and test set

In [61]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=44)

In [62]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

((623, 7), (268, 7))

# Fit the model

In [63]:
# Fitting clasifier to the Training set
from sklearn.neighbors import KNeighborsClassifier

# Instantiate learning model (k = 3)
classifier = KNeighborsClassifier(n_neighbors=3)

# Fitting the model
classifier.fit(X_train, y_train)

# Testing the KNN algorithm on the Testing Data
```



In [64]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Evaluate the model

In [65]:
from sklearn.metrics import accuracy_score

# checking accuracy
accuracy = accuracy_score(y_test, y_pred)*100
print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

Accuracy of our model is equal 60.45 %.


# SVM(support vector machine) Algorithm


In [66]:
#Create the SVM model
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
#Fit the model for the data

classifier.fit(X_train, y_train)

#Make the prediction
y_pred = classifier.predict(X_test)

In [67]:
#display accuracy
print(accuracy_score(y_test,y_pred)*100)

76.49253731343283


# Linear Regression

In [68]:
from sklearn.linear_model import LogisticRegression
# create a linear regression model
LR_model =LogisticRegression()
# Fit the model
LR_model =LR_model.fit(X_train,y_train)
score_LR=LR_model.score(X_test,y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
# Accuracy of LR
score_LR

0.7649253731343284

# K-Fold cross validation

In [70]:
# Lets split the data into 10 folds.
# We will use this 'kfold_validator'(KFold splitting stratergy) object as input to cross_val_score() method

from sklearn.model_selection import KFold
kfold_validator =KFold(10)

# split()  method generate indices to split data into training and test set.
for train_index,test_index in kfold_validator.split(X,y):
  print('Training Index: ',train_index)
  print('Testing Index: ',test_index)


Training Index:  [ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 3

Using Linear regression

In [None]:
from sklearn.model_selection import cross_val_score
cv_re = cross_val_score(LR_model,X,y,cv=kfold_validator)

In [72]:
#  score values(accuracy) of each 10 fold cross validation of test data
cv_re

array([0.75555556, 0.80898876, 0.75280899, 0.83146067, 0.73033708,
       0.78651685, 0.75280899, 0.74157303, 0.84269663, 0.76404494])

In [73]:
# avaerage
np.mean(cv_re)

0.7766791510611735

# Using kNN

In [74]:
# 10-fold cross-validation with K=20 for KNN (the n_neighbors parameter)
# k = 20 for KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=20)
# We are passing the entirety of X and y, not X_train or y_train, it takes care of splitting the data
# cv=10 for 10 folds and scores showing accuracy
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)

[0.61111111 0.61797753 0.51685393 0.51685393 0.5505618  0.68539326
 0.70786517 0.61797753 0.68539326 0.61797753]


In [75]:
# avaerage
np.mean(scores)

0.6127965043695381

Using SVM

In [76]:
cv_result1 = cross_val_score(classifier,X,y,cv=kfold_validator)

In [77]:
cv_result1

array([0.8       , 0.82022472, 0.75280899, 0.84269663, 0.76404494,
       0.79775281, 0.75280899, 0.73033708, 0.84269663, 0.76404494])

In [78]:
np.mean(cv_result1)

0.786741573033708

# Stratified cross validation

In [87]:
# Lets split the data into 10 folds.
from sklearn.model_selection import StratifiedKFold
skfold_validator =StratifiedKFold(n_splits = 10)# no.of fold is 10


# split()  method generate indices to split data into training and test set.
for train_index,test_index in skfold_validator.split(X,y):
  print('Training Index: ',train_index)
  print('Testing Index: ',test_index)

Training Index:  [ 82  84  85  88  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 3

Stratified cross validation Using Linear Regression

In [None]:
# Accuracy checking
cv_res = cross_val_score(LR_model,X,y,cv=skfold_validator)

In [81]:
cv_res

array([0.76666667, 0.82022472, 0.76404494, 0.84269663, 0.76404494,
       0.75280899, 0.7752809 , 0.75280899, 0.82022472, 0.76404494])

In [82]:
# mean
np.mean(cv_res)

0.7822846441947566

Stratified cross validation Using kNN

In [83]:
#Accuracy checking
cv_result = cross_val_score(knn,X,y,cv=skfold_validator)
cv_result

array([0.61111111, 0.61797753, 0.51685393, 0.51685393, 0.5505618 ,
       0.68539326, 0.70786517, 0.61797753, 0.68539326, 0.61797753])

In [84]:
# mean
np.mean(cv_result)

0.6127965043695381

Stratified cross validation Using SVM

In [85]:
#Accuracy
cv_res_svm = cross_val_score(classifier,X,y,cv=skfold_validator)
cv_res_svm


array([0.8       , 0.79775281, 0.76404494, 0.84269663, 0.79775281,
       0.7752809 , 0.7752809 , 0.74157303, 0.80898876, 0.76404494])

In [90]:
#mean
np.mean(cv_res_svm)

0.7867415730337078

CONCLUSION

What i understand is that accuracy does not change ,it comes with the same values in K-fold and Stratified Cross validation.After the k-fold and Stratified cross validation accuracy has increased.SVM and LR gives better accuracy than kNN.