In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

titanic_df = pd.read_csv('Titanic-Dataset.csv')

In [2]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_df = titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace=True)
titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0], inplace=True)
titanic_df['Fare'].fillna(titanic_df['Fare'].median(), inplace=True)

In [4]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
titanic_df = pd.get_dummies(titanic_df, columns=['Sex', 'Embarked'], drop_first=True)


In [6]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [7]:
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']

In [8]:
X.shape

(891, 8)

In [9]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1


In [10]:
y.shape

(891,)

In [11]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=92)

In [13]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [14]:
def hypothesis(X, theta):
    return sigmoid(np.dot(X, theta))

In [15]:
def cost(X, y, theta):
    m = len(y)
    h = hypothesis(X, theta)
    return -1 / m * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

In [16]:
def gradient(X, y, theta):
    m = len(y)
    h = hypothesis(X, theta)
    return 1 / m * np.dot(X.T, (h - y))

In [37]:
def gradient_descent(X, y, learning_rate=0.02, epochs=1000):
    m, n = X.shape
    theta = np.zeros(n)
    costs = []
    for _ in range(epochs):
        theta -= learning_rate * gradient(X, y, theta)
        costs.append(cost(X, y, theta))
    return theta, costs


In [38]:
X_train_bias = np.c_[np.ones((len(X_train), 1)), X_train]
X_test_bias = np.c_[np.ones((len(X_test), 1)), X_test]

In [39]:
theta, costs = gradient_descent(X_train_bias, y_train)


  return -1 / m * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))


In [40]:
def predict(X, theta):
    return np.round(hypothesis(X, theta)).astype(int)


In [41]:
y_pred = predict(X_test_bias, theta)

In [42]:
accuracy = np.mean(y_pred == y_test) * 100
print("Accuracy:", accuracy, "%")

Accuracy: 67.0391061452514 %


USING SVM


In [23]:
from sklearn import svm
svm = svm.SVC

In [24]:
M1 = svm(kernel = 'sigmoid', C =1, degree = 2)
M1.fit(X_train, y_train)
pred = M1.predict(X_test)
M1.score(X_test, y_test)

0.5251396648044693

In [25]:
M1 = svm(kernel = 'linear', C =1, degree = 6)
M1.fit(X_train, y_train)
pred = M1.predict(X_test)
M1.score(X_test, y_test)  #best accuracy

0.7597765363128491

In [26]:
M1 = svm(kernel = 'rbf', C =1, degree = 4)
M1.fit(X_train, y_train)
pred = M1.predict(X_test)
M1.score(X_test, y_test)

0.6424581005586593

In [27]:
M1 = svm(kernel = 'poly', C =1, degree = 5)
M1.fit(X_train, y_train)
pred = M1.predict(X_test)
M1.score(X_test, y_test)

0.6424581005586593

In [28]:
#Hyperparameter tuning for SVM
from sklearn import svm
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
degrees = [2, 3, 4, 5]  
Cs = [0.1, 1, 10, 100]   

best_accuracy = 0
best_model = None
best_kernel = None
best_degree = None
best_C = None

for kernel in kernels:
    for degree in degrees:
        for C in Cs:
            model = svm.SVC(kernel=kernel, C=C, degree=degree)
            model.fit(X_train, y_train)
            accuracy = model.score(X_test, y_test)
            print(f"Kernel: {kernel}, Degree: {degree}, C: {C}, Accuracy: {accuracy}")
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model
                best_kernel = kernel
                best_degree = degree
                best_C = C

print(f"Best Kernel: {best_kernel}, Best Degree: {best_degree}, Best C: {best_C}, Best Accuracy: {best_accuracy}")

Kernel: linear, Degree: 2, C: 0.1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 2, C: 1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 2, C: 10, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 2, C: 100, Accuracy: 0.7653631284916201
Kernel: linear, Degree: 3, C: 0.1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 3, C: 1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 3, C: 10, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 3, C: 100, Accuracy: 0.7653631284916201
Kernel: linear, Degree: 4, C: 0.1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 4, C: 1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 4, C: 10, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 4, C: 100, Accuracy: 0.7653631284916201
Kernel: linear, Degree: 5, C: 0.1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 5, C: 1, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 5, C: 10, Accuracy: 0.7597765363128491
Kernel: linear, Degree: 5, C: 100, Accuracy: 0.76536

USING DECISION TREE

In [29]:
model = DecisionTreeClassifier()
model

DecisionTreeClassifier()

In [30]:
clf = DecisionTreeClassifier()
clf

DecisionTreeClassifier()

In [31]:
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [32]:
y_pred = clf.predict(X_test)

In [33]:
y_test

539    1
476    0
711    0
290    1
315    1
      ..
425    0
390    1
770    0
888    0
334    1
Name: Survived, Length: 179, dtype: int64

In [34]:
y_pred

array([1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1], dtype=int64)

In [35]:
print(f"Accuracy of the model = {accuracy_score(y_test, y_pred)}")

Accuracy of the model = 0.7877094972067039


In [36]:
# Hyperparameter tuning for descion tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

from sklearn.model_selection import GridSearchCV


grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5, verbose = 5)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.790 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.832 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.831 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.803 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=2;, score=0.803 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.790 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.832 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_leaf=1, min_samples_split=5;, score=0.831 total time=   0.0s
[CV 4/5] END crite