# Initial testing of machine learning algorithms

Splitting the dataframe into a training and testing set to train several machine learning and check which one has the highest accuracy

In [5]:
#Imports

import os
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [6]:
complete_df = pd.read_csv(os.path.join('..','..','data',"Variant 2",'titanic_cleaned.csv'))

train_df, test_df = train_test_split(complete_df, test_size=0.25, random_state=25)
print(f"No. of training examples: {train_df.shape[0]}")
print(f"No. of testing examples: {test_df.shape[0]}")

No. of training examples: 981
No. of testing examples: 328


In [7]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("Survived", axis=1)
Y_test = test_df["Survived"]

Testing accuracy of the SGD Classifier which generates the worst accuracy from all of the algorithms. Since a new value is generated every time, 1000 cases were taken and a mean and modal value were displayed to show the average and most common accuracies generated.

In [8]:
model_a =[]
i = 0
while i < 1000:
    #possible solution is to run it multiple times and find the average
    model = linear_model.SGDClassifier(max_iter=5, tol=None)
    model.fit(X_train, Y_train)

    model.score(X_train, Y_train)
    model_a.append(round(model.score(X_test, Y_test) * 100, 2))
    i += 1
    
print(model_a)

[60.37, 60.67, 41.77, 54.27, 75.61, 72.56, 39.63, 71.95, 37.5, 65.24, 60.67, 60.37, 75.3, 72.87, 60.98, 71.34, 57.01, 67.38, 60.37, 71.95, 40.24, 68.6, 67.99, 68.29, 60.37, 71.65, 70.12, 72.56, 72.26, 73.48, 74.39, 41.46, 45.12, 60.37, 68.29, 73.48, 60.37, 76.22, 39.63, 58.84, 71.04, 73.17, 75.0, 71.95, 75.0, 66.46, 74.7, 41.16, 40.24, 62.2, 72.87, 61.28, 60.37, 75.91, 70.12, 39.63, 70.73, 59.15, 71.95, 73.17, 72.26, 71.95, 40.85, 68.6, 71.34, 70.73, 75.61, 72.26, 41.16, 71.34, 68.29, 72.56, 59.45, 60.37, 58.54, 75.0, 60.37, 75.3, 42.68, 73.48, 71.04, 71.34, 73.48, 72.87, 59.45, 72.87, 71.04, 73.78, 60.37, 72.56, 71.04, 73.17, 57.32, 60.37, 63.72, 68.9, 69.51, 60.06, 43.6, 60.37, 69.82, 42.68, 71.04, 68.9, 70.12, 63.72, 60.37, 71.04, 71.04, 72.56, 56.71, 69.21, 71.95, 67.68, 41.16, 65.24, 69.21, 60.37, 73.17, 63.72, 71.04, 40.55, 68.9, 60.67, 75.91, 75.91, 74.39, 75.0, 50.0, 73.17, 71.34, 64.94, 74.39, 74.39, 68.6, 66.16, 71.34, 70.43, 57.93, 64.63, 75.0, 68.29, 61.59, 69.82, 74.09, 71

In [9]:
import statistics
print("Mean value: ",statistics.mean(model_a))
print("Mode value: ",statistics.mode(model_a))

Mean value:  64.7258
Mode value:  60.37


This graph depicts how many times an accuracy value was generated form the algorithm

In [10]:
sns.histplot(model_a, stat="count", multiple="stack",
             kde=False,
             element="bars", legend=True)
plt.title("Model accuracy count")
plt.xlabel("Accuracy")
plt.ylabel("Count")

Text(0, 0.5, 'Count')

Testing accuracy of the KNN Classifier which generates the second best accuracy from all of the algorithms with 13 neighbours providing the highest accuracy

In [11]:
knn = KNeighborsClassifier(n_neighbors = 13) 
knn.fit(X_train, Y_train)  
knnR = round(knn.score(X_test, Y_test) * 100, 2)
print(knnR)

70.12


Testing accuracy of the Random Forest Classifier which generates the highest accuracy from all of the algorithms

In [12]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

random_forest.score(X_test, Y_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

print(acc_random_forest)

95.82


Sources: https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8
         https://www.kaggle.com/code/allohvk/titanic-missing-age-imputation-tutorial-advanced/notebook