In [1]:
#Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
#Display the dataset
dataset = pd.read_csv('titanic3.csv')

In [3]:
print( len(dataset) )
print( dataset.head() )

1309
   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                   

In [4]:
zero_not_accepted = dataset.dropna(subset = ['age', 'ticket', 'fare'])
zero_not_accepted.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
mydata = zero_not_accepted.drop(columns = ['name','cabin','boat','body','home.dest','ticket','embarked'])

In [6]:
mydata.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29.0,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2.0,1,2,151.55
3,1,0,male,30.0,1,2,151.55
4,1,0,female,25.0,1,2,151.55


In [7]:
#Changing the given values to numerical
mydata['sex'].unique()
sex = {'male': 1, 'female': 0}
mydata['sex'].replace(sex, inplace=True)

In [8]:
mydata.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,0,29.0,0,0,211.3375
1,1,1,1,0.9167,1,2,151.55
2,1,0,0,2.0,1,2,151.55
3,1,0,1,30.0,1,2,151.55
4,1,0,0,25.0,1,2,151.55


In [9]:
mydata.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

In [10]:
#Performing the KNN algorithm
X = mydata.drop(columns=['survived'])
y = mydata['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
prediction = KNeighborsClassifier()
prediction.fit(X_train,y_train)
y_prediction=prediction.predict(X_test)

In [11]:
#Display the accuracy
accuracy = accuracy_score(y_test, y_prediction)
accuracy

0.6603053435114504

In [None]:
#Comparison
# The K-Nearest Neighbor (KNN) algorithm determines the similarities of a certain variable between two given datasets. There are categories that will be a basis where the variable belongs. Also, this algorithm is the best when the user utilizes small datasets. On the other hand, the Logistic Regression algorithm utilizes probability in determining whether certain data falls into a type of classification. The difference between logistic regression and KNN is that logistic regression utilizes large datasets. In the accuracy part, the result of logistic regression is more accurate which is, 70%-80% while KNN has a 66%-75% accuracy rate. 

In [None]:
#Analysis
#The goal of the program is to determine who are the individuals that can survive the Titanic tragedy. There is a part of the dataset that is not needed. In order to detect the gaps in the data, I used the “isna” function. In the use of this function, the data is organized and clean wherein the application of machine learning in the program will be easier. Also, the “inplace” function was used to change the given values to numerical form. This would allow the program to interpret the data. Lastly, the application of the algorithms was used to determine the accuracy rate. The trained dataset is already modified and the prediction of machine learning will execute. 

In [None]:
#Conclusion
#In conclusion, I was able to create two machine learning algorithm programs that determine the kind of individuals who are more likely to survive in the Titanic voyage. The two machine learning algorithms chosen are K-Nearest Neighbor (KNN) and Logistic Regression. I imported some necessary libraries needed to the program such as Pandas and NumPy. I also used the dataset of titanic passengers from Kaggle which determines the age, fare, names, etc. of each individual. In order to find the accuracy rate of each program, I modified the dataset and remove the unnecessary data. Also, I turned the NaN values to zero to clean it. After modifying the data, I applied the algorithm to get the accuracy rate. 