In [130]:
import pandas as pd
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [155]:
dataset = pd.read_csv('titanic.csv')
test_dataset = pd.read_csv('titanic_test.csv')

In [159]:
train_data = dataset[['Age', 'Fare']]
pos_missing_train = train_data[train_data.isnull().any(axis=1)].index
train_data = train_data.dropna()
print(f'Hasil train data \n{train_data} \n\nhasil pos missing value \n{pos_missing_train}')

Hasil train data 
      Age     Fare
0    22.0   7.2500
1    38.0  71.2833
2    26.0   7.9250
3    35.0  53.1000
4    35.0   8.0500
..    ...      ...
885  39.0  29.1250
886  27.0  13.0000
887  19.0  30.0000
889  26.0  30.0000
890  32.0   7.7500

[714 rows x 2 columns] 

hasil pos missing value 
Int64Index([  5,  17,  19,  26,  28,  29,  31,  32,  36,  42,
            ...
            832, 837, 839, 846, 849, 859, 863, 868, 878, 888],
           dtype='int64', length=177)


In [161]:
test_data = test_dataset[['Age', 'Fare']]
pos_missing_test = test_data[test_data.isnull().any(axis=1)].index
test_data = test_data.dropna()
print(f'Hasil test data \n{test_data} \n\nhasil pos missing value \n{pos_missing_test}')

Hasil test data 
      Age      Fare
0    34.5    7.8292
1    47.0    7.0000
2    62.0    9.6875
3    27.0    8.6625
4    22.0   12.2875
..    ...       ...
409   3.0   13.7750
411  37.0   90.0000
412  28.0    7.7750
414  39.0  108.9000
415  38.5    7.2500

[331 rows x 2 columns] 

hasil pos missing value 
Int64Index([ 10,  22,  29,  33,  36,  39,  41,  47,  54,  58,  65,  76,  83,
             84,  85,  88,  91,  93, 102, 107, 108, 111, 116, 121, 124, 127,
            132, 133, 146, 148, 151, 152, 160, 163, 168, 170, 173, 183, 188,
            191, 199, 200, 205, 211, 216, 219, 225, 227, 233, 243, 244, 249,
            255, 256, 265, 266, 267, 268, 271, 273, 274, 282, 286, 288, 289,
            290, 292, 297, 301, 304, 312, 332, 339, 342, 344, 357, 358, 365,
            366, 380, 382, 384, 408, 410, 413, 416, 417],
           dtype='int64')


In [147]:
train_label = dataset['Survived'].drop(pos_missing_train)
print(train_label)

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 714, dtype: int64


In [148]:
test_label = pd.read_csv('titanic_testlabel.csv')['Survived'].drop(pos_missing_test)
print(test_label)

0      0
1      1
2      0
3      0
4      1
      ..
409    1
411    1
412    1
414    1
415    0
Name: Survived, Length: 331, dtype: int64


In [151]:
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
min_values = scaler.data_min_
max_values = scaler.data_max_
print(train_data)
print(min_values)
print(max_values)

[[0.27117366 0.01415106]
 [0.4722292  0.13913574]
 [0.32143755 0.01546857]
 ...
 [0.23347575 0.0585561 ]
 [0.32143755 0.0585561 ]
 [0.39683338 0.01512699]]
[0. 0.]
[1. 1.]


In [150]:
test_data = (test_data - min_values) / (max_values - min_values)
print(test_data)

          Age      Fare
0    0.428248  0.015282
1    0.585323  0.013663
2    0.773813  0.018909
3    0.334004  0.016908
4    0.271174  0.023984
..        ...       ...
409  0.032420  0.026887
411  0.459663  0.175668
412  0.346569  0.015176
414  0.484795  0.212559
415  0.478512  0.014151

[331 rows x 2 columns]


In [138]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    k_values = range(1, 11)
    class_results = []

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(train_data, train_label)
        predictions = knn.predict(test_data)
        print('Knn ke-',k)
        print(predictions)
        class_results.append(predictions)


Knn ke- 1
[0 0 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 0 0 0
 1 0 0 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0
 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0
 1 0 1 0 1 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 0 1
 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0
 1 0 1 1 1 0 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 1 0
 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1
 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 0 1 0]
Knn ke- 2
[0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0

In [139]:
error_ratios = []

for k, predictions in zip(k_values, class_results):
    accuracy = accuracy_score(test_label, predictions)
    error_ratio = 1 - accuracy
    error_ratios.append(error_ratio)

In [140]:
for k, error_ratio in zip(k_values, error_ratios):
    print(f"K = {k}: Error Ratio = {error_ratio:.4f}")

K = 1: Error Ratio = 0.4441
K = 2: Error Ratio = 0.3837
K = 3: Error Ratio = 0.4411
K = 4: Error Ratio = 0.4079
K = 5: Error Ratio = 0.4139
K = 6: Error Ratio = 0.4018
K = 7: Error Ratio = 0.4230
K = 8: Error Ratio = 0.3927
K = 9: Error Ratio = 0.4230
K = 10: Error Ratio = 0.3837
