In [516]:
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [517]:
# Step 1: read the Titanic training table (titanic.csv) into `dataset`.
dataset = pd.read_csv(filepath_or_buffer='titanic.csv')
print(dataset)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [518]:
# Step 2: keep only the columns used downstream
# (Sex, Age, Pclass, Fare, Survived).
# The explicit copy gives `data` its own storage, independent of `dataset`.
data = dataset[['Sex', 'Age', 'Pclass', 'Fare', 'Survived']].copy()
print(data)

        Sex   Age  Pclass     Fare  Survived
0      male  22.0       3   7.2500         0
1    female  38.0       1  71.2833         1
2    female  26.0       3   7.9250         1
3    female  35.0       1  53.1000         1
4      male  35.0       3   8.0500         0
..      ...   ...     ...      ...       ...
886    male  27.0       2  13.0000         0
887  female  19.0       1  30.0000         1
888  female   NaN       3  23.4500         0
889    male  26.0       1  30.0000         1
890    male  32.0       3   7.7500         0

[891 rows x 5 columns]


In [519]:
# Step 3: predictors (Sex, Pclass, Fare, Survived) for the rows whose Age is known.
# Rows and columns are selected in a single .loc call and the result is copied,
# so the later in-place re-encoding of the 'Sex' column works on an independent
# frame instead of on the result of chained indexing.
train_data = data.loc[data['Age'].notnull(), ['Sex', 'Pclass', 'Fare', 'Survived']].copy()
print(train_data)

        Sex  Pclass     Fare  Survived
0      male       3   7.2500         0
1    female       1  71.2833         1
2    female       3   7.9250         1
3    female       1  53.1000         1
4      male       3   8.0500         0
..      ...     ...      ...       ...
885  female       3  29.1250         0
886    male       2  13.0000         0
887  female       1  30.0000         1
889    male       1  30.0000         1
890    male       3   7.7500         0

[714 rows x 4 columns]


In [520]:
# Step 4: target for the age model - the known (non-null) ages, cast to int
# so that they can be used as class labels by the classifier.
train_label = data['Age'].dropna().astype(int)
print(train_label)

0      22
1      38
2      26
3      35
4      35
       ..
885    39
886    27
887    19
889    26
890    32
Name: Age, Length: 714, dtype: int64


In [521]:
# Step 5: predictors (Sex, Pclass, Fare, Survived) for the rows whose Age is missing.
# Rows and columns are selected in a single .loc call and the result is copied,
# so the later in-place re-encoding of the 'Sex' column works on an independent
# frame instead of on the result of chained indexing.
test_data = data.loc[data['Age'].isnull(), ['Sex', 'Pclass', 'Fare', 'Survived']].copy()
print(test_data)

        Sex  Pclass     Fare  Survived
5      male       3   8.4583         0
17     male       2  13.0000         1
19   female       3   7.2250         1
26     male       3   7.2250         0
28   female       3   7.8792         1
..      ...     ...      ...       ...
859    male       3   7.2292         0
863  female       3  69.5500         0
868    male       3   9.5000         0
878    male       3   7.8958         0
888  female       3  23.4500         0

[177 rows x 4 columns]


In [522]:
# Learn the Sex -> integer mapping from the training rows only, then apply that
# same mapping to both the training and the test frame.
label_encoder = LabelEncoder().fit(train_data['Sex'])
train_data['Sex'] = label_encoder.transform(train_data['Sex'])
test_data['Sex'] = label_encoder.transform(test_data['Sex'])

In [523]:
# Step 6: min-max normalise train_data to the 0-1 range and record the
# per-column minimum and maximum seen during fitting.
scaler = MinMaxScaler().fit(train_data)
train_data = scaler.transform(train_data)
min_values, max_values = scaler.data_min_, scaler.data_max_
print(train_data)

[[1.         1.         0.01415106 0.        ]
 [0.         0.         0.13913574 1.        ]
 [0.         1.         0.01546857 1.        ]
 ...
 [0.         0.         0.0585561  1.        ]
 [1.         0.         0.0585561  1.        ]
 [1.         1.         0.01512699 0.        ]]


In [524]:
# Step 7: min-max normalise test_data using the per-column min / max
# recorded in step 6 (not the test data's own range).
test_data = test_data.sub(min_values).div(max_values - min_values)
print(test_data)

     Sex  Pclass      Fare  Survived
5    1.0     1.0  0.016510       0.0
17   1.0     0.5  0.025374       1.0
19   0.0     1.0  0.014102       1.0
26   1.0     1.0  0.014102       0.0
28   0.0     1.0  0.015379       1.0
..   ...     ...       ...       ...
859  1.0     1.0  0.014110       0.0
863  0.0     1.0  0.135753       0.0
868  1.0     1.0  0.018543       0.0
878  1.0     1.0  0.015412       0.0
888  0.0     1.0  0.045771       0.0

[177 rows x 4 columns]


In [525]:
# Step 8: classify test_data against train_data with 3-nearest-neighbours;
# the predicted classes are the values found in train_label.
knn = KNeighborsClassifier(n_neighbors=3).fit(train_data, train_label)
class_result = knn.predict(test_data)
print(class_result)

[21 31 13 30 19 28 58 22 20 28 20 32 22  2 28 30  3 28 20 22 20 20 28 21
  4 20 40  4  9 22 17 11 22 45 24 16 23 31  1 40 22 11 40 28 10 21 14  1
 22 18 40 21 29 22 24 45 30 27 22  1 17 20 17 11  4 35 28  1 45 30 19 19
 32 13 22 24 28 21 10 28 19 24 23 28 22 40  1 16 16 20 35 40 20 24 21 22
 31 24 10 16 26 40 21 34 20 24 30 17 19 17  4 19  0 18 19 40 20 23 17 22
 14 17 20 21  6 30 28 28 20  1 40 21 38 21 32 22 28 19 28 19 35 24 23 32
 16  3 62 32 16 24 28 28 26 19 36 24 30 40 21 24 40 16 29 38 20 11 25 17
 20 26 11 19 17 16 28 28  3]




In [526]:
# Step 9: fill the missing Age values in `data` with the predictions
# held in class_result.
data.loc[data['Age'].isna(), 'Age'] = class_result
print(data.loc[:, 'Age'])

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     3.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64


In [527]:
# Step 10: read the evaluation passengers (titanic_test.csv) into `test_dataset`.
test_dataset = pd.read_csv(filepath_or_buffer='titanic_test.csv')
print(test_dataset)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 

In [528]:
# Step 11: features for the survival model (Sex, Age, Pclass, Fare), taken
# from `data` as an independent copy.
train_data = data[['Sex', 'Age', 'Pclass', 'Fare']].copy()
print(train_data)

        Sex   Age  Pclass     Fare
0      male  22.0       3   7.2500
1    female  38.0       1  71.2833
2    female  26.0       3   7.9250
3    female  35.0       1  53.1000
4      male  35.0       3   8.0500
..      ...   ...     ...      ...
886    male  27.0       2  13.0000
887  female  19.0       1  30.0000
888  female   3.0       3  23.4500
889    male  26.0       1  30.0000
890    male  32.0       3   7.7500

[891 rows x 4 columns]


In [529]:
# Step 12: target for the survival model - the Survived column of `data`,
# kept as a single-column DataFrame.
train_label = data[['Survived']].copy()
print(train_label)

     Survived
0           0
1           1
2           1
3           1
4           0
..        ...
886         0
887         1
888         0
889         1
890         0

[891 rows x 1 columns]


In [530]:
# Step 13: features (Sex, Age, Pclass, Fare) of the evaluation passengers,
# keeping only the rows that have no missing values.
test_data = test_dataset.loc[:, ['Sex', 'Age', 'Pclass', 'Fare']]

# Index labels of the rows that dropna() is about to remove. The mask looks at
# every selected column - the same criterion dropna() applies - so this list
# always matches the removed rows. It is needed later to remove the same rows
# from the ground-truth labels.
missing_index = test_data.index[test_data.isna().any(axis=1)].tolist()

test_data = test_data.dropna()
print(test_data)
print(len(missing_index))

        Sex   Age  Pclass      Fare
0      male  34.5       3    7.8292
1    female  47.0       3    7.0000
2      male  62.0       2    9.6875
3      male  27.0       3    8.6625
4    female  22.0       3   12.2875
..      ...   ...     ...       ...
409  female   3.0       3   13.7750
411  female  37.0       1   90.0000
412  female  28.0       3    7.7750
414  female  39.0       1  108.9000
415    male  38.5       3    7.2500

[331 rows x 4 columns]
87


In [531]:
# Step 14: read the ground-truth labels (titanic_testlabel.csv); the row order
# is expected to correspond to the test rows of step 13.
test_label = pd.read_csv(filepath_or_buffer='titanic_testlabel.csv')
print(test_label)

     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [532]:
# Re-fit the existing encoder on the new training 'Sex' column, then encode
# both frames with that single mapping.
label_encoder.fit(train_data['Sex'])
train_data['Sex'] = label_encoder.transform(train_data['Sex'])
test_data['Sex'] = label_encoder.transform(test_data['Sex'])

In [533]:
# Step 15: min-max normalise the new train_data to the 0-1 range (re-fitting
# the existing scaler) and record the per-column minimum and maximum.
scaler.fit(train_data)
train_data = scaler.transform(train_data)
min_values, max_values = scaler.data_min_, scaler.data_max_
print(train_data)

[[1.         0.275      1.         0.01415106]
 [0.         0.475      0.         0.13913574]
 [0.         0.325      1.         0.01546857]
 ...
 [0.         0.0375     1.         0.04577135]
 [1.         0.325      0.         0.0585561 ]
 [1.         0.4        1.         0.01512699]]


In [534]:
# Step 16: min-max normalise test_data using the per-column min / max
# recorded in step 15 (not the test data's own range).
test_data = test_data.sub(min_values).div(max_values - min_values)
print(test_data)

     Sex      Age  Pclass      Fare
0    1.0  0.43125     1.0  0.015282
1    0.0  0.58750     1.0  0.013663
2    1.0  0.77500     0.5  0.018909
3    1.0  0.33750     1.0  0.016908
4    0.0  0.27500     1.0  0.023984
..   ...      ...     ...       ...
409  0.0  0.03750     1.0  0.026887
411  0.0  0.46250     0.0  0.175668
412  0.0  0.35000     1.0  0.015176
414  0.0  0.48750     0.0  0.212559
415  1.0  0.48125     1.0  0.014151

[331 rows x 4 columns]


In [535]:
# Step 17: classify test_data against train_data with 3-nearest-neighbours;
# the predicted classes are the values found in train_label.
knn = KNeighborsClassifier(n_neighbors=3)
# The classifier expects a 1-D target. Flattening here avoids the
# DataConversionWarning raised when a single-column (n, 1) table is passed,
# and is a no-op if train_label is already one-dimensional.
knn.fit(train_data, train_label.to_numpy().ravel())
class_result = knn.predict(test_data)
print(class_result)

  return self._fit(X, y)


[0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1
 1 0 0 1 1 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 1 1
 0 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0
 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0
 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 0 0 1 0
 0 0 0 1 1 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0
 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 0 1
 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0]


In [536]:
# Step 18: compare the predictions in class_result with the ground-truth labels;
# every mismatch counts as one error.
#
# First remove the label rows whose features were dropped for missing values.
# Filtering with isin() (instead of walking missing_index with a manual counter
# and dropping in place) cannot run past the end of missing_index and leaves
# test_label unchanged when the cell is executed again.
test_label = test_label.loc[~test_label.index.isin(missing_index)]

# Guard against a silent misalignment between predictions and labels.
if len(test_label) != len(class_result):
  raise ValueError(
      "test_label has %d rows but class_result has %d predictions"
      % (len(test_label), len(class_result))
  )

# Positional comparison: class_result is ordered like the remaining label rows.
error = int((class_result != test_label['Survived'].to_numpy()).sum())
print(error)

65


In [538]:
# Step 19: error ratio = number of errors divided by the number of test rows,
# expressed as a percentage.
error_ratio = error / len(test_data) * 100

print("Jumlah Error:", error)
print("Error Ratio (%):", error_ratio)

Jumlah Error: 65
Error Ratio (%): 19.637462235649547
