In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

## 1. dataset => water_potability.csv

In [2]:
dataset = pd.read_csv('water_potability.csv')
dataset

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890456,20791.31898,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.05786,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.54173,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.41744,8.059332,356.886136,363.266516,18.436525,100.341674,4.628771,0
4,9.092223,181.101509,17978.98634,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681736,47580.99160,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.57822,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.86938,6.303357,,402.883113,11.168946,77.488213,4.708658,1


## 2. data => ambil dataset kolom fitur (Hardness, Solids, Sulfate, Condictivity,Organic_carbon,Potability).

In [3]:
data = dataset[['Hardness','Solids','Sulfate','Conductivity','Organic_carbon','Potability']]
data

Unnamed: 0,Hardness,Solids,Sulfate,Conductivity,Organic_carbon,Potability
0,204.890456,20791.31898,368.516441,564.308654,10.379783,0
1,129.422921,18630.05786,,592.885359,15.180013,0
2,224.236259,19909.54173,,418.606213,16.868637,0
3,214.373394,22018.41744,356.886136,363.266516,18.436525,0
4,181.101509,17978.98634,310.135738,398.410813,11.558279,0
...,...,...,...,...,...,...
3271,193.681736,47580.99160,359.948574,526.424171,13.894419,1
3272,193.553212,17329.80216,,392.449580,19.903225,1
3273,175.762646,33155.57822,,432.044783,11.039070,1
3274,230.603758,11983.86938,,402.883113,11.168946,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Hardness        3276 non-null   float64
 1   Solids          3276 non-null   float64
 2   Sulfate         2495 non-null   float64
 3   Conductivity    3276 non-null   float64
 4   Organic_carbon  3276 non-null   float64
 5   Potability      3276 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 153.7 KB


## 3. train_data => ambil fitur (Hardness, Solids, Condictivity,Organic_carbon,Potability) pada data yang Sulfate != null

In [5]:
train_data = data[data['Sulfate'].notnull()].drop('Sulfate', axis=1)
train_data 

Unnamed: 0,Hardness,Solids,Conductivity,Organic_carbon,Potability
0,204.890456,20791.31898,564.308654,10.379783,0
3,214.373394,22018.41744,363.266516,18.436525,0
4,181.101509,17978.98634,398.410813,11.558279,0
5,188.313324,28748.68774,280.467916,8.399735,0
6,248.071735,28749.71654,283.651634,13.789695,0
...,...,...,...,...,...
3267,215.047358,15921.41202,390.410231,9.899115,1
3268,207.321086,17246.92035,329.266002,16.217303,1
3269,94.812545,37188.82602,439.893618,16.172755,1
3270,186.659040,26138.78019,415.886955,12.067620,1


## 4. train_label => ambil fitur (Sulfate) pada data yang Sulfate != null

In [6]:
train_label = data[data['Sulfate'].notnull()]['Sulfate']
train_label_2 = pd.qcut(train_label, 20, labels=False)
train_label_2

0       16
3       14
4        5
5        8
6       18
        ..
3267     5
3268     4
3269     0
3270    12
3271    14
Name: Sulfate, Length: 2495, dtype: int64

In [7]:
train_data_fill = data[data['Sulfate'].notnull()]
train_data_fill.loc[:, 'Sulfate Fill'] = train_label_2

# menyimpan data Sulfate yang tidak null yang telah diencoding dengan dengan rata-ratanya berdasarkan kelasnya
means_sulfate_category = train_data_fill.groupby('Sulfate Fill')['Sulfate'].mean()
dict_sulfate_category = means_sulfate_category.to_dict()
dict_sulfate_category

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_fill.loc[:, 'Sulfate Fill'] = train_label_2


{0: 244.414115772,
 1: 276.2308840408,
 2: 288.5825741744,
 3: 297.8284644403226,
 4: 304.5590343264,
 5: 311.1413905128,
 6: 316.7455590524193,
 7: 321.3452813464,
 8: 325.8120394496,
 9: 330.500319732,
 10: 335.1246990258064,
 11: 339.6877814744,
 12: 345.0332080584,
 13: 350.6588919435484,
 14: 356.58145500880005,
 15: 363.7611827712,
 16: 371.41497111612904,
 17: 380.1640293496,
 18: 393.318720288,
 19: 422.6340799472}

## 5. test_data => ambil fitur (Hardness, Solids, Condictivity,Organic_carbon,Potability) pada data yang Sulfate = null

In [8]:
test_data = data[data['Sulfate'].isnull()].drop('Sulfate',axis=1)
test_data

Unnamed: 0,Hardness,Solids,Conductivity,Organic_carbon,Potability
1,129.422921,18630.05786,592.885359,15.180013,0
2,224.236259,19909.54173,418.606213,16.868637,0
11,218.693300,18767.65668,364.098230,14.525746,0
14,205.344982,28388.00489,444.645352,13.228311,0
16,211.049406,30980.60079,315.141267,20.397022,0
...,...,...,...,...,...
3266,169.087052,14622.74549,464.525552,11.083027,1
3272,193.553212,17329.80216,392.449580,19.903225,1
3273,175.762646,33155.57822,432.044783,11.039070,1
3274,230.603758,11983.86938,402.883113,11.168946,1


## 6. train_data => lakukan normalisasi pada train_data dengan Min-Max 0-1 (catat nilai min dan max setiap atribut)

In [9]:
scaler = MinMaxScaler()

In [10]:
normalized_train_data = scaler.fit_transform(train_data)
normalized_train_data

array([[0.5833823 , 0.33609646, 0.65737516, 0.32974079, 0.        ],
       [0.61851651, 0.35624379, 0.29298545, 0.65452157, 0.        ],
       [0.49524445, 0.28992169, 0.35668464, 0.37724796, 0.        ],
       ...,
       [0.17554454, 0.60532181, 0.43187239, 0.56326524, 1.        ],
       [0.51583505, 0.42389469, 0.38836022, 0.39778032, 1.        ],
       [0.54185408, 0.7759474 , 0.58870938, 0.47142165, 1.        ]])

## 7. test_data => lakukan normalisasi pada test_data dengan Min-Max 0-1 (dengan nilai min dan max setiap atribut pada Langkah 6)

In [11]:
normalized_test_data = scaler.transform(test_data)
normalized_test_data

array([[0.3037757 , 0.30061142, 0.70917055, 0.52324613, 0.        ],
       [0.65505835, 0.32161885, 0.39328888, 0.59131739, 0.        ],
       [0.63452173, 0.30287061, 0.29449294, 0.49687151, 0.        ],
       ...,
       [0.475464  , 0.53910122, 0.41764635, 0.35631774, 1.        ],
       [0.67864988, 0.19148981, 0.3647907 , 0.36155328, 1.        ],
       [0.54711726, 0.28048408, 0.22808556, 0.56195963, 1.        ]])

##  8. class_result => Lakukan klasifikasi test_data terhadap train_data dengan 3-NN (output mepakai class pada train_label)

In [12]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(normalized_train_data, train_label_2)

class_result = knn.predict(normalized_test_data)
class_result[:10]

array([ 3,  1,  2,  7,  2, 10,  1,  4,  7,  0], dtype=int64)

## 9. data (Sulfate) => lakukan pengisian missing values pada data yang Sulfate=null dengan nilai class_result

In [13]:
# fill the missing values with the predicted values that mapped to the dict_sulfate_category and store it as new data
data = data[data['Sulfate'].isnull()]
data['Sulfate'] = class_result
data['Sulfate'] = data['Sulfate'].map(dict_sulfate_category)
data 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sulfate'] = class_result
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sulfate'] = data['Sulfate'].map(dict_sulfate_category)


Unnamed: 0,Hardness,Solids,Sulfate,Conductivity,Organic_carbon,Potability
1,129.422921,18630.05786,297.828464,592.885359,15.180013,0
2,224.236259,19909.54173,276.230884,418.606213,16.868637,0
11,218.693300,18767.65668,288.582574,364.098230,14.525746,0
14,205.344982,28388.00489,321.345281,444.645352,13.228311,0
16,211.049406,30980.60079,288.582574,315.141267,20.397022,0
...,...,...,...,...,...,...
3266,169.087052,14622.74549,325.812039,464.525552,11.083027,1
3272,193.553212,17329.80216,321.345281,392.449580,19.903225,1
3273,175.762646,33155.57822,288.582574,432.044783,11.039070,1
3274,230.603758,11983.86938,244.414116,402.883113,11.168946,1


## 10. lakukan hold out validation model pada data dengan 80% sebagai data training 20% sebagai data test

In [14]:
train_data = data.drop('Potability',axis=1)
train_label = data['Potability']

train_data, test_data, train_label, test_label = train_test_split(train_data, train_label, test_size=0.2, random_state=100, stratify=train_label)

## 11. train_data => ambil fitur (Hardness, Solids, Sulfate, Condictivity,Organic_carbon) pada data training dari langkah 10

In [15]:
train_data

Unnamed: 0,Hardness,Solids,Sulfate,Conductivity,Organic_carbon
1699,267.533027,11870.42485,244.414116,417.412531,15.252794
757,231.442553,15063.81497,244.414116,422.293819,15.027088
187,284.098351,27088.47646,363.761183,512.277310,16.912142
171,233.850345,21136.28676,339.687781,332.737587,9.745282
1712,186.807751,19790.85693,335.124699,489.796888,19.670703
...,...,...,...,...,...
3048,219.576215,16478.07143,316.745559,362.749220,13.054672
166,170.099236,39906.19027,325.812039,515.386742,13.516078
1367,199.638124,15201.33995,316.745559,306.023975,15.212798
65,212.306618,21815.07415,288.582574,362.108004,14.933013


## 12. train_label => ambil fitur (Potability) pada data training dari langkah 10

In [16]:
data['Potability'].value_counts()

0    488
1    293
Name: Potability, dtype: int64

In [17]:
train_label.value_counts()

0    390
1    234
Name: Potability, dtype: int64

## 13. test_data => ambil fitur (Hardness, Solids, Sulfate, Condictivity,Organic_carbon) pada data test dari langkah 10

In [18]:
test_data

Unnamed: 0,Hardness,Solids,Sulfate,Conductivity,Organic_carbon
2295,172.111514,26595.37354,297.828464,348.843576,7.877739
1936,165.892970,11566.87555,330.500320,369.289777,7.401868
2489,208.188086,24870.82594,311.141391,441.052571,19.095664
554,198.234408,26486.18144,244.414116,472.693725,11.032240
3069,199.599784,10452.72441,335.124699,316.257513,23.042376
...,...,...,...,...,...
1269,190.992873,26895.25796,304.559034,660.254946,18.125202
2464,190.531947,16191.42609,345.033208,564.203178,18.443031
2663,156.422941,14400.71863,304.559034,483.745716,7.897724
2640,230.430056,18254.11066,288.582574,294.582832,12.908755


## 14. test_label => ambil fitur(Potability) pada data test dari langkah 10

In [19]:
data['Potability'].value_counts()

0    488
1    293
Name: Potability, dtype: int64

In [20]:
test_label.value_counts()

0    98
1    59
Name: Potability, dtype: int64

## 15. train_data => lakukan normalisasi pada train_data dengan Min-Max 0-1 (catat nilai min dan max setiap atribut)

In [21]:
scaler = MinMaxScaler()

In [22]:
normalized_train_data = scaler.fit_transform(train_data)
normalized_train_data

array([[0.74725679, 0.19846546, 0.        , 0.47859646, 0.44235828],
       [0.5831722 , 0.25534898, 0.        , 0.48849846, 0.4327115 ],
       [0.8225707 , 0.469543  , 0.6696616 , 0.67103567, 0.51327931],
       ...,
       [0.43857402, 0.2577987 , 0.40585489, 0.25263772, 0.44064883],
       [0.49617107, 0.37560845, 0.24783115, 0.36640773, 0.42869071],
       [0.43833741, 0.41738467, 0.48303345, 0.50119611, 0.47525589]])

## 16. test_data => lakukan normalisasi pada test_data dengan Min-Max 0-1 (dengan nilai min dan max setiap atribut pada Langkah 15)

In [23]:
normalized_test_data = scaler.transform(test_data)
normalized_test_data[:10]

array([[0.31342486, 0.46075941, 0.29971024, 0.33949999, 0.12714605],
       [0.28515238, 0.19305837, 0.48303345, 0.38097641, 0.1068072 ],
       [0.47744625, 0.43004023, 0.37440965, 0.52655178, 0.60660374],
       [0.43219206, 0.45881439, 0.        , 0.59073786, 0.26197045],
       [0.43839971, 0.17321211, 0.50898104, 0.2733971 , 0.77528746],
       [0.61108813, 0.37661863, 0.29971024, 0.39057532, 0.46593626],
       [0.60043988, 0.17305308, 0.        , 0.28067035, 0.1446188 ],
       [0.40419532, 0.58047228, 0.24783115, 0.62532815, 0.1991304 ],
       [0.14896485, 0.48435076, 0.43166413, 0.47497943, 0.30119212],
       [0.59769092, 0.3799246 , 0.43166413, 0.43799908, 0.42717559]])

## 17. class_result => Lakukan klasifikasi test_data terhadap train_data dengan 3-NN (output memakai class pada train_label)

In [24]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(normalized_train_data, train_label)

class_result = knn.predict(normalized_test_data)
class_result[:10]

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)

In [25]:
comparison = pd.DataFrame({'result':class_result, 'label':test_label})
comparison

Unnamed: 0,result,label
2295,1,0
1936,0,1
2489,0,0
554,1,0
3069,0,0
...,...,...
1269,0,0
2464,0,0
2663,0,0
2640,0,0


In [26]:
accuracy = accuracy_score(test_label, class_result)
print(f'Accuracy: {accuracy * 100} %')

Accuracy: 71.3375796178344 %


In [27]:
error_rate = 1 - accuracy
print(f'Error Rate: {error_rate * 100} %')

Error Rate: 28.66242038216561 %


In [28]:
cm = confusion_matrix(test_label, class_result)
cm

array([[82, 16],
       [29, 30]], dtype=int64)

In [29]:
report = classification_report(test_label, class_result)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.84      0.78        98
           1       0.65      0.51      0.57        59

    accuracy                           0.71       157
   macro avg       0.70      0.67      0.68       157
weighted avg       0.71      0.71      0.70       157

