In [None]:
import pandas as pd
import numpy as np

# Load Data

In [None]:
# Load the Iris dataset as a DataFrame.
# The CSV is fetched over HTTP from a Google Drive download URL built from the file id.
dwn_url='https://drive.google.com/uc?id=' + '1C5-s_COuWrjn52wzs-QtEYc6QE_IlR2O'
df = pd.read_csv(dwn_url)

# Data Exploration

In [None]:
# Show the first few rows of the data
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Show the last few rows of the data
df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [None]:
# Show the unique values of the Species column
# (Species is the class label; the output shows it has 3 classes)
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
# Show the DataFrame metadata (columns, non-null counts, dtypes, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [None]:
# Show summary statistics of the numeric columns
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [None]:
# Show the number of rows and columns of the dataframe
df.shape

(150, 6)

# Preprocessing

In [None]:
# Remove the 'Id' column from the dataframe (it is a row identifier, not a feature)
df = df.drop(columns=['Id'])

In [None]:
# Shuffle the rows of the data.
# frac=1 draws every row once regardless of the dataset size, and the fixed
# random_state makes the shuffle (and therefore the folds and the reported
# accuracy) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.4,3.7,1.5,0.2,Iris-setosa
1,4.6,3.1,1.5,0.2,Iris-setosa
2,6.5,3.2,5.1,2.0,Iris-virginica
3,5.1,3.8,1.6,0.2,Iris-setosa
4,5.0,3.3,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.4,2.7,5.3,1.9,Iris-virginica
146,5.9,3.0,5.1,1.8,Iris-virginica
147,4.8,3.4,1.6,0.2,Iris-setosa
148,7.7,2.8,6.7,2.0,Iris-virginica


In [None]:
# Split the data into 3 folds.
# Each fold is a (test, train) tuple: 50 rows for testing and the remaining
# 100 rows for training, both re-indexed from 0.
fold1 = (df.iloc[0:50].reset_index(drop=True), df.iloc[50:150].reset_index(drop=True))
fold2 = (df.iloc[50:100].reset_index(drop=True), pd.concat([df.iloc[0:50], df.iloc[100:150]]).reset_index(drop=True))
# The training part must exclude the test rows (100:150); training on rows
# 0:150 would leak the test samples into the training set.
fold3 = (df.iloc[100:150].reset_index(drop=True), df.iloc[0:100].reset_index(drop=True))
# Show the 100 training rows of fold2
test, train = fold2
print(train)

    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm         Species
0             5.4           3.7            1.5           0.2     Iris-setosa
1             4.6           3.1            1.5           0.2     Iris-setosa
2             6.5           3.2            5.1           2.0  Iris-virginica
3             5.1           3.8            1.6           0.2     Iris-setosa
4             5.0           3.3            1.4           0.2     Iris-setosa
..            ...           ...            ...           ...             ...
95            6.4           2.7            5.3           1.9  Iris-virginica
96            5.9           3.0            5.1           1.8  Iris-virginica
97            4.8           3.4            1.6           0.2     Iris-setosa
98            7.7           2.8            6.7           2.0  Iris-virginica
99            5.2           3.4            1.4           0.2     Iris-setosa

[100 rows x 5 columns]


In [None]:
# Normalization
# Rescale the data with min-max normalization
def norm(df):
  """Min-max normalize ``df`` so every column is rescaled to the range [0, 1].

  Parameters:
    df: pandas DataFrame (or Series) of numeric values.

  Returns:
    A new object of the same shape holding ``(value - min) / (max - min)``,
    computed per column. A constant column (max == min) is mapped to 0
    instead of NaN.
  """
  lo = df.min()
  span = df.max() - lo
  # A zero range would turn the whole column into NaN (0/0); dividing by 1
  # instead keeps such a column at 0 without affecting any other column.
  if isinstance(span, pd.Series):
    span = span.mask(span == 0, 1)
  elif span == 0:
    span = 1
  return (df - lo) / span

In [None]:
# Split the dataframe into features and class labels
# Features: every column except Species (Species is used as the class)
X = df.drop(columns=['Species'])
# Class labels: the Species column
Y = df['Species']

In [None]:
# Apply min-max normalization to the feature data
X = norm(X)

In [None]:
# Display the feature matrix X (normalized in the previous cell)
X

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,0.305556,0.708333,0.084746,0.041667
1,0.083333,0.458333,0.084746,0.041667
2,0.611111,0.500000,0.694915,0.791667
3,0.222222,0.750000,0.101695,0.041667
4,0.194444,0.541667,0.067797,0.041667
...,...,...,...,...
145,0.583333,0.291667,0.728814,0.750000
146,0.444444,0.416667,0.694915,0.708333
147,0.138889,0.583333,0.101695,0.041667
148,0.944444,0.333333,0.966102,0.791667


# KNN Model

In [None]:
# Compute the distance between two points using the Euclidean distance
def euclidean(x1,x2):
  """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

  Parameters:
    x1, x2: array-likes of equal length that support element-wise
      subtraction (numpy arrays or pandas Series).

  Returns:
    The square root of the sum of the squared element-wise differences.
  """
  # Square each per-feature difference *before* summing. Squaring the sum
  # instead would let positive and negative differences cancel each other.
  return np.sqrt(np.sum((x1-x2)**2))

In [None]:
# Euclidean distance between the first and the second row of the data
euclidean(X.iloc[0],X.iloc[1])

0.4722222222222223

In [None]:
# k-nearest-neighbours prediction for a single query point
def knn(X_train, y_train, X_test, k): # k is the number of neighbours to consult
  """Predict the class of one sample from its k nearest training rows.

  Parameters:
    X_train: DataFrame of training features.
    y_train: class labels for the rows of ``X_train``.
    X_test: a single sample (one row of features) to classify.
    k: number of nearest neighbours that vote.

  Returns:
    The most frequent class among the k closest training rows; on a tie,
    the first value returned by ``Series.mode()``.
  """
  # Distance from every training row to the query sample, in row order
  distances = [euclidean(X_train.iloc[i], X_test) for i in range(X_train.shape[0])]
  # Work on a copy so the caller's training frame is left untouched
  neighbours = X_train.copy()
  neighbours['Dist'] = distances
  neighbours['Class'] = y_train
  # Closest rows first
  ranked = neighbours.sort_values(by='Dist').reset_index(drop=True)
  # Majority vote among the k nearest rows
  top_k_labels = ranked.iloc[:k].Class
  return top_k_labels.mode()[0]

# Evaluation 

In [None]:
# Compute the accuracy of the predictions against the true class labels
def acc(y_pred, y_true):
  """Return the fraction of predictions that match the true labels.

  Parameters:
    y_pred: sequence of predicted labels.
    y_true: sequence (list, array or pandas Series) of true labels, paired
      with ``y_pred`` by position.

  Returns:
    Number of matching positions divided by ``len(y_pred)``.
  """
  # Convert to plain arrays so elements are paired by position. Indexing a
  # pandas Series with [i] is label-based and fails (or pairs the wrong
  # elements) when its index is not 0..n-1.
  pred = np.asarray(y_pred)
  true = np.asarray(y_true)
  n = len(pred)
  hits = sum(1 for i in range(n) if pred[i] == true[i])
  return hits/n

In [None]:
# Evaluasi model dengan menggunakan data fold
def evaluate(fold, k):
  test, train = fold
  X_train, Y_train = train.drop('Species', axis=1), train.Species
  X_test, Y_test = test.drop('Species', axis=1), test.Species
  X_train = norm(X_train)
  X_test = norm(X_test)
  y_preds = []
  for row in range(X_test.shape[0]):
    y_preds.append(knn(X_train,Y_train,X_test.iloc[row],k))
  return (acc(y_preds, Y_test))

In [None]:
# Main program: evaluate every fold and report the mean accuracy
k = 5
folds = [fold1, fold2, fold3]
accs = [evaluate(fold, k) for fold in folds]
# Divide by the actual number of folds so the mean stays correct if folds change
print(f'Menggunakan k : {k}, dengan rata-rata akurasi : {sum(accs)/len(accs)}')

Menggunakan k : 5, dengan rata-rata akurasi : 0.7866666666666666
