## **GET DATASET**

In [None]:
# Example simple dataset: wget saves the file under the last URL path segment,
# KNN_blank.xlsx, which is the workbook named by the optional pd.read_excel loaders.
# NOTE(review): this looks like a tokenised direct-download link and may expire — confirm it still resolves.
!wget https://download940.mediafire.com/nqiefe3gcofghGCqewMc66UQM7ORYEPCunmX3vndy4U-Ml0lngjAvRjJAvJB9JO9xM9d1UXqYvVkwFf7WcTClNjpHKLpZvxepxVYtyEAAxiymjE-45XBm8NRxrfitbfkfOrazdt7rMR9iBRxP9J1QSyIRtfp3-jyReg7gV9FGhN_55HZ/6uzdf0zfhbo73rk/KNN_blank.xlsx

In [None]:
# Iris classification dataset from the UCI repository, downloaded as iris.zip
# (the archive is extracted in the next cell).
!wget https://archive.ics.uci.edu/static/public/53/iris.zip

In [None]:
!unzip iris.zip

## **PREPARING REQUIREMENTS**

In [162]:
import numpy as np
import openpyxl
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler
from mpl_toolkits.mplot3d import Axes3D
# from sklearn.metrics import accuracy_score

##### PREPARING DATAFRAME

In [None]:
# Column names: four measurements followed by the class label.
cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'class',
]

# UCI dataset (active source): column names are supplied explicitly via `names`.
df = pd.read_csv("iris.data", names=cols, sep=',')

# Alternative source: example workbook, sheet 1 (the 'No.' column is dropped)
# df = pd.read_excel("KNN_blank.xlsx").drop(columns=['No.'])

# Alternative source: example workbook, sheet 2 (Iris), reusing `cols` from above
# df = pd.read_excel("KNN_blank.xlsx", sheet_name='Sheet2', names=cols)

# Preview the first rows of whichever source was loaded.
df.head()

##### PLOTTING HISTOGRAM FROM DATAFRAME

In [None]:
# UCI Dataset
# For every feature column, overlay one density-normalised histogram per class.
# Each entry is (value stored in the "class" column, bar colour, legend label).
uci_class_styles = [
  ("Iris-setosa", 'blue', 'Iris Setosa'),  # legend label previously misspelled "Sentosa"
  ("Iris-versicolor", 'red', 'Iris Versicolor'),
  ("Iris-virginica", 'green', 'Iris Virginica'),
]

for label in df.columns[:-1]:
  for class_value, color, legend_label in uci_class_styles:
    # Rows of this class only, restricted to the current feature column.
    class_values = df.loc[df["class"] == class_value, label]
    plt.hist(class_values, color=color, label=legend_label, alpha=0.7, density=True)
  plt.title(label)
  plt.ylabel("Probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()

In [None]:
# Example Dataset Sheet 1
# Overlaid, density-normalised histograms of every feature column, split by the
# two values of the "Y" column that this cell filters on.
for label in df.columns[:-1]:
  jelek_values = df.loc[df["Y"] == "Jelek", label]
  baik_values = df.loc[df["Y"] == "Baik", label]

  plt.hist(jelek_values, color='blue', label='Jelek', alpha=0.7, density=True)
  plt.hist(baik_values, color='red', label='Baik', alpha=0.7, density=True)

  plt.title(label)
  plt.xlabel(label)
  plt.ylabel("Probability")
  plt.legend()
  plt.show()

In [None]:
# Example Dataset Sheet 2 (Iris)
# Class value -> bar colour; the class value doubles as the legend label.
# NOTE(review): "Iris Sentosa" is presumably how the sheet spells this class — confirm against the workbook.
sheet2_class_colors = {
  "Iris Sentosa": 'blue',
  "Iris Versicolor": 'red',
}

for label in df.columns[:-1]:
  for class_name, color in sheet2_class_colors.items():
    class_rows = df[df["class"] == class_name]
    plt.hist(class_rows[label], color=color, label=class_name, alpha=0.7, density=True)
  plt.title(label)
  plt.xlabel(label)
  plt.ylabel("Probability")
  plt.legend()
  plt.show()

In [None]:
# List the distinct values found in the 'class' column.
df['class'].unique()

##### FINDING EUCLIDEAN DISTANCES

In [None]:
# 2D Plot
# Example Dataset Sheet 1
# Feature matrix: every column except the last one.
data = df[df.columns[:-1]].values
point_of_interest = [7, 4]

# Euclidean distance from each row to the point of interest.
offsets = data - point_of_interest
distances = np.sqrt((offsets ** 2).sum(axis=1))

# Row positions of the k smallest distances.
k = 3
nearest_indices = np.argsort(distances)[:k]

# All rows as points, with the query point drawn as a red cross.
plt.scatter(data[:, 0], data[:, 1], label='Data')
plt.scatter(point_of_interest[0], point_of_interest[1], color='red', marker='x', label='Point of Interest')

# Label each of the k nearest rows with its distance, placed just above the marker.
for idx in nearest_indices:
    neighbor_xy = (data[idx, 0], data[idx, 1])
    plt.annotate(f'Distance: {distances[idx]:.2f}', neighbor_xy, textcoords="offset points", xytext=(0, 10), ha='center')

plt.title('K-Nearest Neighbors with Euclidean Distance')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend()
plt.show()

In [None]:
# 3D Plot
# Example Dataset Sheet 2 (Iris)
# Work on a plain NumPy matrix so that the positions returned by np.argsort index
# the data positionally; the previous Series-based lookups (x[idx]) were label
# based and only worked when df carried a default 0..n-1 index.
# NOTE(review): the row slice [:-1] is kept from the original, so the LAST ROW of
# df is excluded — presumably it is the unlabeled query row of the sheet; confirm.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
features = df[feature_names].iloc[:-1].to_numpy(dtype=float)
x, y, z, c = features.T
point_of_interest = [6.4, 3.2, 4, 0.2]

# Euclidean distance over all four features (petal_width is shown as colour,
# but it still contributes to the distance).
distances = np.sqrt(((features - point_of_interest) ** 2).sum(axis=1))

# positions of the k nearest rows
k = 3
nearest_indices = np.argsort(distances)[:k]

# visualize data
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# three features on the axes, the fourth (petal_width) encoded as colour
img = ax.scatter(x, y, z, c=c, cmap='viridis', marker='o', s=100)
fig.colorbar(img)

# annotate each of the k nearest neighbors with its distance
for idx in nearest_indices:
    ax.text(x[idx], y[idx], z[idx], f'Distance: {distances[idx]:.2f}', fontsize=12)

# mark the query point (only its first three coordinates can be drawn)
ax.scatter(point_of_interest[0], point_of_interest[1], point_of_interest[2], c='red', marker='x', s=100, label='Point of Interest')

ax.set_xlabel('sepal_length')
ax.set_ylabel('sepal_width')
ax.set_zlabel('petal_length')
ax.set_title('K-Nearest Neighbors with Euclidean Distance')
# Attach the legend to the 3D axes explicitly rather than to pyplot's "current" axes.
ax.legend()
plt.show()

## **PREPARING DATASET FOR TRAINING**

In [303]:
# 60% training data (random), 20% validation data, and 20% testing data
# A fixed random_state makes the shuffle (and therefore every score below)
# reproducible across kernel restarts.
shuffled = df.sample(frac=1, random_state=42)

# Slice with .iloc instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated.
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# 100% training data
# train = df.copy()

In [305]:
def scale_dataset(dataframe, oversample=False, scaler=None):
  """Scale the feature columns of `dataframe` and optionally oversample.

  The last column is treated as the label; every other column is a feature.

  Args:
    dataframe: pandas DataFrame whose last column holds the labels.
    oversample: when True, resample with RandomOverSampler after scaling.
    scaler: optional, already-fitted object exposing `transform(X)`. When
      given it is applied as-is (no refit), so a scaler fitted on the training
      split can be reused for validation/test data. When None (the default) a
      new StandardScaler is fitted on this dataframe's own features, which is
      the original behaviour.

  Returns:
    A tuple `(data, X, y)`: `X` is the scaled feature matrix, `y` the label
    array, and `data` is `X` with `y` appended as the last column.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # No scaler supplied: fit a fresh one on the data passed in.
    X = StandardScaler().fit_transform(X)
  else:
    # Reuse the caller's statistics instead of re-estimating them here.
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  # y becomes a single column so it can be stacked next to the features.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

## **TRAINING DATASET**

In [306]:
# Standardize every split with statistics learned from the TRAINING split only.
# Fitting a separate scaler per split (as before) puts validation/test features
# on a different scale from the one the model is trained on.
feature_columns = train.columns[:-1]
label_column = train.columns[-1]
feature_scaler = StandardScaler().fit(train[feature_columns].values)

X_valid = feature_scaler.transform(valid[feature_columns].values)
y_valid = valid[label_column].values
X_test = feature_scaler.transform(test[feature_columns].values)
y_test = test[label_column].values

# scale_dataset fits its own StandardScaler on the frame it receives; for the
# training split that is the same data feature_scaler was fitted on.
train, X_train, y_train = scale_dataset(train, oversample=False)

# Same (features | label) layout that scale_dataset returns as its first value.
valid = np.hstack((X_valid, np.reshape(y_valid, (-1, 1))))
test = np.hstack((X_test, np.reshape(y_test, (-1, 1))))

In [None]:
# Build a k-nearest-neighbours classifier with k = 3 and fit it on the scaled
# training features and their labels.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

In [308]:
# Predict a class label for every row of the test split.
y_pred = knn_model.predict(X_test)

## **TESTING AND SCORE**

In [None]:
# Report precision/recall/F1 per class. y_pred was produced from X_test, so it is
# compared against y_test. zero_division=0 reports 0 for a metric whose
# denominator is zero instead of emitting a warning.
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Classify one hand-written sample: a single row of four feature values.
# NOTE(review): knn_model was fitted on X_train, which scale_dataset standardizes
# with StandardScaler, while this row is passed in raw units — it probably needs
# the same scaling applied before predict(); confirm.
pred = np.array([[6.4, 3.2, 4.0, 0.2]])
predik = knn_model.predict(pred)
predik