In [34]:
import pandas as pd
import numpy as np
from random import uniform
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs, make_classification

In [35]:
def lvq_fit(train, target, lrate, b, max_epoch):
  """Fit a Learning Vector Quantization model with one prototype per class.

  The first sample of every distinct class seeds that class's prototype. The
  remaining samples are then presented in their original order once per epoch:
  the prototype nearest to a sample (squared Euclidean distance) is moved
  toward the sample when their classes match and away from it otherwise.

  Parameters
  ----------
  train : array-like of shape (n_samples, n_features)
      Feature matrix (DataFrame, ndarray or nested sequence); converted to
      float64.
  target : array-like of shape (n_samples,)
      Class label of each row of ``train``.
  lrate : float
      Learning rate used during the first epoch.
  b : float
      Factor the learning rate is multiplied by after every epoch.
  max_epoch : int
      Number of passes over the non-seed samples.

  Returns
  -------
  weight : numpy.ndarray of shape (n_classes, n_features)
      Trained prototypes, ordered like ``label``.
  label : numpy.ndarray of shape (n_classes,)
      Sorted distinct class labels.

  Raises
  ------
  ValueError
      If ``train`` and ``target`` do not contain the same number of samples.
  """
  train = np.asarray(train, dtype=np.float64)
  target = np.asarray(target).ravel()
  if len(train) != len(target):
    raise ValueError(
      'train and target must have the same number of samples, got '
      f'{len(train)} and {len(target)}'
    )

  # First occurrence of each class becomes that class's initial prototype.
  # Fancy indexing returns a copy, so updating `weight` never touches `train`.
  label, train_idx = np.unique(target, return_index=True)
  weight = train[train_idx]

  # Seed rows are excluded from training. A boolean mask keeps this correct
  # even when no rows remain (every sample is a seed).
  is_seed = np.zeros(len(train), dtype=bool)
  is_seed[train_idx] = True
  samples = train[~is_seed]
  sample_labels = target[~is_seed]

  epoch = 0
  while epoch < max_epoch:
    for x, x_label in zip(samples, sample_labels):
      # Squared Euclidean distance from x to every prototype at once.
      distance = np.sum((weight - x) ** 2, axis=1)
      winner = np.argmin(distance)
      # Attract the winner on a class match, repel it on a mismatch.
      sign = 1 if x_label == label[winner] else -1
      weight[winner] += sign * lrate * (x - weight[winner])
    # Decay the learning rate once per full pass over the samples.
    lrate *= b
    epoch += 1

  return weight, label

In [36]:
def lvq_predict(X, model):
  """Label each row of ``X`` with the class of its closest prototype.

  ``model`` is the ``(prototypes, labels)`` pair; ``X`` is converted to a
  float64 array. Closeness is the squared Euclidean distance, and ties go to
  the prototype that comes first. Returns a list with one label per row.
  """
  prototypes, classes = model
  samples = np.array(X, dtype=np.float64)

  def nearest_class(sample):
    # Squared distance from this sample to every prototype, in prototype order.
    squared = [sum((proto - sample) ** 2) for proto in prototypes]
    return classes[np.argmin(squared)]

  return [nearest_class(sample) for sample in samples]

In [37]:
def calc_accuracy(a, b):
    """Return the fraction of positions at which two label sequences agree.

    Labels are compared by value. They are deliberately not re-encoded per
    array with ``np.unique(..., return_inverse=True)``: when the two inputs
    contain different sets of labels (for example predictions that are all
    ``1`` against a truth holding ``0`` and ``1``) the independent codes no
    longer correspond and the score is wrong.

    Parameters
    ----------
    a, b : array-like
        Predicted and reference labels; flattened before comparison.

    Returns
    -------
    float
        Share of matching positions in ``[0, 1]``; ``nan`` for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    a = np.asarray(a).ravel()
    b = np.asarray(b).ravel()
    if a.shape != b.shape:
        raise ValueError(
            f'label sequences differ in length: {a.size} vs {b.size}'
        )
    if a.size == 0:
        # No labels to score; avoid a 0/0 division warning.
        return float('nan')
    return np.sum(a == b) / len(a)


In [38]:
# Read the raw table from the CSV; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='dataset/water_potability.csv')

In [39]:
# Drop every row that has at least one missing value, then renumber the
# surviving rows from 0 so the index has no gaps.
df_cleaned = (
    data
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)
df_cleaned.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
1,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0
2,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
3,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,2.672989,0
4,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425,0


In [45]:
# Number of rows per class in the cleaned data.
df_cleaned.loc[:, 'Potability'].value_counts()

Potability
0    1200
1     811
Name: count, dtype: int64

In [40]:

# Features are every column except the last; the last column is the class label.
X = df_cleaned.iloc[:, :-1]
y = df_cleaned.iloc[:, -1]

# shuffle=False keeps the file's row order, so the test set is the final 20% of rows.
# NOTE(review): confirm the CSV is not ordered by class; if it is, the train and
# test class mixes will differ.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Distinct training labels and the position of each label's first occurrence.
# NOTE(review): neither name is used by the cells visible here — confirm they are still needed.
label, train_idx = np.unique(y_train, return_index=True)

In [42]:
colors = 'rgbcmyk'

# Train on the training split, then score the predictions for the held-out rows.
# NOTE(review): the features are passed unscaled, so the squared Euclidean
# distance is dominated by large-magnitude columns (Solids is in the tens of
# thousands in the head() preview) — consider standardizing before fitting.
model = lvq_fit(
    X_train,
    y_train,
    lrate=0.5,
    b=0.8,
    max_epoch=50,
)
output = lvq_predict(X_test, model)
accuracy = calc_accuracy(output, y_test)

print('Accuracy:', accuracy)

Accuracy: 0.4987593052109181
