In [95]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from tabulate import tabulate
import pandas as pd
import numpy as np
import requests
import random
import math

In [96]:
def get_data(url):
  raw_file_url = url

  response = requests.get(raw_file_url)

  if response.status_code != 200:
    return ""

  return response.text

In [97]:

tmp = get_data('https://raw.githubusercontent.com/AbhinayWorkSpace/802_project/main/supervised.csv')
lines = tmp.strip().split('\n')

headers = lines[0].split(',')
headers = [x.rstrip() for x in headers]

data_rows = [line.split(',') for line in lines[1:]]

cleaned = []
for line in data_rows:
  tmp = []
  for x in line:
    x = x.rstrip()
    tmp.append(float(x))
  cleaned.append(tmp)

random.seed(902)
random.shuffle(cleaned)

df = pd.DataFrame(cleaned, columns=headers)
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,1454.0,1.0,1.6,0.0,6.0,1.0,21.0,0.2,160.0,4.0,...,186.0,1100.0,719.0,14.0,10.0,10.0,1.0,1.0,1.0,0.0
1,1866.0,1.0,2.5,1.0,3.0,1.0,47.0,0.8,89.0,5.0,...,358.0,1782.0,1444.0,7.0,4.0,20.0,1.0,0.0,0.0,1.0
2,1770.0,0.0,2.3,1.0,0.0,0.0,7.0,0.8,128.0,7.0,...,886.0,1552.0,1480.0,6.0,1.0,17.0,1.0,0.0,0.0,1.0
3,1896.0,0.0,0.5,1.0,8.0,1.0,7.0,0.4,141.0,2.0,...,447.0,794.0,3684.0,9.0,7.0,18.0,1.0,0.0,0.0,3.0
4,1183.0,1.0,0.5,1.0,3.0,1.0,48.0,0.1,120.0,8.0,...,268.0,1010.0,1152.0,8.0,3.0,3.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1261.0,1.0,0.5,1.0,0.0,1.0,11.0,0.2,90.0,4.0,...,858.0,1591.0,348.0,14.0,9.0,14.0,1.0,0.0,1.0,0.0
1996,1387.0,0.0,0.5,0.0,0.0,1.0,61.0,0.5,98.0,3.0,...,785.0,1151.0,629.0,5.0,3.0,11.0,1.0,1.0,0.0,0.0
1997,871.0,0.0,0.6,0.0,2.0,0.0,52.0,0.1,178.0,3.0,...,194.0,1437.0,437.0,14.0,7.0,17.0,1.0,0.0,0.0,0.0
1998,1507.0,1.0,0.9,1.0,0.0,1.0,42.0,0.4,123.0,5.0,...,682.0,888.0,1486.0,15.0,3.0,19.0,1.0,0.0,0.0,1.0


In [98]:
def classifier(df, k, x, training):
    l = len(df.axes[1]) - 1
    dist = []

    for _, val in training[0].iterrows():
        tmp = np.zeros((l, 1))
        for idx in range(len(val.values)):
            if idx != l:
                tmp[idx] = val.values[idx]
        dist.append([val.values[l], np.linalg.norm(x - tmp)])

    for _, val in training[1].iterrows():
        tmp = np.zeros((l, 1))
        for idx in range(len(val.values)):
            if idx != l:
                tmp[idx] = val.values[idx]
        dist.append([val.values[l], np.linalg.norm(x - tmp)])

    for _, val in training[2].iterrows():
        tmp = np.zeros((l, 1))
        for idx in range(len(val.values)):
            if idx != l:
                tmp[idx] = val.values[idx]
        dist.append([val.values[l], np.linalg.norm(x - tmp)])

    for _, val in training[3].iterrows():
        tmp = np.zeros((l, 1))
        for idx in range(len(val.values)):
            if idx != l:
                tmp[idx] = val.values[idx]
        dist.append([val.values[l], np.linalg.norm(x - tmp)])

    dist.sort(key=lambda x: x[1])
    top = dist[:k]

    dt = {0: 0, 1: 0, 2: 0, 3: 0}

    for cl, l in top:
        dt[cl] += 1

    for ky, vl in dt.items():
        dt[ky] /= k

    ans = [[ky, vl] for ky, vl in dt.items()]
    ans.sort(key=lambda x: x[1])

    return ans[-1][0] #returns class 1/2/3


In [99]:
def results(k, df):

  actual = []
  predict = []

  try:
    classes = df.groupby("price_range")
  except:
    return [], 0

  class0 = classes.get_group(0)
  class1 = classes.get_group(1)
  class2 = classes.get_group(2)
  class3 = classes.get_group(3)

  bound = int(len(class0) * 0.8)
  train0 = class0[:bound]
  test0 = class0[bound:]

  bound = int(len(class1) * 0.8)
  train1 = class1[:bound]
  test1 = class1[bound:]

  bound = int(len(class2) * 0.8)
  train2 = class2[:bound]
  test2 = class2[bound:]

  bound = int(len(class3) * 0.8)
  train3 = class3[:bound]
  test3 = class3[bound:]

  for x in test0["price_range"].values:
    actual.append(x)
  for x in test1["price_range"].values:
    actual.append(x)
  for x in test2["price_range"].values:
    actual.append(x)
  for x in test3["price_range"].values:
    actual.append(x)

  training_data = [train0, train1, train2, train3]

  for _, val in test0.iterrows():
    l = len(val) - 1
    tmp = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        tmp[idx] = val.values[idx]
    predict.append(classifier(df, k, tmp, training_data))

  for _, val in test1.iterrows():
    l = len(val) - 1
    tmp = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        tmp[idx] = val.values[idx]
    predict.append(classifier(df, k, tmp, training_data))

  for _, val in test2.iterrows():
    l = len(val) - 1
    tmp = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        tmp[idx] = val.values[idx]
    predict.append(classifier(df, k, tmp, training_data))

  for _, val in test3.iterrows():
    l = len(val) - 1
    tmp = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        tmp[idx] = val.values[idx]
    predict.append(classifier(df, k, tmp, training_data))

  cf = confusion_matrix(actual, predict)
  return cf, (len(test1) - cf[0][0] + len(test2) - cf[1][1]+ len(test3) - cf[2][2])/(len(test1) + len(test2) + len(test3))

In [100]:
results(10, df[['battery_power',
   'clock_speed',
   'dual_sim',
   'int_memory',
   'mobile_wt',
   'px_height',
   'px_width',
   'ram',
   'wifi',
   'price_range']])

(array([[100,   0,   0,   0],
        [  6,  91,   3,   0],
        [  0,  10,  84,   6],
        [  0,   0,   2,  98]]),
 0.08333333333333333)

In [101]:
results(0, None)[1]

0

In [102]:
def pca(df):
  l = len(df.axes[1]) - 1

  mean = np.zeros((l, 1))
  for _, val in df.iterrows():
    for idx in range(len(val.values)):
      if idx != l:
        mean[idx] += val.values[idx]
  mean /= len(df)

  scatter = np.zeros((l, l))
  for _, val in df.iterrows():
    x_k = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x_k[idx] = val.values[idx]

    scatter += (x_k - mean) @ (x_k - mean).T

  eigenvalues, eigenvectors = np.linalg.eig(scatter)
  sorted_indices = np.argsort(eigenvalues)[::-1]
  sorted_eigenvalues = eigenvalues[sorted_indices]
  sorted_eigenvectors = eigenvectors[:, sorted_indices]

  pca1 = sorted_eigenvectors[:, 0]
  pca1 = np.reshape(pca1, (l, 1))

  pca2 = sorted_eigenvectors[:, 1]
  pca2 = np.reshape(pca2, (l, 1))

  tmp = []
  for _, val in df.iterrows():
    x_k = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x_k[idx] = val.values[idx]

    val1 = pca1.T @ x_k
    val2 = pca2.T @ x_k

    tmp.append((val1[0][0], val2[0][0]))

  new_pca = pd.DataFrame(tmp, columns=["pca1", "pca2"])
  new_pca["price_range"] = df["price_range"]

  return new_pca

results(10, pca(df))

(array([[91,  9,  0,  0],
        [16, 76,  8,  0],
        [ 0, 21, 57, 22],
        [ 0,  0,  9, 91]]),
 0.25333333333333335)

In [103]:
def mda(df):
  l = len(df.axes[1]) - 1

  tmp = df.groupby("price_range")

  m_1 = np.zeros((l, 1))
  for _, val in tmp.get_group(0).iterrows():
    for idx in range(len(val.values)):
      if idx != l:
        m_1[idx] += val.values[idx]
  m_1 /= len(tmp.get_group(0))

  m_2 = np.zeros((l, 1))
  for _, val in tmp.get_group(1).iterrows():
    for idx in range(len(val.values)):
      if idx != l:
        m_2[idx] += val.values[idx]
  m_2 /= len(tmp.get_group(1))

  m_3 = np.zeros((l, 1))
  for _, val in tmp.get_group(2).iterrows():
    for idx in range(len(val.values)):
      if idx != l:
        m_3[idx] += val.values[idx]
  m_3 /= len(tmp.get_group(2))

  m_4 = np.zeros((l, 1))
  for _, val in tmp.get_group(3).iterrows():
    for idx in range(len(val.values)):
      if idx != l:
        m_4[idx] += val.values[idx]
  m_4 /= len(tmp.get_group(3))

  m = np.zeros((l, 1))
  m = len(tmp.get_group(0)) * m_1 + len(tmp.get_group(1)) * m_2 + len(tmp.get_group(2)) * m_3 + len(tmp.get_group(3)) * m_4
  m /= len(df)

  s_1 = np.zeros((l, l))
  for _, val in tmp.get_group(0).iterrows():
    x = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x[idx] = val.values[idx]

    s_1 += (x - m_1) @ (x - m_1).T

  s_2 = np.zeros((l, l))
  for _, val in tmp.get_group(1).iterrows():
    x = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x[idx] = val.values[idx]

    s_2 += (x - m_2) @ (x - m_2).T

  s_3 = np.zeros((l, l))
  for _, val in tmp.get_group(2).iterrows():
    x = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x[idx] = val.values[idx]

    s_3 += (x - m_3) @ (x - m_3).T

  s_4 = np.zeros((l, l))
  for _, val in tmp.get_group(3).iterrows():
    x = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x[idx] = val.values[idx]

    s_4 += (x - m_4) @ (x - m_4).T

  s_w = s_1 + s_2 + s_3 + s_4

  s_1 = np.zeros((l, l))
  s_1 = len(tmp.get_group(0)) * (m_1 - m) @ (m_1 - m).T

  s_2 = np.zeros((l, l))
  s_2 = len(tmp.get_group(1)) * (m_2 - m) @ (m_2 - m).T

  s_3 = np.zeros((l, l))
  s_3 = len(tmp.get_group(2)) * (m_3 - m) @ (m_3 - m).T

  s_4 = np.zeros((l, l))
  s_4 = len(tmp.get_group(3)) * (m_4 - m) @ (m_4 - m).T

  s_b = s_1 + s_2 + s_3 + s_4

  s = np.linalg.inv(s_w) @ s_b

  eigenvalues, eigenvectors = np.linalg.eig(s)
  sorted_indices = np.argsort(eigenvalues)[::-1]
  sorted_eigenvalues = eigenvalues[sorted_indices]
  sorted_eigenvectors = eigenvectors[:, sorted_indices]

  mda1 = sorted_eigenvectors[:, 0]
  mda1 = np.reshape(mda1, (l, 1))

  mda2 = sorted_eigenvectors[:, 1]
  mda2 = np.reshape(mda2, (l, 1))

  tmp = []
  for _, val in df.iterrows():
    x_k = np.zeros((l, 1))
    for idx in range(len(val.values)):
      if idx != l:
        x_k[idx] = val.values[idx]

    val1 = mda1.T @ x_k
    val2 = mda2.T @ x_k

    tmp.append((val1[0][0], val2[0][0]))

  new_mda = pd.DataFrame(tmp, columns=["mda1", "mda2"])
  new_mda["price_range"] = df["price_range"]

  return new_mda

results(10, mda(df))

  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]
  tmp[idx] = val.values[idx]


(array([[97,  3,  0,  0],
        [ 1, 95,  4,  0],
        [ 0,  6, 91,  3],
        [ 0,  0,  4, 96]]),
 0.056666666666666664)