In [110]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from tabulate import tabulate
import pandas as pd
import numpy as np
import requests
import random
import math

In [111]:
def get_data(url, timeout=30):
  """Download `url` over HTTP and return the response body as text.

  Args:
    url: Address of the raw file to fetch.
    timeout: Seconds to wait for the server before giving up. Forwarded to
      `requests.get`; without it a stalled connection would block the
      notebook indefinitely.

  Returns:
    The decoded response body, or an empty string when the server answers
    with any status code other than 200.

  Raises:
    requests.exceptions.RequestException: If the request cannot be completed
      (connection failure, timeout, ...). These are not swallowed here.
  """
  response = requests.get(url, timeout=timeout)

  # Callers treat "" as "nothing downloaded", so keep that contract.
  if response.status_code != 200:
    return ""

  return response.text

In [112]:
# Download the CSV and break it into text lines.
csv_text = get_data('https://raw.githubusercontent.com/AbhinayWorkSpace/802_project/main/supervised.csv')
lines = csv_text.strip().split('\n')

# The first line carries the column names; rstrip drops trailing whitespace.
headers = [name.rstrip() for name in lines[0].split(',')]

data_rows = [line.split(',') for line in lines[1:]]

# Every remaining cell is parsed as a float.
cleaned = [[float(cell.rstrip()) for cell in row] for row in data_rows]

# Seed the stdlib RNG so the shuffled row order is the same on every run.
random.seed(802)
random.shuffle(cleaned)

df = pd.DataFrame(cleaned, columns=headers)
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,1648.0,0.0,0.7,0.0,6.0,1.0,64.0,0.8,170.0,4.0,...,1153.0,1876.0,610.0,8.0,1.0,6.0,1.0,1.0,0.0,1.0
1,935.0,1.0,0.5,1.0,3.0,1.0,58.0,0.1,155.0,5.0,...,639.0,1087.0,2473.0,10.0,4.0,9.0,1.0,0.0,1.0,2.0
2,793.0,0.0,1.2,1.0,1.0,1.0,38.0,0.6,176.0,4.0,...,1226.0,1815.0,3139.0,6.0,1.0,12.0,1.0,1.0,1.0,3.0
3,705.0,1.0,0.5,0.0,5.0,1.0,57.0,0.9,155.0,6.0,...,1114.0,1374.0,1509.0,15.0,11.0,18.0,1.0,1.0,1.0,1.0
4,1456.0,1.0,0.5,1.0,7.0,0.0,7.0,0.4,105.0,5.0,...,823.0,1104.0,1587.0,6.0,5.0,20.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1462.0,1.0,1.7,1.0,1.0,0.0,25.0,0.2,182.0,7.0,...,91.0,1478.0,824.0,15.0,5.0,13.0,1.0,0.0,1.0,0.0
1996,1640.0,1.0,0.7,0.0,10.0,1.0,21.0,0.8,169.0,7.0,...,589.0,1301.0,337.0,16.0,3.0,5.0,1.0,1.0,0.0,0.0
1997,618.0,0.0,0.6,0.0,8.0,0.0,16.0,0.2,143.0,1.0,...,1105.0,1602.0,2262.0,16.0,15.0,4.0,0.0,1.0,0.0,2.0
1998,508.0,0.0,0.8,0.0,7.0,1.0,42.0,0.3,94.0,1.0,...,39.0,557.0,663.0,13.0,12.0,7.0,1.0,0.0,0.0,0.0


In [113]:
def results(df, test_size=0.20, n_estimators=1000, random_state=802):
  """Train a random forest to predict `price_range` and report how it did.

  Args:
    df: DataFrame holding the feature columns plus a `price_range` label
      column. The label is looked up by name, so its position is irrelevant.
    test_size: Fraction of rows held out for evaluation.
    n_estimators: Number of trees in the forest.
    random_state: Seed passed to both the train/test split and the forest so
      repeated runs give the same numbers. Pass None for unseeded behaviour.

  Returns:
    A 3-tuple `(confusion_matrix, accuracy, importances)` where
    `importances` is a Series of feature importances indexed by feature name
    and sorted in descending order. When `df` is not a DataFrame or has no
    `price_range` column the tuple is `([], 0, <empty Series>)`, so callers
    can always unpack three values.
  """
  # Explicit check instead of a bare `except:` around an unused groupby.
  if not isinstance(df, pd.DataFrame) or "price_range" not in df.columns:
    return [], 0, pd.Series(dtype="float64")

  # Select by name so features and importance labels can never disagree.
  X = df.drop(columns="price_range")
  y = df["price_range"]

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, stratify=y, random_state=random_state)

  clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  importances = pd.Series(clf.feature_importances_, index=X.columns.tolist())
  importances = importances.sort_values(ascending=False)

  return confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), importances

In [114]:
# Train and score the random forest on the full DataFrame; the returned tuple
# (confusion matrix, accuracy, feature importances) is shown as the cell output.
results(df)

(array([[94,  6,  0,  0],
        [ 7, 79, 14,  0],
        [ 0, 14, 82,  4],
        [ 0,  0, 16, 84]]),
 0.8475,
 ram              0.482723
 battery_power    0.072541
 px_width         0.057053
 px_height        0.055544
 mobile_wt        0.039111
 int_memory       0.036890
 talk_time        0.030933
 pc               0.029108
 sc_h             0.028742
 sc_w             0.028378
 clock_speed      0.027984
 fc               0.024892
 m_dep            0.024737
 n_cores          0.022551
 dual_sim         0.006954
 touch_screen     0.006883
 four_g           0.006634
 blue             0.006570
 wifi             0.006489
 three_g          0.005283
 dtype: float64)

In [115]:
def pca(df):
  """Project the feature columns of `df` onto their top two principal axes.

  Every column except `price_range` is treated as a feature. The principal
  axes are the eigenvectors of the scatter matrix of the mean-centered
  features with the two largest eigenvalues.

  Args:
    df: DataFrame of numeric feature columns plus a `price_range` column.

  Returns:
    A new DataFrame with a fresh 0..n-1 index and three columns: `pca1`,
    `pca2` (the projections) and `price_range` (copied row-for-row from
    `df`). The sign of each projected column is arbitrary, as with any
    eigenvector-based PCA.

  Raises:
    KeyError: If `df` has no `price_range` column.
  """
  # Pick features by name rather than assuming the label is the last column.
  features = df.drop(columns="price_range").to_numpy(dtype=float)

  centered = features - features.mean(axis=0)
  scatter = centered.T @ centered

  # The scatter matrix is symmetric, so use eigh: it guarantees real
  # eigenvalues/eigenvectors, whereas eig may return complex output.
  eigenvalues, eigenvectors = np.linalg.eigh(scatter)
  sorted_indices = np.argsort(eigenvalues)[::-1]
  top_two = eigenvectors[:, sorted_indices[:2]]

  # NOTE: rows are projected without subtracting the mean, matching the
  # previous implementation; centering would only shift each column by a
  # constant.
  projected = features @ top_two

  new_pca = pd.DataFrame(projected, columns=["pca1", "pca2"])
  # Copy labels positionally; assigning the Series directly would align on
  # index and produce NaN/misplaced labels when df's index is not 0..n-1.
  new_pca["price_range"] = df["price_range"].to_numpy()

  return new_pca

# Repeat the random-forest evaluation using only the two columns produced by
# pca(df) (pca1, pca2) as features.
results(pca(df))

(array([[89, 11,  0,  0],
        [ 7, 79, 14,  0],
        [ 0, 24, 66, 10],
        [ 0,  0, 17, 83]]),
 0.7925,
 pca1    0.764501
 pca2    0.235499
 dtype: float64)