In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Make the homework folder on the mounted Drive the working directory, so the
# relative "./data/..." path used when loading the dataset resolves against it.
os.chdir('/content/drive/MyDrive/ML/hw3-2023')

## Import packages

In [None]:
import sys
import pandas as pd

# Abort early when the interpreter is older than Python 3.
if sys.version_info.major < 3:
    raise Exception("Python 3 not detected.")
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
# from scipy.stats import multivariate_normal
from skimage.feature import hog, local_binary_pattern, corner_harris

## Load data

In [None]:
# Load the dataset archive from ./data and report the shape of each array in it.
data_name = "spam"
data = np.load("./data/{}-data.npz".format(data_name))
print("\nloaded %s data!" % data_name)
fields = ("test_data", "training_data", "training_labels")
for field in fields:
    field_shape = data[field].shape
    print(field, field_shape)


loaded spam data!
test_data (1000, 1931)
training_data (4172, 1931)
training_labels (4172,)


## Extract features

In [None]:
def extract_lbp_features(img, lbp_radius=1, lbp_point=8):
    """Return the normalized local-binary-pattern histogram of a 2-D image.

    :param img: 2-D grayscale image passed straight to ``local_binary_pattern``
    :param lbp_radius: radius of the circular neighbourhood
    :param lbp_point: number of neighbour points P sampled on the circle
    :return: 1-D histogram with ``2 ** lbp_point`` bins (256 for the default
        P=8), normalized as a density so the entries sum to 1

    The bin count is fixed by ``lbp_point`` instead of by the largest code that
    happens to occur in ``img``; otherwise the descriptor length would vary
    from image to image and the descriptors could not be stacked into a matrix.
    """
    lbp = local_binary_pattern(img, lbp_point, lbp_radius, 'default')
    # With method 'default' every code is a P-bit integer, i.e. 0 .. 2**P - 1.
    n_bins = 2 ** lbp_point
    hist, _ = np.histogram(lbp, density=True, bins=n_bins, range=(0, n_bins))
    return hist

def extract_hog_features(img):
    """Compute the HOG descriptor of ``img`` with ``hog``'s default parameters."""
    descriptor = hog(img)
    return descriptor


# def extract_features(data, width=28, height=28, is_grey_image=True):
#     image_descriptors = []
#     arr = np.array(data)
#     print(arr.shape)
#     for x in data:
#       if is_grey_image:
#         x = x.reshape(width, height)
#         # normalize
#         # x /= np.linalg.norm(x)
#         fd = []
#       fd = np.append(fd, extract_hog_features(x))
#       fd = np.append(fd, extract_lbp_features(x))
#       image_descriptors.append(fd)
#     return image_descriptors
def extract_features(data):
    """L2-normalize every sample (row) of ``data``.

    :param data: iterable of 1-D feature vectors, e.g. a 2-D array of shape
        (n_samples, n_features)
    :return: list with one float array per sample; rows whose norm is zero are
        returned as all-zero copies

    The input is never modified: each row is copied to a float array before
    scaling. (Dividing the row views in place would silently overwrite the
    caller's array and fails outright for integer-typed data.)
    """
    image_descriptors = []
    for x in data:
        row = np.array(x, dtype=float)  # private float copy of this sample
        norm = np.linalg.norm(row)
        if norm != 0:
            row = row / norm
        image_descriptors.append(row)
    return image_descriptors

In [None]:
# extract features from raw pixels
training_data = extract_features(data["training_data"])
test_data = extract_features(data["test_data"])

In [None]:
# Assemble one matrix whose first column is the label and whose remaining
# columns are the features of the corresponding training sample.
data_flat = np.asarray(training_data)
labels = data["training_labels"]
labels = labels.reshape(len(labels), -1)
dataset = np.concatenate((labels, data_flat), axis=-1)
print(dataset.shape)

(4172, 1932)


In [None]:
# split data
np.random.seed(113)
np.random.shuffle(dataset)
val_set = dataset[0:72]
train_set = dataset[72:]

## Define model

In [None]:
# define GDA(LDA/QDA) model
class GDA:
  """Gaussian discriminant analysis classifier.

  With ``mode="lda"`` all classes share one covariance matrix (LDA); any other
  mode value fits a separate covariance matrix per class (QDA). Class labels
  must be the integers ``0 .. num_classes - 1``.
  """

  def __init__(self, mode="lda", num_classes=2):
    """
    :param mode: "lda" for a shared covariance, anything else for per-class covariances
    :param num_classes: number of classes to fit (labels 0 .. num_classes - 1)
    """
    self.mode = mode
    self.num_classes = num_classes
    self._reset()

  def _reset(self):
    """Forget all fitted parameters so that train() can be called more than once."""
    self.mus = []
    self.covs = []
    self.prior_probs = []
    self.cov_invs, self.cov_logdets = [], []

  def train(self, train_set):
    """
    Estimate the class means, covariance matrices and priors.

    In LDA mode the shared covariance is the prior-weighted average of the
    class covariances, i.e. the pooled within-class covariance.

    :param train_set: 2-D array-like; column 0 holds the label of each sample,
        the remaining columns hold its features
    :raises ValueError: if some class has no samples in ``train_set``
    """
    train_set = np.asarray(train_set)
    total = train_set.shape[0]
    # Start from a clean state; without this a second call would keep stacking
    # onto the previous fit (and fail in QDA mode, where covs became an array).
    self._reset()
    class_ids = train_set[:, 0].astype(np.int64)
    for label in range(self.num_classes):
      print("Computing mean and covariance matrix for class {}".format(label))
      data_tmp = train_set[class_ids == label][:, 1:]
      count = data_tmp.shape[0]
      if count == 0:
        raise ValueError("no training samples for class {}".format(label))
      mu = np.mean(data_tmp, axis=0)
      centered = data_tmp - mu
      self.mus.append(mu)
      self.covs.append(centered.T @ centered / count)
      self.prior_probs.append(count / total)
    if self.mode=="lda":
      # Weight each class covariance by its share of the samples.
      self.cov = np.average(self.covs, axis=0, weights=self.prior_probs)
    else:
      self.covs = np.array(self.covs)
    self.get_invs_dets()

  @staticmethod
  def _ridge_strength(covs):
    """Return 0.001 times the smallest non-zero absolute entry of ``covs``."""
    magnitudes = np.abs(np.asarray(covs))
    nonzero = magnitudes[np.nonzero(magnitudes)]
    if nonzero.size == 0:
      raise ValueError("covariance is identically zero; cannot pick a regularization strength")
    return np.min(nonzero) * 0.001

  @staticmethod
  def _inv_and_logdet(cov, ridge):
    """Return (inverse, log|det|) of ``cov + ridge * I``."""
    new_cov = cov + np.eye(cov.shape[0]) * ridge
    cov_inv = np.linalg.inv(new_cov)
    sign, logabsdet = np.linalg.slogdet(new_cov)
    if sign == 0:
      raise np.linalg.LinAlgError("regularized covariance matrix is singular")
    # A covariance plus a positive ridge has a positive determinant, so a
    # negative sign can only be roundoff on a near-singular matrix. Use
    # log|det| as is; multiplying it by the sign would flip the term.
    return cov_inv, logabsdet

  def get_invs_dets(self):
    """
    Precompute the inverse and log-determinant of the regularized covariance(s).

    A tiny multiple of the identity is added before inverting. In QDA mode the
    same ridge strength, derived from all class covariances together, is used
    for every class.
    """
    if self.mode=="lda":
      ridge = self._ridge_strength(self.cov)
      self.cov_inv, self.cov_logdet = self._inv_and_logdet(self.cov, ridge)
    else:
      ridge = self._ridge_strength(self.covs)
      self.cov_invs, self.cov_logdets = [], []
      for cov in self.covs:
        cov_inv, cov_logdet = self._inv_and_logdet(cov, ridge)
        self.cov_invs.append(cov_inv)
        self.cov_logdets.append(cov_logdet)

  def calc_probs(self, x):
    """
    Score a single sample against every class.

    :param x: 1-D feature vector
    :return: array of length ``num_classes`` holding, per class, the Gaussian
        log-density plus log-prior up to a constant shared by all classes.
        These are unnormalized log-scores, not probabilities; only their
        ordering is meaningful.
    """
    x = np.asarray(x)
    probs = np.zeros(self.num_classes)
    for i in range(self.num_classes):
      if self.mode == "lda":
        cov_inv, cov_logdet = self.cov_inv, self.cov_logdet
      else:
        cov_inv, cov_logdet = self.cov_invs[i], self.cov_logdets[i]
      # Keep the offset 1-D so the quadratic form is a true scalar rather than
      # a (1, 1) array being squeezed into probs[i].
      bias = (x - self.mus[i]).ravel()
      probs[i] = -(bias @ cov_inv @ bias) / 2 - cov_logdet / 2 + np.log(self.prior_probs[i])
    return probs

  def predict(self, datas):
    """
    Predict a label for every row of ``datas``, printing a progress bar.

    :param datas: 2-D array-like, one sample per row
    :return: integer array with the predicted label of each sample
    """
    predict_labels = []
    datas = np.array(datas)
    total = datas.shape[0]
    for idx, sample in enumerate(datas):
      frac = (idx + 1) / total
      print('\rPredicting：{}{:.2f}%'.format('▉'*int(frac*50),(frac*100)), end='')
      predict_labels.append(np.argmax(self.calc_probs(sample)))
    return np.array(predict_labels, dtype=np.int64)

  def eval(self, val_set):
    """
    Return the accuracy of the model on a labelled set.

    :param val_set: 2-D array; column 0 holds the labels, the rest the features
    """
    print("Evaluating model...")
    labels = val_set[:, 0].astype(np.int64)
    datas = val_set[:, 1:]
    predict_labels = self.predict(datas)
    acc = accuracy_score(labels, predict_labels)
    return acc

In [None]:
def results_to_csv(y_test, name):
    """Write predictions to ``<name>.csv`` with columns ``Id`` and ``Category``.

    ``y_test`` is cast to int and the rows are numbered starting from 1.
    """
    frame = pd.DataFrame({'Category': y_test.astype(int)})
    frame.index = frame.index + 1  # ids start at 1 rather than 0
    out_path = name + '.csv'
    frame.to_csv(out_path, index_label='Id')

## Train model

In [None]:
# Any mode other than "lda" makes GDA keep one covariance matrix per class (QDA).
model = GDA("gda")
model.train(train_set)
# accuracy (not error rate) on the held-out validation split
total_acc = model.eval(val_set)
print(f"\nAcc: {total_acc*100}%")

Computing mean and covariance matrix for class 0
Computing mean and covariance matrix for class 1
4.0048990305143524e-13
Evaluating model...
Predicting：▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉100.00%
Acc: 84.72222222222221%


In [None]:
# Same experiment with a single covariance matrix shared by both classes (LDA).
model = GDA("lda")
model.train(train_set)
# accuracy (not error rate) on the held-out validation split
total_acc = model.eval(val_set)
print(f"\nAcc: {total_acc*100}%")

Computing mean and covariance matrix for class 0
Computing mean and covariance matrix for class 1
Evaluating model...
Predicting：▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉100.00%
Acc: 93.05555555555556%


In [None]:
# Refit LDA on the whole labelled dataset (training and validation rows together).
model = GDA("lda")
model.train(dataset)

Computing mean and covariance matrix for class 0
Computing mean and covariance matrix for class 1


In [None]:
# Predict the test set and write the submission CSV.
# NOTE: this rebinds `labels`, which earlier held the training-label column.
labels = model.predict(np.array(test_data))
results_to_csv(labels, "lda_submission_spam")

Predicting：▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉100.00%