# GDA Implementation.

Implement the Gaussian Discriminant Analysis (GDA) learning algorithm following the steps as discussed in class.

INSTRUCTION: Rename your notebook as: <br>
`firstName_LastName_Live_coding_GDA.ipynb`.

Notes: 
* Do not use any built-in functions to complete a task;
* Do not import additional libraries.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [2]:
# Generate data
def generate_data():
  x, y = make_classification(n_samples= 1000, n_features=3, n_redundant=0, 
                           n_informative=3, random_state=1, 
                           n_clusters_per_class=1)
  
  return x,y

x,y= generate_data()
print(x.shape, y.shape)

(1000, 3) (1000,)


In [3]:
def split_data(x,y, train_size= 0.8):
    # shuffle the data to randomize the train/test split
    
    # stack the data
    if y.ndim == 1:
      y = y.reshape(-1,1)
    data = np.hstack((x,y))

    # shuffle the dataset
    np.random.shuffle(data)

    n = x.shape[0]

    # split the data
    x_train, y_train = data[0 : round(n*train_size), :-1], data[0 : round(n*train_size), -1]
    x_test, y_test  =  data[round(n*train_size) : , :-1],  data[round(n*train_size) : , -1]

    return x_train, x_test, y_train, y_test

In [4]:
X_train, X_test, y_train, y_test= split_data(x,y, train_size= 0.8) # split your data into x_train, x_test, y_train, y_test
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(800, 3) (800,) (200, 3) (200,)


In [5]:
def covariance(x, mu):

  # Easy way: cov= np.cov(x, rowvar=0) but do not use it. One can use it to assess his/her result.
  num_rows, num_cols = x.shape
  covariance_matrix = np.zeros((num_cols, num_cols))

  for i in range(num_cols):
    for j in range(num_cols):

      covariance_matrix[i][j] = (1/(num_rows-1))* np.sum((x.T[i] - mu[i]) @ (x.T[j] - mu[j]).T)
  
  
  return covariance_matrix


In [6]:
class GDA:
  def __init__(self):
    ## set mu, phi and sigma to None
    self.mu = None
    self.phi = None
    self.sigma = None
    
  def fit(self,x,y):

    k = len(np.unique(y)) # Number of class.
    d = x.shape[1]  # input dim
    m = x.shape[0] # Number of examples.
    diff_classes = np.unique(y)
    
    ## Initialize mu, phi and sigma
    self.mu = np.zeros((k,d))
    self.phi = np.zeros((k,1))
    self.sigma = np.zeros((k,d,d))

    # for lab in range(k):
      
    #   self.phi[lab] = np.sum(lab == y)/m # k-dimension


    #   self.mu = np.mean(x[lab == y], axis = 0)#: kxd, i.e., each row contains an individual class mu.
    #   self.sigma[lab] = covariance()  #: kxdxd, i.e., each row contains an individual class sigma.

    ## START THE LEARNING: estimate mu, phi and sigma.

    for i, item in enumerate(diff_classes):
      bool_item = [(y == item)]

      self.phi[i] = len(y[bool_item])/m

      for j in range(d):
        self.mu[i,j] = np.sum((x[bool_item].T)[j])/x[bool_item].shape[0]

      self.sigma[i] = covariance(x, self.mu[i])


  def predict_proba(self,x):
    # reshape or flatt x.
    #x 
    d = x.shape[1]
    k_class = self.mu.shape[0]  # Number of classes we have in our case it's k = 2
    
    ## START THE LEARNING: estimate mu, phi and sigma.

    log_y_probas = np.zeros((x.shape[0], k_class))

    for k in range(k_class):
      det = np.linalg.det(self.sigma[k])
      inverse = np.linalg.inv(self.sigma[k])

      for row in range(x.shape[0]):
        #y_probas[row, k] = ( 1 / ((2*np.pi)**(d/2)) * ((det)**0.5) ) * np.exp( (-0.5) * (x[row] - self.mu[k]).T @ inverse @ (x[row] - self.mu[k]) ) * self.phi[k]
        log_y_probas[row, k] = np.log(( 1 / ((2*np.pi)**(d/2)) * ((det)**0.5) ) * np.exp( (-0.5) * (x[row] - self.mu[k]).T @ inverse @ (x[row] - self.mu[k]) ) )  + np.log( self.phi[k])
                               
    return log_y_probas


  def predict(self,x):
    y_pred = np.zeros(x.shape[0])

    for i in range(y_pred.shape[0]):
      y_pred[i] = np.argmax(self.predict_proba(x)[i])

    return y_pred
  
  def accuracy(self, y, ypreds):
    acc = np.mean(y == ypreds)*100
    
    return acc

In [7]:
model= GDA()
model.fit(X_train,y_train)

  self.phi[i] = len(y[bool_item])/m
  self.mu[i,j] = np.sum((x[bool_item].T)[j])/x[bool_item].shape[0]


In [8]:
yproba= model.predict_proba(X_test)
yproba

array([[ -4.48346246,  -3.62323111],
       [ -4.1754734 ,  -5.05032693],
       [-10.89684335,  -8.77582176],
       [ -3.90162244,  -3.00782902],
       [ -3.48402705,  -4.71350077],
       [ -4.22783531,  -5.13019001],
       [ -3.72279318,  -4.52835331],
       [ -3.40902591,  -4.7812414 ],
       [ -3.83922415,  -3.58857503],
       [ -3.15258126,  -3.01587987],
       [ -5.49233906,  -4.06037284],
       [ -4.91618898,  -3.15038196],
       [ -4.55850336,  -3.45661375],
       [ -6.18157508,  -4.83612004],
       [ -5.16070285,  -6.33916817],
       [ -4.11380779,  -5.50625639],
       [ -2.844148  ,  -3.77967766],
       [ -3.25972043,  -3.44729532],
       [ -4.6560945 ,  -3.70570389],
       [ -3.27177335,  -2.87845454],
       [ -3.5770731 ,  -4.25635429],
       [ -2.9018006 ,  -3.98368071],
       [ -6.16643634,  -3.9792671 ],
       [ -4.94051075,  -3.58248874],
       [ -5.1402838 ,  -4.02255656],
       [ -3.04429773,  -3.4979234 ],
       [ -3.04169623,  -3.5363051 ],
 

In [9]:
ypreds= model.predict(X_test)
ypreds


array([1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0.,
       1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
       0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1.,
       1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.])

In [10]:
model.accuracy(y_test, ypreds)

95.0