# GDA Implementation.

Implement the Gaussian Discriminant Analysis (GDA) learning algorithm following the steps as discussed in class.

INSTRUCTION: Rename your notebook as: <br>
`firstName_LastName_Live_coding_GDA.ipynb`.

Notes: 
* Do not use any built-in functions to complete a task;
* Do not import additional libraries.

In [205]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [206]:
# Generate data
def generate_data():
  x, y = make_classification(n_samples= 1000, n_features=3, n_redundant=0, 
                           n_informative=3, random_state=1, 
                           n_clusters_per_class=1)
  
  return x,y

x,y= generate_data() # get data
print(x.shape, y.shape)

(1000, 3) (1000,)


In [207]:
def split_data(x,y, train_size= 0.8):
    # shuffle the data to randomize the train/test split
    indices=np.arange(x.shape[0])
    idx=np.random.permutation(indices)
    x=x[idx]
    y=y[idx]
    size=round(train_size*x.shape[0])
    X_train=x[:size,:] 
    y_train=y[:size]
    X_test=x[size:,:]
    y_test=y[size:]

    return X_train, X_test, y_train, y_test


In [212]:
X_train, X_test, y_train, y_test= split_data(x,y, train_size= 0.8) # split your data into x_train, x_test, y_train, y_test
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(800, 3) (800,) (200, 3) (200,)


In [217]:
def covariance(x,mu):
  N,d=x.shape
  K=mu.shape[0]
  sigma=np.zeros((d,d))
  for k in range(K):
    for i in range (d):
      for j in range(d):
        sigma[i,j]=(1/(N-1))*np.sum((x[:,i]-mu[i])*(x[:,j]-mu[j]))
  return sigma
  # Easy way: cov= np.cov(x, rowvar=0) but do not use it. One can use it to assess his/her result.
  

In [193]:
covariance(x,x.mean(axis=0))

array([[1.84495325, 0.02790646, 1.00137533],
       [0.02790646, 1.00170721, 0.05539176],
       [1.00137533, 0.05539176, 1.74832   ]])

In [194]:
np.cov(x,rowvar=0)

array([[1.84495325, 0.02790646, 1.00137533],
       [0.02790646, 1.00170721, 0.05539176],
       [1.00137533, 0.05539176, 1.74832   ]])

In [47]:
x.shape

(1000, 3)

In [233]:
from math import pi
class GDA:
  def __init__(self):
    ## set mu, phi and sigma to None
    self.mu=None
    self.phi=None
    self.sigma=None
    
  def fit(self,x,y):
    k=np.unique(y).shape[0] # Number of class.
    d=x.shape[1]  # input dim
    m= x.shape[0] # Number of examples.
    
    ## Initialize mu, phi and sigma
    self.mu=np.zeros((k,d))#: kxd, i.e., each row contains an individual class mu.
    self.sigma=np.zeros((k,d,d))#: kxdxd, i.e., each row contains an individual class sigma.
    self.phi= np.zeros(k)# d-dimension

    # let us compute phi
    for label in range(k):
      # choose the rows that belong to the k class
      #let us compute phi for the class label 
      self.phi[label]=sum(label==y)/x.shape[0]
      #let us compute mu for the class label
      self.mu[label]= np.mean(x[label==y],axis=0)
      #let us compute sigma for the class label
      self.sigma[label]= covariance(x[label==y],self.mu[label])
  
    

  def predict_proba(self,x):
    # reshape or flatt x.
    #x= x.flatten()
    n=x.shape[0]
    d= x.shape[1]
    k_class= self.mu.shape[0] # Number of classes we have in our case it's k = 2
    score=np.zeros((n,k_class))
    
    ## START THE LEARNING: estimate mu, phi and sigma.
    for k in range (k_class):
      sigma_det=np.linalg.det(self.sigma[k])**(1/2)
      sigma_inv=np.linalg.inv(self.sigma[k])
      # sigma_det=0
      # sigma_inv=np.zeros((d,d))
      coef=0
      for i in range(n):
        
        coef=((2*pi)**(d/2))
        exp_part=(1/2)*(x[i]-self.mu[k]).T@np.linalg.inv(self.sigma[k])@(x[i]-self.mu[k])
        score[i,k]=1/(coef*sigma_det)*np.exp(-exp_part)*self.phi[k]
    return score

  def predict(self,x):
    probas=self.predict_proba(x)
    ypreds=np.argmax(probas, axis=1)  
     
    return ypreds
  
  def accuracy(self,y,ypreds):
    return np.mean(y==ypreds)*100

In [234]:
model= GDA()
model.fit(X_train,y_train)
model.phi

array([0.50125, 0.49875])

In [235]:
yproba= model.predict_proba(X_test)
yproba

array([[1.18981267e-001, 8.06893497e-007],
       [2.54714210e-002, 7.20409621e-014],
       [3.08928771e-048, 5.53461082e-002],
       [2.37332585e-002, 6.78231391e-006],
       [1.84111715e-003, 2.27925027e-002],
       [6.64186459e-002, 1.05380396e-009],
       [5.42921173e-041, 1.85008983e-002],
       [3.67480573e-004, 1.30700586e-002],
       [6.61155020e-057, 5.54467650e-002],
       [2.86551590e-034, 2.46725392e-002],
       [1.00123533e-002, 2.17803163e-002],
       [1.70964805e-048, 2.85087445e-002],
       [1.79641316e-166, 4.77360538e-003],
       [7.92012067e-019, 1.00171644e-002],
       [1.16401379e-001, 1.97853556e-004],
       [1.44897473e-002, 2.01035116e-002],
       [1.35116819e-001, 4.56987758e-006],
       [2.08121780e-002, 9.16723615e-016],
       [9.10238857e-002, 3.24007344e-009],
       [3.60554434e-002, 3.03428337e-005],
       [2.15754315e-036, 1.43501615e-002],
       [2.31641097e-002, 2.94560281e-005],
       [3.39777663e-009, 8.23387392e-003],
       [3.3

In [236]:
ypreds= model.predict(X_test)
ypreds


array([0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1])

In [237]:
model.accuracy(y_test, ypreds)

97.0