# GDA Implementation.

Implement the Gaussian Discriminant Analysis (GDA) learning algorithm following the steps as discussed in class.

INSTRUCTION: Rename your notebook as: <br>
`firstName_LastName_Live_coding_GDA.ipynb`.

Notes: 
* Do not use any built-in functions to complete a task;
* Do not import additional libraries.

In [111]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [112]:
# Generate data
def generate_data():
  x, y = make_classification(n_samples= 1000, n_features=3, n_redundant=0, 
                           n_informative=3, random_state=1, 
                           n_clusters_per_class=1)
  
  return x,y

x,y= generate_data() # get data
print(x.shape, y.shape)

(1000, 3) (1000,)


In [113]:
def split_data(x,y, train_size= 0.8):
    # shuffle the data to randomize the train/test split
    indices = np.random.permutation(len(y))
    
    # Split the shuffled indices into training and testing sets
    train_indices = indices[:int(train_size*len(y))]
    test_indices = indices[int(train_size*len(y)):]
    
    # Use the shuffled indices to extract the corresponding data
    x_train, x_test = x[train_indices], x[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return x_train, x_test, y_train, y_test

In [114]:
X_train, X_test, y_train, y_test= split_data(x, y, train_size=0.8)# split your data into x_train, x_test, y_train, y_test
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(800, 3) (800,) (200, 3) (200,)


In [115]:
def covariance(x, mu):
    d = x.shape[1]
    mu=np.mean(x,axis=0)
    cov = np.zeros((d, d))
    for i in range(d):
        for j in range(d):
            cov[i, j] = np.sum((x[:,i] - mu[i]) * (x[:,j] - mu[j])) / (len(x) - 1)
    return cov

In [116]:
print(np.mean(x,axis=0))

[-0.01298493  0.99925415  0.02975341]


In [117]:
mu=np.mean(x,axis=0)
covariance(x, mu)

array([[1.84495325, 0.02790646, 1.00137533],
       [0.02790646, 1.00170721, 0.05539176],
       [1.00137533, 0.05539176, 1.74832   ]])

In [118]:
np.cov(x,rowvar=0)

array([[1.84495325, 0.02790646, 1.00137533],
       [0.02790646, 1.00170721, 0.05539176],
       [1.00137533, 0.05539176, 1.74832   ]])

In [119]:
class GDA:
  def __init__(self):
    ## set mu, phi and sigma to None
    self.mu=None
    self.phi=None
    self.sigma=None
    
  def fit(self,x,y):
    k=2 #np.unique(y).size  # Number of class.
    d=x.shape[1] # input dim
    m= x.shape[0]# Number of examples.
    
    ## Initialize mu, phi and sigma
    self.mu= np.zeros((k,d))#: kxd, i.e., each row contains an individual class mu.
    self.sigma= np.zeros((k,d,d))#: kxdxd, i.e., each row contains an individual class sigma.
    self.phi= np.zeros((k))# d-dimension

    ## START THE LEARNING: estimate mu, phi and sigma.
    for lab in range(k):
        
        self.phi[lab] = np.sum(lab==y)/m 
        self.mu[lab] = np.mean(x[lab==y], axis=0)
        self.sigma[lab] = covariance(x[lab==y], self.mu[lab])
    return self.phi,self.mu, self.sigma
            
            

  def predict_proba(self,x):
    # reshape or flatt x.
    #x= x.reshape(-1, self.mu.shape[1])
    #x=x.reshape(-1,1)
    #x=self.mu.shape[0]
    d= x.shape[0]
    #k_class = self.mu.shape[0] 
    k_class= self.mu.shape[0]  # Number of classes we have in our case it's k = 2
    probabilities = np.zeros((d, k_class))
    det_cov = []
    inv_cov = []
    
    ## START THE LEARNING: estimate mu, phi and sigma.
    for lab in range(k_class):
        det_cov= np.linalg.det(self.sigma[lab])
        inv_cov=np.linalg.inv(self.sigma[lab])
        for j in range(x.shape[0]):
            first_term=1/((2*np.pi)**(d/2)*(det_cov**0.5))
            #first_term=((1/(((2*np.pi)**(d/2))*(det_cov**0.5)))
            exponential=-0.5*((x[j]-self.mu[lab]).T)@(inv_cov)@(x[j]-self.mu[lab])
            second_term=np.exp(exponential)
            probabilities[j, lab] = first_term*second_term*self.phi[lab]
    return probabilities

  def predict(self,x):
    predict=self.predict_proba(x)
    
    y = np.argmax(predict,axis = 1)
    #Predict = np.argmax(self.predict_proba(x))
    return y
    
  
  def accuracy(self, y, ypreds):
#     ypred = self.predict(y)
    result = np.mean(y == ypreds)
    return result * 100 

In [120]:
model= GDA()
model.fit(X_train,y_train)

(array([0.50625, 0.49375]),
 array([[ 0.97158398,  1.04184047,  1.00013347],
        [-1.01066815,  0.97276821, -0.93414824]]),
 array([[[ 0.91041516, -0.37996505, -0.05881342],
         [-0.37996505,  1.64272683,  0.09266468],
         [-0.05881342,  0.09266468,  0.03710057]],
 
        [[ 0.83187797,  0.34838457,  0.16597655],
         [ 0.34838457,  0.36286157, -0.07449524],
         [ 0.16597655, -0.07449524,  1.58301942]]]))

In [121]:
yproba= model.predict_proba(X_test)
yproba

array([[8.54640805e-151, 8.05798495e-081],
       [3.76266333e-210, 1.45416664e-081],
       [8.38430948e-081, 2.46456295e-085],
       [1.36693302e-080, 3.51599754e-086],
       [2.60536747e-094, 2.71093425e-081],
       [3.43738189e-185, 5.30029558e-081],
       [2.48273213e-104, 8.90787342e-081],
       [3.23279711e-080, 5.13036960e-082],
       [5.10526019e-087, 8.31190460e-081],
       [5.55913864e-117, 8.93840224e-081],
       [6.00518694e-081, 2.31149987e-082],
       [4.91468279e-082, 5.08489011e-081],
       [2.29852102e-200, 3.65085163e-081],
       [1.96167133e-080, 1.02778847e-082],
       [3.22550180e-080, 1.55557474e-081],
       [1.71957362e-115, 9.49515755e-081],
       [3.21605387e-136, 1.17665077e-080],
       [1.26319569e-241, 1.13914352e-081],
       [8.42339648e-081, 2.08258779e-083],
       [1.68033287e-080, 9.69116161e-084],
       [7.87651818e-142, 6.29431462e-081],
       [9.93470831e-082, 9.54897868e-086],
       [1.37410895e-083, 6.86389857e-081],
       [1.4

In [122]:
ypreds= model.predict(X_test)
ypreds


array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0])

In [130]:
model.accuracy(y_test, ypreds)

97.5