- y = f(x) ---> Univariate Normal (Normal Distribution)
- y = f(x1, x2) ---> Bivariate Normal
- y = f(x1, x2, ...) ---> Multi-variate Normal

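- Linear Discriminant Analysis (LDA): assumes all classes share one common variance-covariance matrix
  - Used in supervised learning (classification) and for dimensionality reduction, treated here as its unsupervised-style use (fitting the projection still requires the class labels)
- Quadratic Discriminant Analysis (QDA): estimates a separate covariance matrix for each category (class)
  - Used only in supervised learning (classification)
  - It cannot be used for dimensionality reduction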

In [71]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss, accuracy_score, r2_score
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# HR Dataset
- Used as an example of supervised learning (classification)

In [51]:
# Load the HR dataset from a CSV file in the working directory and show it as the cell output.
hr_data = pd.read_csv('HR_comma_sep.csv')
hr_data

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.10,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.00,5,224,5,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,1,0,support,low
14991,0.37,0.48,2,160,3,0,1,0,support,low
14992,0.37,0.53,2,143,3,0,1,0,support,low
14993,0.11,0.96,6,280,4,0,1,0,support,low


In [52]:
# 'left' is the target label; every other column is a predictor.
y = hr_data['left']
X = hr_data.drop(columns=['left'])

In [53]:
# Stratified 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [54]:
# Preprocessing: one-hot encode the object (categorical) columns and pass
# every other column through unchanged.
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
mct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False,
)
# Make transform() return a pandas DataFrame. set_output() must be called on
# the transformer built above; `ohe_mct` is not defined anywhere in this
# notebook, so referencing it raises NameError on a fresh kernel.
mct = mct.set_output(transform='pandas')

# Linear Discriminant Analysis

In [55]:
# LDA as the final pipeline step acts as the classifier: preprocess with the
# column transformer, fit on the training split, then score the hold-out split.
lda = LinearDiscriminantAnalysis()
pipe = Pipeline(steps=[('MCT', mct), ('LDA', lda)])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7695043342965103


In [57]:
# Evaluate the fitted pipeline with the log-loss metric, which is computed
# from the predicted class probabilities rather than the predicted labels.

y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))


0.5329621473428426


# Vehicle Silhouettes DataSet
- LDA used for dimensionality reduction (an unsupervised-style use, although fitting LDA still requires the class labels)

In [58]:
# Load the vehicle silhouettes data from a CSV file in the working directory.
vehicle = pd.read_csv('Vehicle.csv')

In [59]:
# Distinct labels present in the target column.
vehicle['Class'].unique()

array(['van', 'saab', 'bus', 'opel'], dtype=object)

In [60]:
# 'Class' is the target label; the remaining columns are the predictors.
y = vehicle['Class']
X = vehicle.drop(columns=['Class'])

In [61]:
# Stratified 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [62]:
# Fit LDA on the training split; transform() is configured to return a DataFrame.
lda = LinearDiscriminantAnalysis()
lda.set_output(transform='pandas')
lda.fit(X_train, y_train)

In [63]:
# Project the training features with the fitted LDA and compare the shapes
# before and after the projection.
x_train_lda = lda.transform(X_train)
print(X_train.shape)
print(x_train_lda.shape)

(592, 18)
(592, 3)


In [64]:
from sklearn.linear_model import LogisticRegression


In [67]:
# Logistic regression trained on the LDA-projected features (manual two-step version).
x_test_lda = lda.transform(X_test)

log = LogisticRegression()
log.fit(x_train_lda, y_train)

y_pred = log.predict(x_test_lda)
print(accuracy_score(y_test, y_pred))

0.7992125984251969


In [73]:
# Same model as the previous cell, expressed as a Pipeline.
#
# The role LDA plays depends on where it sits in the pipeline:
#   - Pipeline([('MCT', mct), ('Da', lda)])  -> LDA is the last step, so it is the predictor.
#   - Pipeline([('Da', lda), ('Log', log)])  -> LDA is an intermediate step, so it is used
#     as a transformer whose output feeds the logistic regression.

pipe = Pipeline(steps=[('Da', lda), ('Log', log)])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7992125984251969


# --------------------------------------------------------------

In [68]:
# Baseline for comparison: logistic regression fitted directly on the raw
# features, with no LDA step and no pipeline.
log = LogisticRegression()
y_pred = log.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7559055118110236


# Quadratic Discriminant Analysis
- Supervised Learning

In [79]:
# QDA as the final pipeline step (classifier), preceded by the column
# transformer `mct` defined in the HR section of this notebook.
qda = QuadraticDiscriminantAnalysis()
pipe = Pipeline(steps=[('MCT', mct), ('QDA', qda)])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8543307086614174


# Wine Dataset

In [100]:
# Load the wine data from a CSV file in the working directory.
wine = pd.read_csv('wine.csv')

In [101]:
# Preview the first five rows.
wine.head(n=5)

Unnamed: 0,Class,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoid,Proanthocyanins,Intensity,Hue,OD280,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [102]:
# 'Class' is the target label; the remaining columns are the predictors.
y = wine['Class']
X = wine.drop(columns=['Class'])

In [103]:
# Stratified 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [104]:
# Fit LDA on the wine training split; transform() is configured to return a DataFrame.
lda = LinearDiscriminantAnalysis()
lda.set_output(transform='pandas')
lda.fit(X_train, y_train)

In [105]:
# Project the training features with the fitted LDA and compare the shapes
# before and after the projection.
x_train_lda = lda.transform(X_train)
print(X_train.shape)
print(x_train_lda.shape)

(124, 13)
(124, 2)


In [106]:
from sklearn.linear_model import LogisticRegression

In [107]:
# using Pipeline for logistic Regression with LDA
pipe = Pipeline([('Da',lda),('Log',log)])         
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))

1.0


In [108]:
# Log loss of the LDA + logistic regression pipeline (`pipe` from the cell above),
# computed from its predicted class probabilities on the test split.
y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.015394135395675726


In [109]:
# QDA used directly as the classifier (no logistic regression in this pipeline),
# preceded by the column transformer `mct` defined in the HR section.
qda = QuadraticDiscriminantAnalysis()
pipe = Pipeline(steps=[('MCT', mct), ('QDA', qda)])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9814814814814815


In [110]:
# Log loss of the QDA pipeline (`pipe` from the cell above), computed from its
# predicted class probabilities on the test split.
y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.054068361848530005
