In [1]:
# Can be used for both supervised and unsupervised learning.


Assumptions made in Naive Bayes

1. with gaussian naive bayes, all independent variables are need to be numbers.
2. we also condider that all independent variables are independent on each other.

for mutivariate - considerations
1. Follows multivariate normal distribution 
2. Covariances are same (assumed)



In [70]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("HR_comma_sep.csv")
df.head(3)

In [18]:
x = df.drop('left', axis = 'columns')
y = df['left']

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                   random_state = 24,
                                                   test_size = 0.3,
                                                   stratify = y)

In [22]:
imp_cat = SimpleImputer(strategy='constant', fill_value="unknown")
imp_num = SimpleImputer(strategy="median")
trans_imp = make_column_transformer((imp_cat, make_column_selector(dtype_include=object)), 
                                    (imp_num, make_column_selector(dtype_exclude=object)),
                                   verbose_feature_names_out=False)
trans_imp = trans_imp.set_output(transform="pandas")
X_imp = trans_imp.fit_transform(x)
print(X_imp.isnull().sum().sum())
print(X_imp.columns)

0
Index(['Department', 'salary', 'satisfaction_level', 'last_evaluation',
       'number_project', 'average_montly_hours', 'time_spend_company',
       'Work_accident', 'promotion_last_5years'],
      dtype='object')


In [24]:
ohe = OneHotEncoder(
    handle_unknown="ignore", 
    sparse_output=False, 
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False
).set_output(transform='pandas')
trans_ohe = trans_ohe.set_output(transform='pandas')
X_imp_ohe = trans_ohe.fit_transform(X_imp)
X_imp_ohe.dtypes

satisfaction_level        float64
last_evaluation           float64
number_project            float64
average_montly_hours      float64
time_spend_company        float64
Work_accident             float64
promotion_last_5years     float64
Department_RandD          float64
Department_accounting     float64
Department_hr             float64
Department_management     float64
Department_marketing      float64
Department_product_mng    float64
Department_sales          float64
Department_support        float64
Department_technical      float64
salary_low                float64
salary_medium             float64
dtype: object

In [36]:
lr = LogisticRegression(random_state = 24)

kfold = StratifiedKFold(n_splits=5, 
                        random_state=24, 
                        shuffle=True)

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

pipe = Pipeline([('IMP',trans_imp), ('OHE',trans_ohe),("SCL", scaler_std), ('MODEL', lr)])

params = {'MODEL__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
         'MODEL__C':np.linspace(0.001, 10, 20)}

gcv = GridSearchCV(
    pipe, 
    param_grid = params, 
    scoring='neg_log_loss',     
    cv=kfold, 
    verbose=3)

qda = QuadraticDiscriminantAnalysis()

kfold = StratifiedKFold(n_splits=5, 
                        random_state=24, 
                        shuffle=True)

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

pipe = Pipeline([('IMP',trans_imp), ('OHE',trans_ohe), ("SCL", scaler_std), ('MODEL', qda)])



In [64]:
pipe.fit(x_train, y_train)

In [66]:
y_pred = pipe.predict(x_test)

In [68]:
accuracy_score(y_test, y_pred)

0.863080684596577

In [72]:
log_loss(y_test, y_pred)

4.935072346676187

# Vehicle Silhouettes

In [95]:
df = pd.read_csv("Vehicle.csv")
df.head(3)


Unnamed: 0,Comp,Circ,D.Circ,Rad.Ra,Pr.Axis.Ra,Max.L.Ra,Scat.Ra,Elong,Pr.Axis.Rect,Max.L.Rect,Sc.Var.Maxis,Sc.Var.maxis,Ra.Gyr,Skew.Maxis,Skew.maxis,Kurt.maxis,Kurt.Maxis,Holl.Ra,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab


In [97]:
x = df.drop('Class', axis = 'columns')
y = df['Class']

In [101]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                   random_state = 24,
                                                   test_size = 0.3,
                                                   stratify = y)

In [83]:
lda = LinearDiscriminantAnalysis().set_output(transform = 'pandas')
lda.fit(x_train, y_train)

In [85]:
x_train_lda = lda.transform(x_train)
x_train_lda.shape

(592, 3)

In [89]:
lr = LogisticRegression()
lr.fit(x_train_lda, y_train)

In [91]:
y_pred_lr = lr.predict(lda.transform(x_test))
accuracy_score(y_test, y_pred_lr)

0.7992125984251969

In [105]:
# Using Logistic Regression and Pipeline

lda = LinearDiscriminantAnalysis()
lr = LogisticRegression()

pipe = Pipeline([('DA', lda), ("LR", lr)])

pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
accuracy_score(y_test, y_pred)

0.7992125984251969

In [107]:
# QDA supervised
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
y_pred = qda.predict(x_test)
accuracy_score(y_test, y_pred)

0.8543307086614174

# Discriminant analysis on wine dataset

In [110]:
wine = pd.read_csv("wine.csv")
wine

Unnamed: 0,Class,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoid,Proanthocyanins,Intensity,Hue,OD280,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [124]:
x = wine.drop('Class', axis = 'columns')
y = wine['Class']

In [126]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                   random_state = 24,
                                                   test_size = 0.3,
                                                   stratify = y)

In [128]:
lda = LinearDiscriminantAnalysis().set_output(transform = 'pandas')
lda.fit(x_train, y_train)

In [130]:
x_train_lda = lda.transform(x_train)
x_train_lda.shape

(124, 2)

In [132]:
lr = LogisticRegression()
lr.fit(x_train_lda, y_train)

In [134]:
y_pred_lr = lr.predict(lda.transform(x_test))
accuracy_score(y_test, y_pred_lr)

1.0

In [146]:
# Using QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)
y_pred = qda.predict(x_test)
accuracy_score(y_test, y_pred)

0.9814814814814815