# Chapter 12: Discriminant Analysis

> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck 
>
> Code included in
>
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) 
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.

## Import required packages

In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pylab as plt
# !pip install dmba
from dmba import classificationSummary


## Riding Mowers Classification

In [2]:
# Load the riding-mower data set from the working directory.
mow = pd.read_csv(filepath_or_buffer='RidingMowers.csv')

In [3]:
# Preview the first five records.
mow.head(n=5)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner


In [4]:
# Fit LDA with default settings: every column except the label is a predictor.
lda = LinearDiscriminantAnalysis()
lda.fit(X=mow.drop(columns=['Ownership']), y=mow['Ownership'])

LinearDiscriminantAnalysis()

In [5]:
# Predicted class label for each training record.
lda.predict(X=mow.drop(columns=['Ownership']))

array(['Nonowner', 'Owner', 'Owner', 'Owner', 'Owner', 'Owner', 'Owner',
       'Owner', 'Owner', 'Owner', 'Owner', 'Owner', 'Owner', 'Nonowner',
       'Nonowner', 'Nonowner', 'Owner', 'Nonowner', 'Nonowner',
       'Nonowner', 'Nonowner', 'Nonowner', 'Nonowner', 'Nonowner'],
      dtype='<U8')

In [6]:
# Class-membership probabilities, one row per record and one column per class.
lda.predict_proba(mow.drop('Ownership', axis='columns'))

array([[0.78203155, 0.21796845],
       [0.49449211, 0.50550789],
       [0.15236751, 0.84763249],
       [0.31924493, 0.68075507],
       [0.00402325, 0.99597675],
       [0.0124668 , 0.9875332 ],
       [0.05188913, 0.94811087],
       [0.01554353, 0.98445647],
       [0.29300716, 0.70699284],
       [0.01956031, 0.98043969],
       [0.3436552 , 0.6563448 ],
       [0.11070233, 0.88929767],
       [0.2371929 , 0.7628071 ],
       [0.52865847, 0.47134153],
       [0.8505169 , 0.1494831 ],
       [0.80075893, 0.19924107],
       [0.37757951, 0.62242049],
       [0.95203727, 0.04796273],
       [0.96165849, 0.03834151],
       [0.66288177, 0.33711823],
       [0.98387005, 0.01612995],
       [0.97514892, 0.02485108],
       [0.99644001, 0.00355999],
       [0.97819391, 0.02180609]])

In [7]:
# Class labels known to the fitted model; the 'p_owner' column built later in
# this notebook takes column 1 of predict_proba, i.e. the second label shown here.
lda.classes_

array(['Nonowner', 'Owner'], dtype='<U8')

In [8]:
# Wrap the predicted labels in a one-column frame so they can be joined to the data.
pred = pd.DataFrame(
    data=lda.predict(mow.drop(columns=['Ownership'])),
    columns=['predicted'],
)

In [9]:
# Probability of the 'Owner' class for every record.
# The column is located by label through lda.classes_ rather than by the
# hard-coded position 1, so the result stays correct if the label set or its
# ordering ever changes.
owner_col = list(lda.classes_).index('Owner')
prob = pd.DataFrame(
    lda.predict_proba(mow.drop(columns='Ownership'))[:, owner_col],
    columns=['p_owner'],
)

In [10]:
# Actual label, predicted label and owner probability side by side.
pd.concat(objs=[mow['Ownership'], pred, prob], axis='columns')

Unnamed: 0,Ownership,predicted,p_owner
0,Owner,Nonowner,0.217968
1,Owner,Owner,0.505508
2,Owner,Owner,0.847632
3,Owner,Owner,0.680755
4,Owner,Owner,0.995977
5,Owner,Owner,0.987533
6,Owner,Owner,0.948111
7,Owner,Owner,0.984456
8,Owner,Owner,0.706993
9,Owner,Owner,0.98044


In [11]:
# Confusion matrix and accuracy of the default-prior model on the training data.
classificationSummary(
    mow['Ownership'],
    pred['predicted'],
)

Confusion Matrix (Accuracy 0.8750)

       Prediction
Actual  0  1
     0 10  2
     1  1 11


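**Prior probabilities**

By default, LDA estimates the prior probabilities from the class proportions in the training data. Below, the model is refitted with the following assumed population priors instead:

Owner - 85%

Nonowner - 15%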

In [12]:
# Population priors keyed by class label, so each value is tied to its class
# explicitly instead of relying on list position.
class_priors = {'Owner': 0.85, 'Nonowner': 0.15}

# scikit-learn expects priors in the order of classes_, which is the sorted
# unique labels of y; np.unique reproduces that order. A label missing from
# class_priors raises KeyError here rather than being silently mis-assigned.
lda2 = LinearDiscriminantAnalysis(
    priors=[class_priors[label] for label in np.unique(mow['Ownership'])]
)
lda2.fit(mow.drop(columns='Ownership'), mow['Ownership'])

LinearDiscriminantAnalysis(priors=[0.15, 0.85])

In [13]:
# Class labels of the refitted model; the priors list passed to lda2 must follow this order.
lda2.classes_

array(['Nonowner', 'Owner'], dtype='<U8')

In [14]:
# Predictions of the model fitted with custom priors, as a one-column frame.
pred2 = pd.DataFrame(
    data=lda2.predict(mow.drop(columns=['Ownership'])),
    columns=['predicted2'],
)
pred2.head(n=5)

Unnamed: 0,predicted2
0,Owner
1,Owner
2,Owner
3,Owner
4,Owner


In [15]:
# Confusion matrix and accuracy of the custom-prior model on the training data.
classificationSummary(
    mow['Ownership'],
    pred2['predicted2'],
)

Confusion Matrix (Accuracy 0.7917)

       Prediction
Actual  0  1
     0  7  5
     1  0 12


In [16]:
# Actual labels next to the predictions of both models for direct comparison.
both_pred = pd.concat(objs=[mow['Ownership'], pred, pred2], axis='columns')
both_pred.head(n=5)

Unnamed: 0,Ownership,predicted,predicted2
0,Owner,Nonowner,Owner
1,Owner,Owner,Owner
2,Owner,Owner,Owner
3,Owner,Owner,Owner
4,Owner,Owner,Owner


## Accidents Classification

In [17]:
# Load the accidents data set and preview the first five records.
acc = pd.read_csv(filepath_or_buffer='accidents.csv')
acc.head(n=5)

Unnamed: 0,RushHour,WRK_ZONE,WKDY,INT_HWY,LGTCON_day,LEVEL,SPD_LIM,SUR_COND_dry,TRAF_two_way,WEATHER_adverse,MAX_SEV
0,1,0,1,1,0,1,70,0,0,1,no-injury
1,1,0,1,0,0,0,55,0,1,0,non-fatal
2,1,0,0,0,0,0,35,0,0,1,no-injury
3,1,0,1,0,0,1,35,0,0,1,no-injury
4,1,0,1,0,0,0,25,0,0,1,non-fatal


In [18]:
# Fit LDA on accident severity; every column except MAX_SEV is a predictor.
lda3 = LinearDiscriminantAnalysis()
lda3.fit(X=acc.drop(columns='MAX_SEV'), y=acc['MAX_SEV'])

LinearDiscriminantAnalysis()

In [19]:
# Predicted severity class for every accident, as a one-column frame.
pred3 = pd.DataFrame(
    data=lda3.predict(acc.drop(columns=['MAX_SEV'])),
    columns=['predicted'],
)
pred3.head(n=5)

Unnamed: 0,predicted
0,no-injury
1,no-injury
2,no-injury
3,no-injury
4,no-injury


In [20]:
# Append the predicted class to the original accident records.
result = pd.concat(objs=[acc, pred3], axis='columns')
result

Unnamed: 0,RushHour,WRK_ZONE,WKDY,INT_HWY,LGTCON_day,LEVEL,SPD_LIM,SUR_COND_dry,TRAF_two_way,WEATHER_adverse,MAX_SEV,predicted
0,1,0,1,1,0,1,70,0,0,1,no-injury,no-injury
1,1,0,1,0,0,0,55,0,1,0,non-fatal,no-injury
2,1,0,0,0,0,0,35,0,0,1,no-injury,no-injury
3,1,0,1,0,0,1,35,0,0,1,no-injury,no-injury
4,1,0,1,0,0,0,25,0,0,1,non-fatal,no-injury
...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,1,0,1,0,55,1,1,0,no-injury,no-injury
596,1,0,0,0,1,0,55,1,1,0,no-injury,non-fatal
597,0,0,1,0,1,0,55,1,1,0,no-injury,no-injury
598,0,0,0,0,1,0,65,1,1,0,non-fatal,non-fatal


In [21]:
# Confusion matrix and accuracy with default (numeric) class labels.
classificationSummary(
    acc['MAX_SEV'],
    pred3['predicted'],
)

Confusion Matrix (Accuracy 0.5283)

       Prediction
Actual   0   1   2
     0   1   1   3
     1   6 114 172
     2   6  95 202


In [22]:
# Class labels of the accidents model; a later cell passes them as class_names
# to classificationSummary to label the confusion matrix.
lda3.classes_

array(['fatal', 'no-injury', 'non-fatal'], dtype='<U9')

In [23]:
# Class-membership probabilities for every accident, one column per severity class.
lda3.predict_proba(X=acc.drop(columns='MAX_SEV'))

array([[2.58367054e-03, 6.18769483e-01, 3.78646847e-01],
       [2.63265433e-01, 4.71200103e-01, 2.65534465e-01],
       [3.76907831e-04, 5.35717590e-01, 4.63905503e-01],
       ...,
       [2.42471329e-03, 5.08766883e-01, 4.88808404e-01],
       [1.70823665e-02, 4.50482201e-01, 5.32435433e-01],
       [2.16884771e-02, 4.59800507e-01, 5.18511016e-01]])

In [24]:
# Same summary as before, with rows and columns labelled by class name.
classificationSummary(
    acc['MAX_SEV'],
    pred3['predicted'],
    class_names=lda3.classes_,
)

Confusion Matrix (Accuracy 0.5283)

          Prediction
   Actual     fatal no-injury non-fatal
    fatal         1         1         3
no-injury         6       114       172
non-fatal         6        95       202


In [25]:
# Fitted coefficient matrix; the recorded output has one row per class (3)
# and one column per predictor (10).
lda3.coef_

array([[-9.96410915e-01, -4.57187516e-01, -1.47177737e+00,
         7.55343534e-01,  9.51536302e-03,  9.76626140e-01,
         4.80327050e-02, -5.99980857e+00,  7.52985312e-01,
        -6.59668997e+00],
       [ 3.34301594e-02,  2.20011814e-01,  1.65706969e-01,
        -7.58161346e-02, -3.14214567e-02, -8.27169309e-02,
         4.38065885e-03, -1.64874117e-01, -1.28440158e-02,
         7.91657803e-02],
       [-1.57740989e-02, -2.04480238e-01, -1.35404450e-01,
         6.05993189e-02,  3.01237246e-02,  6.35980631e-02,
        -5.01424393e-03,  2.57895330e-01, -4.77688520e-05,
         3.25644951e-02]])

## 12.5 Prior probabilities