# Dimensionality Reduction Using LDA (Linear Discriminant Analysis )

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
df = pd.read_csv(r'wine.csv')
print(df.shape)
df.head(2)

(178, 14)


Unnamed: 0,class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050


In [3]:
X = df.iloc[:,1:]
y = df['class']


In [4]:
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict,KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.grid_search import GridSearchCV
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)




In [5]:
sc = StandardScaler()
sc.fit(X)
lda = LinearDiscriminantAnalysis(n_components=2)
lr = LogisticRegression()
pipeline = Pipeline(steps=[('lda',lda),('logistic',lr)]) 
pipeline.fit(X, y)
kfold = KFold(n_splits=10, random_state=1,shuffle=True)
results = cross_val_score(pipeline, X, y, scoring='accuracy', cv=kfold)
predict = cross_val_predict(pipeline,X,y,cv=10)
print("Accuracy : ",results.mean())
print("Classification Report : \n",classification_report(predict,y))
print("Confusion Metrix : \n",confusion_matrix(predict,y))

Accuracy :  0.9882352941176471
Classification Report : 
              precision    recall  f1-score   support

          1       1.00      0.98      0.99        60
          2       0.97      0.99      0.98        70
          3       0.98      0.98      0.98        48

avg / total       0.98      0.98      0.98       178

Confusion Metrix : 
 [[59  1  0]
 [ 0 69  1]
 [ 0  1 47]]




### USing GridSearchCV

In [87]:
n_components = [3,4,5,6]

lda = LinearDiscriminantAnalysis()
lr = LogisticRegression()

pipeline_2 = Pipeline(steps=[('lda',lda),('logistic',lr)]) 
param_dict = [{'lda__n_components':n_components}]


estimator = GridSearchCV(pipeline_2, param_grid=param_dict, scoring='accuracy')

In [88]:
estimator.fit(X,y)
best = estimator.best_estimator_
best

Pipeline(memory=None,
     steps=[('lda', LinearDiscriminantAnalysis(n_components=3, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [89]:
print(estimator.best_score_)

0.9719101123595506
