In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import svm,metrics
from sklearn.model_selection import train_test_split

In [91]:
df = pd.read_csv('/winequality-red.csv',sep=';')

In [92]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [93]:
X = df.values[:,:11]
Y = df.values[:,[11]]

In [94]:
def pca(X,reduced_cols_cnt):
    #getting colwise mean for the input data and subtracting it from original input data
    mean_x= np.mean(X , axis = 0)
    x_avg_mod = X - mean_x

    #getting covariance matrix
    cov_mat = np.cov(x_avg_mod , rowvar = False)

    #finding eigen values and eigen vectors
    eigen_values , eigen_vectors = np.linalg.eigh(cov_mat)
    

    #sorting by eigen values in non increasing order
    sorted_index = np.argsort(eigen_values)[::-1]

    sorted_eigenvalue = eigen_values[sorted_index]
    #sorting eigen vectors
    sorted_eigenvectors = eigen_vectors[:,sorted_index]
    
    #considering the first reduced_cols_cnt columns 
    eigenvector_subset = sorted_eigenvectors[:,0:reduced_cols_cnt]
    return np.dot(eigenvector_subset.transpose(),x_avg_mod.transpose()).transpose()

In [103]:
#applying priciple component analysis on data and reducing the data to 2 dimensional 
#no_cols_to_be_reduced_to
cl = 3

pca_inp_arr=pd.DataFrame(pca(X,cl),columns=['PCA1','PCA2','PCA3'])
train_label=pd.DataFrame(Y,columns=['target'])

pca_df=pd.concat([pca_inp_arr,train_label],axis=1)





In [104]:
pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3,target
0,13.224905,-2.0239,-1.126821,5.0
1,-22.037724,4.408322,-0.310378,5.0
2,-7.162673,-2.501461,-0.581868,5.0
3,-13.430063,-1.951122,2.63404,6.0
4,13.224905,-2.0239,-1.126821,5.0


In [105]:
#splitting our original data into training and testing data in 80:20 ratio 
#reducing dimension without pca 
#considering first two cols
X_no_pca = df.values[:,:cl]
Y_no_pca = df.values[:,[11]]

train_data,test_data,train_label,test_label=train_test_split(X_no_pca,Y_no_pca,train_size= 0.8,test_size=0.2)

In [106]:
#splitting pca data into training and testing data  in 80:20 ratio
X_pca = pca_df.values[:,:cl]
Y_pca = pca_df.values[:,[cl]]

train_data_pca,test_data_pca,train_label_pca,test_label_pca=train_test_split(X_pca,Y_pca,train_size= 0.8,test_size=0.2)


Verifying using KNN Classifier 

In [107]:

import math
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return math.sqrt(distance)

# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors
 
# Make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

In [108]:
k=3
correct=0
total=0

original_train_data=np.concatenate((train_data,train_label),axis=1)
original_test_data=np.concatenate((test_data,test_label),axis=1)
for i in original_test_data:
    correct+=int(predict_classification(original_train_data, i, k)==i[-1])
    total+=1

print('Accuracy without PCA :' ,correct/total)

Accuracy without PCA : 0.46875


In [109]:
k=3
correct=0
total=0
#preparing training and testing data
pca_train_data=np.concatenate((train_data_pca,train_label_pca),axis=1)
pca_test_data=np.concatenate((test_data_pca,test_label_pca),axis=1)
for i in pca_test_data:
    correct+=int(predict_classification(pca_train_data, i, k)==i[-1])
    total+=1

print('Accuracy with PCA :' ,correct/total)

Accuracy with PCA : 0.503125
