In [38]:
import numpy
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

df = pd.read_csv('winequality-red.csv')

In [39]:

X = df.loc[:,'fixed acidity':'alcohol']

scaler = StandardScaler().fit(X)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [40]:
bins = (2, 5.5, 8)
group_names = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)
label_quality = LabelEncoder()
df['quality'] = label_quality.fit_transform(df['quality'])
y = df['quality']

In [41]:
from sklearn.decomposition import PCA
X=scaler.transform(X)
pca = PCA() # creates an instance of PCA class
results = pca.fit(X) # applies PCA on predictor variables
Z = results.transform(X) # create a new array of latent variables

In [42]:
pd.DataFrame(results.components_)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.489314,-0.238584,0.463632,0.146107,0.212247,-0.036158,0.023575,0.395353,-0.43852,0.242921,-0.113232
1,-0.110503,0.27493,-0.151791,0.27208,0.148052,0.513567,0.569487,0.233575,0.006711,-0.037554,-0.386181
2,-0.123302,-0.449963,0.238247,0.101283,-0.092614,0.428793,0.322415,-0.338871,0.057697,0.279786,0.471673
3,-0.229617,0.07896,-0.079418,-0.372793,0.666195,-0.043538,-0.034577,-0.1745,-0.003788,0.550872,-0.122181
4,-0.082614,0.218735,-0.058573,0.732144,0.246501,-0.159152,-0.222465,0.157077,0.26753,0.225962,0.350681
5,0.101479,0.411449,0.069593,0.049156,0.304339,-0.014,0.136308,-0.391152,-0.522116,-0.381263,0.361645
6,-0.350227,-0.533735,0.105497,0.290663,0.370413,-0.116596,-0.093662,-0.170481,-0.025138,-0.447469,-0.327651
7,-0.177595,-0.078775,-0.377516,0.299845,-0.357009,-0.204781,0.019036,-0.239223,-0.561391,0.374604,-0.217626
8,-0.194021,0.12911,0.38145,-0.007523,-0.111339,-0.635405,0.592116,-0.020719,0.167746,0.058367,-0.037603
9,-0.249523,0.365925,0.621677,0.092872,-0.217671,0.248483,-0.37075,-0.23999,-0.01097,0.11232,-0.303015


In [43]:
pd.DataFrame(Z[:,:6], columns=list(
[u'Acidity', u'Sulphides', u'More alcohol', u'Chlorides', u'More residual sugar', u'Less pH'])).head(10)

Unnamed: 0,Acidity,Sulphides,More alcohol,Chlorides,More residual sugar,Less pH
0,-1.61953,0.45095,-1.774454,0.04374,0.067014,-0.913921
1,-0.79917,1.856553,-0.91169,0.548066,-0.018392,0.929714
2,-0.748479,0.882039,-1.171394,0.411021,-0.043531,0.401473
3,2.357673,-0.269976,0.243489,-0.92845,-1.499149,-0.131017
4,-1.61953,0.45095,-1.774454,0.04374,0.067014,-0.913921
5,-1.583707,0.569195,-1.538286,0.02375,-0.110076,-0.993626
6,-1.101464,0.608015,-1.075915,-0.343959,-1.133382,0.175
7,-2.248708,-0.416835,-0.986837,-0.001203,-0.780435,0.286057
8,-1.086887,-0.308569,-1.51815,0.003315,-0.226727,-0.512634
9,0.65479,1.665207,1.209476,-0.824635,1.718501,-0.476497


In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [45]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train,y_train)
pred = neigh.predict(X_test)
print('Confusion matrix:')
print(confusion_matrix(pred,y_test))
print('\nAccuracy:')
print(accuracy_score(pred,y_test))
print(classification_report(pred,y_test))

Confusion matrix:
[[ 87  38]
 [ 54 141]]

Accuracy:
0.7125
              precision    recall  f1-score   support

           0       0.62      0.70      0.65       125
           1       0.79      0.72      0.75       195

    accuracy                           0.71       320
   macro avg       0.70      0.71      0.70       320
weighted avg       0.72      0.71      0.71       320



In [46]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(Z[:,:6], y, test_size = 0.2, random_state = 42)

In [47]:
#In nearest neighbour the parameter which needs to be decided is number of neighbours. In this we see keeping this at 5 yields the best accuracy and also we if we consider the Principal Components then also we achieve almost the same accuracy.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train1,y_train1)
pred = neigh.predict(X_test1)
print('Confusion matrix:')
print(confusion_matrix(pred,y_test1))
print('\nAccuracy:')
print(accuracy_score(pred,y_test1))
print(classification_report(pred,y_test1))

Confusion matrix:
[[ 90  42]
 [ 51 137]]

Accuracy:
0.709375
              precision    recall  f1-score   support

           0       0.64      0.68      0.66       132
           1       0.77      0.73      0.75       188

    accuracy                           0.71       320
   macro avg       0.70      0.71      0.70       320
weighted avg       0.71      0.71      0.71       320



In [None]:
#As the support of the class 1 in test set is more we also see it gives better f1-score and precision recall indicating that Tuype-1 wrror is less for class 1.
#The nearest neighbour doesnt work that well for this dataset as the accuracy is around .70.