In [19]:
import numpy as np
from scipy import stats

class KNearestNeighbours:
    """Brute-force k-nearest-neighbours estimator using Manhattan (L1) distance.

    Parameters
    ----------
    k : int
        Number of neighbours consulted for every prediction. It is validated
        in ``predict`` (not here), so an instance can be created with any k.
    regression : bool, default True
        If True, a prediction is the mean of the k nearest targets;
        otherwise it is their most frequent value.
    """

    def __init__(self,k,regression=True):
        self.k = k
        self.regression = regression

    def fit(self,X_train,Y_train):
        """Store the training features and targets; no computation happens here."""
        self.X_train = X_train
        self.Y_train = Y_train

    @staticmethod
    def _most_common(labels):
        """Return the most frequent label; ties resolve to the smallest label."""
        # np.unique returns sorted values, and argmax picks the first maximum.
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self,X_test):
        """Predict a target for every row of ``X_test``.

        Parameters
        ----------
        X_test : 2-D array-like (DataFrame or ndarray), samples x features
            Must have the same number of columns as the training features.

        Returns
        -------
        list
            One prediction per test row, in row order.

        Raises
        ------
        ValueError
            If the inputs are not 2-D, the feature counts differ, the number
            of targets does not match the number of training rows, or k is
            not in ``1..n_train``.
        """
        # Work on plain arrays: rows are matched by position, so the pandas
        # index labels of the inputs are irrelevant.
        train_X = np.asarray(self.X_train, dtype=float)
        train_y = np.asarray(self.Y_train).ravel()
        test_X = np.asarray(X_test, dtype=float)

        if train_X.ndim != 2 or test_X.ndim != 2:
            raise ValueError("X_train and X_test must be 2-D (samples x features)")
        if train_X.shape[1] != test_X.shape[1]:
            raise ValueError("X_train and X_test must have the same number of features")
        if train_y.shape[0] != train_X.shape[0]:
            raise ValueError("Y_train must contain one target per row of X_train")
        if not 1 <= self.k <= train_X.shape[0]:
            raise ValueError("k must be between 1 and the number of training rows")

        # Order the training set by target once. Together with the stable
        # distance sort below, equal distances are then resolved in favour of
        # the smaller target, i.e. the same order as sorting (distance, target)
        # tuples.
        by_target = np.argsort(train_y, kind="stable")
        train_X = train_X[by_target]
        train_y = train_y[by_target]

        y_pred = []
        for row in test_X:
            # L1 distance to every training row at once; NaN differences
            # contribute nothing to the sum.
            distances = np.nansum(np.abs(train_X - row), axis=1)
            nearest = np.argsort(distances, kind="stable")[:self.k]
            neighbors = train_y[nearest]
            if self.regression:
                y_pred.append(np.mean(neighbors))
            else:
                y_pred.append(self._most_common(neighbors))
        return y_pred

In [20]:
import pandas as pd

In [21]:
# Location of the Wine Quality CSV. NOTE(review): absolute, machine-specific
# path; the notebook will not run elsewhere without editing it.
wine_csv = 'D:/210968186/Wine Quality/WineQT.csv'
df = pd.read_csv(wine_csv)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


In [22]:
# Fill missing values with the column means, then divide every column
# (including 'quality' and 'Id') by its maximum.
df_filled = df.fillna(df.mean())
df = df_filled / df_filled.max()

In [23]:
# Target vector: the 'quality' column of df (already divided by its maximum
# in the preceding cell, hence values such as 0.625 instead of 5).
Y = df['quality']
Y

0       0.625
1       0.625
2       0.625
3       0.750
4       0.625
        ...  
1138    0.750
1139    0.750
1140    0.625
1141    0.750
1142    0.625
Name: quality, Length: 1143, dtype: float64

In [24]:
# Feature matrix: every column of df except 'Id' and the 'quality' target.
X = df.drop(columns=['Id','quality'])
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872
1,0.490566,0.556962,0.00,0.167742,0.160393,0.367647,0.231834,0.993135,0.798005,0.340,0.657718
2,0.490566,0.481013,0.04,0.148387,0.150573,0.220588,0.186851,0.993335,0.812968,0.325,0.657718
3,0.704403,0.177215,0.56,0.122581,0.122750,0.250000,0.207612,0.994331,0.788030,0.290,0.657718
4,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872
...,...,...,...,...,...,...,...,...,...,...,...
1138,0.396226,0.322785,0.13,0.148387,0.124386,0.426471,0.138408,0.992079,0.852868,0.375,0.738255
1139,0.427673,0.392405,0.08,0.122581,0.111293,0.411765,0.131488,0.992846,0.852868,0.410,0.637584
1140,0.389937,0.379747,0.08,0.129032,0.147300,0.470588,0.152249,0.991242,0.860349,0.290,0.704698
1141,0.371069,0.348101,0.10,0.141935,0.101473,0.573529,0.176471,0.991462,0.877805,0.380,0.751678


In [25]:
# For every feature, replace values lying more than 2 standard deviations
# from the column mean with the column median, printing the column mean/std
# before and after the replacement.
for values in X:
    print('Feature selected:',values)
    print('\n')
    col_mean = X[values].mean()
    col_std = X[values].std()
    print("Mean before removal:",col_mean,"Std:",col_std)
    outliers = (X[values] < col_mean - 2*col_std) | (X[values] > col_mean + 2*col_std)
    # Assign through a single .loc on the frame itself. The chained form
    # X[col].loc[mask] = ... may write to a temporary Series (it triggers
    # SettingWithCopyWarning and changes nothing under pandas Copy-on-Write).
    X.loc[outliers, values] = X[values].median()
    print("Mean after removal:",X[values].mean(),"Std:",X[values].std())
    print('\n----')
    print('\n')


Feature selected: fixed acidity


Mean before removal: 0.5227113906359185 Std: 0.10991163629997089
Mean after removal: 0.506803787891293 Std: 0.08591233799337536

----


Feature selected: volatile acidity


Mean before removal: 0.33629024220073833 Std: 0.11369189431805354
Mean after removal: 0.3272090988626429 Std: 0.09742196383834444

----


Feature selected: citric acid


Mean before removal: 0.26836395450568584 Std: 0.1966858523482191
Mean after removal: 0.25717410323709433 Std: 0.18321790963517273

----


Feature selected: residual sugar


Mean before removal: 0.16336466006265388 Std: 0.08747854623759185
Mean after removal: 0.1473964947929899 Std: 0.040927913511232

----


Feature selected: chlorides


Mean before removal: 0.14227926910118227 Std: 0.07736061858000093
Mean after removal: 0.13159157069359803 Std: 0.03088072300113206

----


Feature selected: free sulfur dioxide


Mean before removal: 0.22963949359271324 Std: 0.15074244299162967
Mean after removal: 0.21218671195512345

In [26]:
# Display X to inspect the features after the outlier-replacement loop.
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872
1,0.490566,0.556962,0.00,0.167742,0.160393,0.367647,0.231834,0.993135,0.798005,0.340,0.657718
2,0.490566,0.481013,0.04,0.148387,0.150573,0.220588,0.186851,0.993335,0.812968,0.325,0.657718
3,0.704403,0.177215,0.56,0.122581,0.122750,0.250000,0.207612,0.994331,0.788030,0.290,0.657718
4,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872
...,...,...,...,...,...,...,...,...,...,...,...
1138,0.396226,0.322785,0.13,0.148387,0.124386,0.426471,0.138408,0.992079,0.852868,0.375,0.738255
1139,0.427673,0.392405,0.08,0.122581,0.111293,0.411765,0.131488,0.992846,0.852868,0.410,0.637584
1140,0.389937,0.379747,0.08,0.129032,0.147300,0.470588,0.152249,0.991242,0.860349,0.290,0.704698
1141,0.371069,0.348101,0.10,0.141935,0.101473,0.191176,0.176471,0.991462,0.877805,0.380,0.751678


In [27]:
# Work on a copy: a plain `df2 = X` would only alias the frame, so adding the
# 'quality' column to df2 would also add it to X.
df2 = X.copy()
df2['quality'] = Y
df2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872,0.625
1,0.490566,0.556962,0.00,0.167742,0.160393,0.367647,0.231834,0.993135,0.798005,0.340,0.657718,0.625
2,0.490566,0.481013,0.04,0.148387,0.150573,0.220588,0.186851,0.993335,0.812968,0.325,0.657718,0.625
3,0.704403,0.177215,0.56,0.122581,0.122750,0.250000,0.207612,0.994331,0.788030,0.290,0.657718,0.750
4,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872,0.625
...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.396226,0.322785,0.13,0.148387,0.124386,0.426471,0.138408,0.992079,0.852868,0.375,0.738255,0.750
1139,0.427673,0.392405,0.08,0.122581,0.111293,0.411765,0.131488,0.992846,0.852868,0.410,0.637584,0.750
1140,0.389937,0.379747,0.08,0.129032,0.147300,0.470588,0.152249,0.991242,0.860349,0.290,0.704698,0.625
1141,0.371069,0.348101,0.10,0.141935,0.101473,0.191176,0.176471,0.991462,0.877805,0.380,0.751678,0.750


In [28]:
# Keep only the rows whose 'quality' (taken from df, aligned to df2 by index)
# lies strictly within 2 standard deviations of its mean.
quality = df['quality']
centre = quality.mean()
spread = quality.std()
within_range = (quality > centre - 2*spread) & (quality < centre + 2*spread)
df2 = df2.loc[within_range]
df2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872,0.625
1,0.490566,0.556962,0.00,0.167742,0.160393,0.367647,0.231834,0.993135,0.798005,0.340,0.657718,0.625
2,0.490566,0.481013,0.04,0.148387,0.150573,0.220588,0.186851,0.993335,0.812968,0.325,0.657718,0.625
3,0.704403,0.177215,0.56,0.122581,0.122750,0.250000,0.207612,0.994331,0.788030,0.290,0.657718,0.750
4,0.465409,0.443038,0.00,0.122581,0.124386,0.161765,0.117647,0.994132,0.875312,0.280,0.630872,0.625
...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.396226,0.322785,0.13,0.148387,0.124386,0.426471,0.138408,0.992079,0.852868,0.375,0.738255,0.750
1139,0.427673,0.392405,0.08,0.122581,0.111293,0.411765,0.131488,0.992846,0.852868,0.410,0.637584,0.750
1140,0.389937,0.379747,0.08,0.129032,0.147300,0.470588,0.152249,0.991242,0.860349,0.290,0.704698,0.625
1141,0.371069,0.348101,0.10,0.141935,0.101473,0.191176,0.176471,0.991462,0.877805,0.380,0.751678,0.750


In [29]:
# Rank the columns by absolute correlation with 'quality' and keep the six
# entries at positions 1..6 of the descending ranking. Position 0 is skipped;
# presumably it is 'quality' itself (self-correlation of 1) - verify if the
# data changes.
quality_corr = df2.corr()['quality'].abs()
ranked = quality_corr.sort_values(ascending=False)
features = ranked.iloc[1:7].index.to_list()
features

['alcohol',
 'sulphates',
 'volatile acidity',
 'chlorides',
 'citric acid',
 'total sulfur dioxide']

In [30]:
# Re-derive the target from the filtered frame df2 so that it covers the same
# rows as the features selected from df2.
Y = df2['quality']
Y

0       0.625
1       0.625
2       0.625
3       0.750
4       0.625
        ...  
1138    0.750
1139    0.750
1140    0.625
1141    0.750
1142    0.625
Name: quality, Length: 1088, dtype: float64

In [31]:
# Keep only the selected feature columns, then rescale each one by its maximum
# (the maxima may have changed after the outlier replacement and row filter).
X = df2.drop(columns=['quality'])[features]
X = X.div(X.max())
X

Unnamed: 0,alcohol,sulphates,volatile acidity,chlorides,citric acid,total sulfur dioxide
0,0.746032,0.565657,0.786517,0.426966,0.000000,0.306306
1,0.777778,0.686869,0.988764,0.550562,0.000000,0.603604
2,0.777778,0.656566,0.853933,0.516854,0.060606,0.486486
3,0.777778,0.585859,0.314607,0.421348,0.848485,0.540541
4,0.746032,0.565657,0.786517,0.426966,0.000000,0.306306
...,...,...,...,...,...,...
1138,0.873016,0.757576,0.573034,0.426966,0.196970,0.360360
1139,0.753968,0.828283,0.696629,0.382022,0.121212,0.342342
1140,0.833333,0.585859,0.674157,0.505618,0.121212,0.396396
1141,0.888889,0.767677,0.617978,0.348315,0.151515,0.459459


In [32]:
from sklearn.model_selection import train_test_split
# 75/25 split with a fixed seed; every part gets a fresh 0..n-1 index so that
# rows can be addressed by position and by label interchangeably.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size = 0.25, random_state = 0)
)

In [33]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn import metrics
# Sweep k from 100 down to 5 in steps of 5 and remember the model with the
# highest R2 on the test split.
# Start from -inf / None rather than 0 / a k=0 placeholder: R2 can be negative,
# and with a 0 baseline an all-negative sweep would leave an unusable k=0
# model as the "best" one.
r2_max = float('-inf')
best_model = None
for k in range(100,4,-5):
    model = KNearestNeighbours(k,True)
    model.fit(X_train,y_train)
    Y_pred = model.predict(X_test)
    r2 = r2_score(y_test,Y_pred)
    if r2 > r2_max:
        r2_max = r2
        best_model = model
    print('k:',k)
    print('R2 score:',r2)
    print('\n----')
    print('\n')

k: 100
R2 score: 0.22787664490205928

----


k: 95
R2 score: 0.23316575559547048

----


k: 90
R2 score: 0.2357222315233365

----


k: 85
R2 score: 0.2315114902876091

----


k: 80
R2 score: 0.23233299849321942

----


k: 75
R2 score: 0.23670169094257487

----


k: 70
R2 score: 0.2364415379411433

----


k: 65
R2 score: 0.2386130486597975

----


k: 60
R2 score: 0.24134605725765945

----


k: 55
R2 score: 0.24181394788946953

----


k: 50
R2 score: 0.24089442491210455

----


k: 45
R2 score: 0.24181483341704335

----


k: 40
R2 score: 0.24292817679558

----


k: 35
R2 score: 0.23502946934675428

----


k: 30
R2 score: 0.2325553881354988

----


k: 25
R2 score: 0.23431441486690108

----


k: 20
R2 score: 0.24918131592164738

----


k: 15
R2 score: 0.21313466153245153

----


k: 10
R2 score: 0.22097438473129072

----


k: 5
R2 score: 0.15363134103465603

----




  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [34]:
# Report the highest R2 found by the k sweep and the k of the model that
# achieved it.
print('best R2 score:',r2_max)
print('best K:',best_model.k)

best R2 score: 0.24918131592164738
best K: 20


0      49.0
1      35.0
2      42.0
3      35.0
4      35.0
       ... 
267    42.0
268    35.0
269    35.0
270    35.0
271    42.0
Name: quality, Length: 272, dtype: float64