In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from pandas_profiling import ProfileReport

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## KNN

In [5]:
df = pd.read_csv('winequality-red.csv', sep = ';')

In [6]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
x = df.drop(columns='quality')
y = df['quality']

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 20)

In [29]:
knn = KNeighborsClassifier()

In [30]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [31]:
knn.score(x_test, y_test)

0.5175

In [32]:
param = {
    'n_neighbors': [3,5,7,9,12,13,15,17, 21, 23]
}

In [33]:
knn_grid_cv = GridSearchCV(knn, param_grid=param)

In [34]:
knn_grid_cv.fit(x_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7, 9, 12, 13, 15, 17, 21, 23]})

In [35]:
knn_grid_cv.best_params_

{'n_neighbors': 17}

In [39]:
## Checking input data with 17 neighbors

knn_new = KNeighborsClassifier(n_neighbors=17)
knn_new.fit(x_train, y_train)
knn_new.score(x_test, y_test)

0.555

In [40]:
## weight: distance -> weightage of data point inversely proportional
## p -> distance 

In [42]:
## Checking input data with 17 neighbors
## p = 1 -> manhattan_distance 

knn_new = KNeighborsClassifier(n_neighbors=17, p = 1)
knn_new.fit(x_train, y_train)
knn_new.score(x_test, y_test)

0.56

In [46]:
param = {
    'n_neighbors': [3,5,7,9,12,13,15,17, 21, 23],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 15, 20, 25, 30, 35, 40, 45, 50],
    'p': [1, 2],
    'weights': ['uniform', 'distance']
}

In [47]:
knn_grid_cv = GridSearchCV(knn, param_grid=param)
knn_grid_cv.fit(x_train, y_train)
knn_grid_cv.best_params_

{'algorithm': 'auto',
 'leaf_size': 10,
 'n_neighbors': 21,
 'p': 1,
 'weights': 'distance'}

In [48]:
knn_new = KNeighborsClassifier(algorithm = 'auto', leaf_size = 10,
                               n_neighbors=17, p = 1, weights = 'distance')
knn_new.fit(x_train, y_train)
knn_new.score(x_test, y_test)

0.62

# SVM (Support Vector Machine)

## SVC (Support Vector Classifier)

In [49]:
from sklearn.svm import SVC

In [50]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [52]:
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.5

In [58]:
param = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [59]:
svm_grid = GridSearchCV(svc, param_grid=param)
svm_grid.fit(x_train, y_train)
svm_grid.score(x_test, y_test)

0.5925

In [62]:
svm_grid.best_params_

{'kernel': 'linear'}

In [61]:
svm1 = SVC(kernel = 'linear')
svm1.fit(x_train, y_train)
svm1.score(x_test, y_test)

0.5925

In [63]:
svm_grid = GridSearchCV(SVC(), param_grid=param)
svm_grid.fit(x_train, y_train)
svm_grid.score(x_test, y_test)

0.5925

In [64]:
 param = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
     'C': [.1, .4, .6, .8, 1, 10, 50, 100, 200],
     'gamma': [.001, .1, .4, .004, .003]
}

In [None]:
## Huge Computation Time
svm_grid = GridSearchCV(SVC(), param_grid=param, verbose=3)
svm_grid.fit(x_train, y_train)
svm_grid.score(x_test, y_test)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.600 total time=   0.6s
[CV 2/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.525 total time=   0.6s
[CV 3/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.575 total time=   0.4s
[CV 4/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.600 total time=   0.4s
[CV 5/5] END .C=0.1, gamma=0.001, kernel=linear;, score=0.548 total time=   0.7s
[CV 1/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.517 total time=   0.2s
[CV 2/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.500 total time=   0.2s
[CV 3/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.479 total time=   0.2s
[CV 4/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.483 total time=   0.1s
[CV 5/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.510 total time=   0.2s
[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.529 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rb

## SVR (Support Vector Regression)

In [3]:
df = pd.read_csv('Admission_Predict.csv')

In [4]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [8]:
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [10]:
x = df.drop(['Serial No.', 'Chance of Admit '], axis = 1)
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.00,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1
496,337,117,5,5.0,5.0,9.87,1
497,330,120,5,4.5,5.0,9.56,1
498,312,103,4,4.0,5.0,8.43,0


In [11]:
y = df['Chance of Admit ']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x , y, random_state = 30, test_size = .2)

In [13]:
from sklearn.svm import SVR

In [17]:
svr = SVR()
svr.fit(x_train, y_train)

SVR()

In [18]:
df.isna().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [19]:
svr.score(x_test, y_test)

0.6851176591184742

In [20]:
from sklearn.metrics import r2_score

In [22]:
svr.predict(x_test)

array([0.79359894, 0.56840662, 0.70121728, 0.64051776, 0.82877673,
       0.75720511, 0.65919306, 0.61314479, 0.73137208, 0.7450226 ,
       0.83095028, 0.68214796, 0.77285966, 0.88845842, 0.80998244,
       0.8247482 , 0.55105976, 0.5986574 , 0.75757418, 0.7159455 ,
       0.70815123, 0.70214425, 0.7920094 , 0.80912574, 0.7717613 ,
       0.56683869, 0.63246794, 0.64924208, 0.62574847, 0.58584348,
       0.6697305 , 0.53829813, 0.78952432, 0.63965643, 0.66426546,
       0.66289981, 0.65128345, 0.79695115, 0.5501086 , 0.66026974,
       0.57509407, 0.61435263, 0.67883631, 0.66912539, 0.78486554,
       0.65539903, 0.91631603, 0.63191225, 0.62692592, 0.77809513,
       0.88682892, 0.69506213, 0.79551633, 0.69391549, 0.67678932,
       0.88428817, 0.63590511, 0.53807879, 0.62764507, 0.71415275,
       0.78007972, 0.68890559, 0.73340895, 0.64024814, 0.70710604,
       0.66720127, 0.66543166, 0.62400627, 0.86396078, 0.6824248 ,
       0.52723608, 0.871495  , 0.64914782, 0.53984195, 0.64897

In [23]:
r2_score(y_test, svr.predict(x_test))

0.6851176591184742

## Stacking

In [24]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [27]:
data = pd.read_csv('winequality-red.csv', sep = ';')

In [28]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [29]:
x = data.drop(columns = 'quality')
y = data['quality']

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=30, test_size=.15)

In [31]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [32]:
knn.score(x_test, y_test)

0.5083333333333333

In [33]:
svc = SVC()
svc.fit(x_train, y_train)
svc.score(x_test, y_test)

0.5125

In [34]:
## How we split in Stacking

In [35]:
## 50% split
train, val_train, test, val_test = train_test_split(x, y, random_state=30, test_size=.5)
x_train, x_test, y_train, y_test = train_test_split(train, test, random_state=30, test_size=.2)

In [36]:
pred_knn = knn.predict(val_train)
pred_svc = svc.predict(val_train)

In [37]:
pred_knn

array([7, 5, 6, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 6, 7, 6, 5, 6, 5, 5, 5, 6,
       5, 5, 6, 6, 6, 5, 5, 6, 5, 6, 6, 5, 6, 6, 6, 6, 5, 7, 5, 6, 5, 4,
       6, 7, 6, 5, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5,
       5, 6, 5, 4, 5, 5, 5, 5, 6, 6, 7, 5, 5, 6, 6, 7, 6, 5, 5, 5, 5, 6,
       5, 5, 6, 6, 5, 6, 6, 7, 5, 7, 5, 7, 6, 5, 5, 7, 6, 6, 6, 5, 5, 5,
       5, 5, 5, 6, 7, 6, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 5,
       5, 5, 7, 5, 5, 5, 6, 7, 7, 5, 5, 5, 6, 5, 5, 5, 5, 7, 5, 5, 5, 6,
       5, 7, 6, 7, 7, 5, 5, 6, 6, 5, 6, 5, 5, 6, 7, 6, 6, 6, 5, 6, 5, 5,
       5, 5, 6, 6, 5, 6, 6, 5, 5, 6, 5, 7, 6, 6, 5, 5, 6, 5, 7, 5, 6, 5,
       5, 6, 6, 6, 6, 5, 5, 6, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 7, 5,
       6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 6, 6, 5, 6, 6, 5, 5, 6, 5, 6, 5, 6,
       5, 6, 5, 6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, 5, 6, 6,
       4, 5, 6, 6, 6, 6, 5, 5, 6, 6, 5, 6, 4, 6, 5, 6, 5, 5, 5, 6, 5, 6,
       7, 6, 5, 7, 5, 5, 6, 7, 5, 6, 5, 6, 5, 6, 5,

In [38]:
pred_svc

array([6, 6, 6, 5, 6, 5, 6, 5, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 6, 5, 6, 6,
       6, 5, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5,
       5, 6, 5, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6,
       5, 6, 5, 6, 5, 6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 5,
       5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 5, 5,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6,
       6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6,
       5, 5, 6, 6, 5, 6, 6, 5, 5, 6, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6, 6, 5,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5,
       6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6,
       6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5, 6, 6,
       6, 5, 6, 5, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5,
       6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6, 5,

In [40]:
input3 = np.column_stack((pred_knn, pred_svc))
input3

array([[7, 6],
       [5, 6],
       [6, 6],
       ...,
       [6, 5],
       [6, 6],
       [6, 5]], dtype=int64)

In [42]:
output = val_test
output

1147    7
659     4
871     5
1333    5
1411    6
       ..
1073    6
200     7
942     7
1106    6
1329    6
Name: quality, Length: 800, dtype: int64

In [47]:
pd.DataFrame(input3)

Unnamed: 0,0,1
0,7,6
1,5,6
2,6,6
3,5,5
4,5,6
...,...,...
795,5,6
796,5,6
797,6,5
798,6,6


In [48]:
rf = RandomForestClassifier()
rf.fit(input3, output)

RandomForestClassifier()

In [49]:
knn_output = knn.predict(x_test)
svc_output = knn.predict(x_test)

In [52]:
output_stack = np.column_stack((knn_output, svc_output))
output_stack

array([[5, 5],
       [6, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [6, 6],
       [6, 6],
       [6, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [6, 6],
       [6, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [5, 5],
       [7, 7],
       [5, 5],
       [5, 5],
       [5, 5],
       [5, 5],
       [5, 5],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [5, 5],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [6, 6],
       [6, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [6, 6],
       [5, 5],
       [6, 6],
       [6, 6],
       [7, 7],
       [5, 5],
       [5, 5],
       [5, 5],
       [5, 5],
       [6, 6],
       [5, 5],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [6, 6],
       [6, 6],
       [6, 6],
       [5, 5],
       [5, 5],
       [6, 6],
       [5, 5],
       [6, 6],
       [8, 8],
       [5, 5],
       [5,

In [53]:
rf.predict(output_stack)

array([5, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 7,
       5, 5, 5, 5, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6, 6, 6, 6, 5, 6, 6, 5, 6,
       6, 7, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 7, 5,
       5, 5, 6, 6, 6, 6, 5, 5, 6, 6, 7, 5, 5, 5, 6, 7, 5, 5, 5, 6, 5, 6,
       6, 6, 6, 5, 6, 5, 5, 6, 7, 5, 5, 5, 5, 6, 5, 5, 5, 6, 6, 7, 5, 6,
       5, 6, 6, 5, 6, 5, 7, 7, 5, 5, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 7, 7,
       6, 6, 6, 6, 6, 5, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6, 5, 6, 6, 5, 6, 7,
       5, 5, 5, 5, 6, 5], dtype=int64)

In [54]:
rf.score(output_stack, y_test)

0.6375