In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from pandas_profiling import ProfileReport
%matplotlib inline

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv")

In [3]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
x = data.drop(columns = "quality")

In [5]:
y = data.quality

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=20)

In [7]:
knn = KNeighborsClassifier()

In [8]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [9]:
knn.score(x_train, y_train)

0.6513761467889908

In [10]:
knn.score(x_test,y_test)

0.5175

In [11]:
parameter_list = {
    "n_neighbors" : [3,5,7,9,11,13,15,17,19,21,23,25,27,29],
    "weights" : ['uniform', 'distance'],
    "p" : [1,2],
    "algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    "leaf_size" : [12,24,30,45,55]
}

In [12]:
grid_cv = GridSearchCV(knn, param_grid=parameter_list)

In [13]:
grid_cv.fit(x_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [12, 24, 30, 45, 55],
                         'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
                                         25, 27, 29],
                         'p': [1, 2], 'weights': ['uniform', 'distance']})

In [14]:
grid_cv.best_params_

{'algorithm': 'auto',
 'leaf_size': 12,
 'n_neighbors': 27,
 'p': 1,
 'weights': 'distance'}

In [15]:
knn_1 = KNeighborsClassifier(n_neighbors = 27, p = 1, weights = "distance", leaf_size = 12, algorithm = "auto")

In [16]:
knn_1.fit(x_train, y_train)

KNeighborsClassifier(leaf_size=12, n_neighbors=27, p=1, weights='distance')

In [17]:
knn_1.score(x_train, y_train)

1.0

In [18]:
knn_1.score(x_test, y_test)

0.64

In [20]:
 # Save the model
import pickle
pickle.dump(knn_1, open("knn_1.pkl", "wb"))

# SVM => (Support Vector Machine)

# SVC => (Support Vector Classifier)

In [19]:
from sklearn.svm import SVC

In [20]:
svc = SVC()

In [21]:
svc.fit(x_train, y_train)

SVC()

In [22]:
svc.score(x_train, y_train)

0.5095913261050876

In [23]:
svc.score(x_test, y_test)

0.5

In [24]:
parameter_list = {
    "kernel" : ['linear', 'poly', 'rbf', 'sigmoid']
}
svc_grid = GridSearchCV(svc, param_grid=parameter_list)
svc_grid.fit(x_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [25]:
svc_grid.best_params_

{'kernel': 'linear'}

In [26]:
svc_1 = SVC(kernel = "linear")
svc_1.fit(x_train, y_train)

SVC(kernel='linear')

In [27]:
svc_1.score(x_train, y_train)

0.5829858215179317

In [28]:
svc_1.score(x_test, y_test)

0.5925

# SVR => (Support Vector Regressor)

In [51]:
data1 = pd.read_csv(r"C:\Users\ts-sachin.bhumihar\Downloads\Admission_Prediction.csv")

In [52]:
data1

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.00,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.80
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332.0,108.0,5.0,4.5,4.0,9.02,1,0.87
496,497,337.0,117.0,5.0,5.0,5.0,9.87,1,0.96
497,498,330.0,120.0,5.0,4.5,5.0,9.56,1,0.93
498,499,312.0,103.0,4.0,4.0,5.0,8.43,0,0.73


In [53]:
data1.isnull().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [55]:
data1['GRE Score'] = data1['GRE Score'].fillna(data1['GRE Score'].mean())

In [56]:
data1['TOEFL Score'] = data1['TOEFL Score'].fillna(data1['TOEFL Score'].mean())

In [57]:
data1['University Rating'] = data1['University Rating'].fillna(data1['University Rating'].mean())

In [58]:
data1.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [59]:
x = data1.drop(['Serial No.', 'Chance of Admit'], axis = 1)

In [60]:
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.000000,118.0,4.0,4.5,4.5,9.65,1
1,324.000000,107.0,4.0,4.0,4.5,8.87,1
2,316.558763,104.0,3.0,3.0,3.5,8.00,1
3,322.000000,110.0,3.0,3.5,2.5,8.67,1
4,314.000000,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1
496,337.000000,117.0,5.0,5.0,5.0,9.87,1
497,330.000000,120.0,5.0,4.5,5.0,9.56,1
498,312.000000,103.0,4.0,4.0,5.0,8.43,0


In [61]:
y = df['Chance of Admit']

In [62]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

In [63]:
from sklearn.svm import SVR

In [64]:
svr = SVR()

In [65]:
svr.fit(x_train, y_train)

TypeError: float() argument must be a string or a number, not 'method'

In [66]:
from sklearn.metrics import r2_score

In [67]:
r2_score(y_test, svr.predict(x_test))

NotFittedError: This SVR instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Stacking

In [68]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [69]:
record = pd.read_csv("https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv")

In [70]:
record

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [71]:
x = record.drop(columns = 'quality')
y = record.quality

In [77]:
train, val_train, test, val_test = train_test_split(x,y, test_size=0.50, random_state=30)

In [78]:
x_train, x_test, y_train, y_test = train_test_split(train,test,random_state=30, test_size=0.15)

In [79]:
knn_stacking = KNeighborsClassifier()
knn_stacking.fit(x_train, y_train)

KNeighborsClassifier()

In [80]:
knn_stacking.score(x_test, y_test)

0.5833333333333334

In [81]:
svc_stacking = SVC()
svc_stacking.fit(x_train, y_train)

SVC()

In [82]:
svc_stacking.score(x_test, y_test)

0.5083333333333333

In [83]:
prediction_knn = knn_stacking.predict(val_train)
prediction_svc = svc_stacking.predict(val_train)

In [84]:
prediction_knn

array([7, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 6, 5, 5, 6,
       5, 5, 6, 6, 6, 5, 5, 6, 5, 6, 6, 5, 6, 6, 6, 6, 5, 7, 6, 6, 5, 5,
       6, 7, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 6, 6, 5, 5, 6, 5, 5, 6, 5, 5,
       6, 6, 5, 4, 5, 5, 5, 6, 6, 6, 7, 5, 5, 6, 6, 6, 7, 6, 5, 5, 5, 6,
       5, 5, 5, 7, 6, 6, 6, 7, 5, 6, 5, 5, 5, 6, 5, 7, 6, 5, 6, 5, 7, 5,
       5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 7, 6, 5, 5, 6, 5, 5, 6, 6, 5,
       5, 6, 7, 5, 6, 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 6, 7, 5, 5, 5, 5,
       6, 5, 6, 5, 5, 5, 5, 6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 5, 6, 5, 6, 5,
       6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6, 5, 6, 5, 6, 5,
       5, 6, 6, 5, 6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5, 4, 5, 6, 7, 5,
       6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 6, 5, 6,
       5, 6, 5, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 6, 6, 5, 6, 6,
       5, 5, 6, 6, 6, 6, 5, 6, 6, 5, 5, 6, 5, 6, 5, 6, 5, 6, 6, 5, 6, 6,
       7, 5, 6, 7, 5, 4, 6, 7, 5, 5, 6, 5, 5, 6, 5,

In [85]:
prediction_svc

array([6, 6, 6, 6, 6, 5, 6, 5, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 6, 5, 6, 6,
       6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 5, 6, 5, 6, 6, 5, 5,
       6, 6, 5, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6,
       5, 6, 5, 6, 5, 6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 5,
       5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 5, 5,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6,
       6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6,
       5, 5, 6, 6, 5, 6, 6, 5, 5, 6, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6, 6, 5,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5,
       6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6,
       6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6,
       6, 5, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6,
       6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6, 5,

In [87]:
input3 = np.column_stack((prediction_knn, prediction_svc))

In [91]:
pd.DataFrame(input3)

Unnamed: 0,0,1
0,7,6
1,5,6
2,6,6
3,5,6
4,5,6
...,...,...
795,5,6
796,5,6
797,6,6
798,6,6


In [89]:
output = val_test

In [90]:
output

1147    7
659     4
871     5
1333    5
1411    6
       ..
1073    6
200     7
942     7
1106    6
1329    6
Name: quality, Length: 800, dtype: int64

In [92]:
rf_stacking = RandomForestClassifier()

In [93]:
rf_stacking.fit(input3, output)

RandomForestClassifier()

In [94]:
knn_output = knn_stacking.predict(x_test)
svc_output = svc_stacking.predict(x_test)

In [95]:
output_stack1 = np.column_stack((knn_output, svc_output))

In [96]:
rf_stacking.predict(output_stack1)

array([5, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
       6, 5, 5, 6, 7, 6, 5, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6,
       6, 7, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 7, 6, 6,
       6, 5, 6, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 6, 6, 5, 6, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 5, 5, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6,
       5, 6, 6, 5, 6, 6, 6, 7, 6, 6], dtype=int64)

In [97]:
rf_stacking.score(output_stack1, y_test)

0.5083333333333333