# [Replication, Mix, and Extension]

## **This notebook tutorial is for those who are beginners to machine learning.**

*In this notebook, I first explore the data using matplotlib and seaborn.
Then I use several classifier models to predict the quality of the wine.*

**1. Data Exploratory Analysis**

**2. Principal Component Analysis (PCA)**  

**3. Add feature selection and regularization**

**4. Prediction using various ML methods**


*Then I use cross-validation techniques to tune and evaluate model performance.*

**1. Grid Search CV**

**2. Cross Validation Score**

**3. Model Ensemble**

*Bonus: play with some neural nets*

**1. DL and its hyper-param tuning**

## **If you find this notebook useful then please upvote.**

In [1]:
# import required packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
%matplotlib inline

In [2]:
# List the files in ../input to confirm the dataset file is present before loading it
import os
print(os.listdir("../input"))

In [3]:
#Loading dataset
wine_data = pd.read_csv('../input/winequality-red.csv')

In [4]:
#Let's check how the data is distributed
wine_data.head()

In [5]:
#Information about the data columns
wine_data.info()

In [7]:
wine_data.describe()

In [8]:
wine_data.corr()

In [9]:
sorted(wine_data['quality'].unique())

## **Let's do some plotting to know how the data columns are distributed in the dataset**

In [10]:
# Running list of candidate feature names; filled in by hand after inspecting each bar plot below
cand_feats = []

In [11]:
# Fixed acidity shows no clear trend across quality levels, so it is not added to cand_feats.
# In these plots the x-axis is the target (quality score) and the y-axis is the feature value.
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'fixed acidity', data = wine_data)

In [12]:
# Volatile acidity trends clearly downward as quality increases:
# a lower volatile acidity probably means a higher quality.
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'volatile acidity', data = wine_data)

In [13]:
cand_feats.append('volatile acidity')

In [14]:
# Citric acid content increases with the quality of the wine
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'citric acid', data = wine_data)

In [15]:
cand_feats.append('citric acid')

In [16]:
# Residual sugar by quality score (this feature is not added to cand_feats)
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'residual sugar', data = wine_data)

In [17]:
# Chloride content decreases as the quality of the wine increases
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'chlorides', data = wine_data)

In [18]:
cand_feats.append('chlorides')

In [19]:
# Free sulfur dioxide by quality score (this feature is not added to cand_feats)
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'free sulfur dioxide', data = wine_data)

In [20]:
# Total sulfur dioxide by quality score (this feature is not added to cand_feats)
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'total sulfur dioxide', data = wine_data)

In [21]:
# Sulphate level increases with the quality of the wine
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'sulphates', data = wine_data)

In [22]:
cand_feats.append('sulphates')

In [23]:
# Alcohol level also increases as the quality of the wine increases
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'alcohol', data = wine_data)

In [24]:
cand_feats.append('alcohol')

In [26]:
# Density by quality score (this feature is not added to cand_feats)
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'density', data = wine_data)

In [28]:
# pH by quality score (pH is kept as a candidate feature in the following cell)
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'pH', data = wine_data)

In [29]:
cand_feats.append('pH')

## Preprocessing Data for performing Machine learning algorithms

In [31]:
# see what are the current candidate features; first stage, manually feature selection through barplot
cand_feats

In the second stage of feature selection, try both the PCA and backward-stepwise selection.

In [32]:
# Binarize quality into 'bad' / 'good', with the cut point chosen to make the two classes more balanced.
# pd.cut intervals are left-open / right-closed: (2, 5.5] -> 'bad', (5.5, 8] -> 'good'
bins = (2, 5.5, 8)
group_names = ['bad', 'good']
wine_data['quality_bi'] = pd.cut(wine_data['quality'], bins = bins, labels = group_names)

In [33]:
# Fraction of wines that fall in the 'good' class after binarization
wine_data.loc[wine_data['quality_bi'] == 'good'].shape[0] / wine_data.shape[0]

In [34]:
#Now lets assign a label to our quality variable
label_quality = LabelEncoder()

In [35]:
# LabelEncoder assigns integer codes in sorted label order, so 'bad' becomes 0 and 'good' becomes 1.
# Note: this overwrites the categorical quality_bi column with its integer codes.
wine_data['quality_bi'] = label_quality.fit_transform(wine_data['quality_bi'])

In [36]:
wine_data['quality_bi'].value_counts()

In [37]:
# Class balance of the binary target (0 = bad, 1 = good).
# The column is passed by keyword: recent seaborn releases interpret the first positional
# argument as `data`, and the keyword form matches the barplot calls used earlier.
sns.countplot(x = 'quality_bi', data = wine_data)

In [39]:
# Pairwise scatter plots (KDE on the diagonal) of every input feature, coloured by the binary target.
# - The feature list is built by name instead of by position (columns[:-2]), so it stays
#   correct if further derived columns are appended to wine_data.
# - `size` was renamed to `height` in seaborn 0.9 and is rejected by current releases.
sns.pairplot(wine_data, vars=wine_data.columns.drop(['quality', 'quality_bi']).tolist(), kind="scatter", diag_kind="kde", hue = "quality_bi", height = 6)

In [42]:
# Compare the class counts of the original quality scores with those of the binarized target
from collections import Counter
print(Counter(wine_data['quality']), '\n', Counter(wine_data['quality_bi']))

## train test set split

In [73]:
# Separate the dataset into the feature variables (the hand-picked candidate columns)
# and the response variable (the binary quality label)
X = wine_data[cand_feats]
y = wine_data['quality_bi']

In [74]:
#Train and Test splitting of data 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 413)

In [75]:
#Applying Standard scaling to get optimized result, only use the mean and std from trainset
sc = StandardScaler()

In [76]:
sc.fit(X_train)
print(sc.mean_, '\n', sc.var_)

In [77]:
# Scale both splits with the mean/variance learned from the training set only.
# The original row index of each split is kept so the scaled frames stay aligned with
# y_train / y_test in any index-based pandas operation; without it the frames get a fresh
# 0..n-1 index while the targets keep the shuffled one.
X_train = pd.DataFrame(sc.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)

Now, given `cand_feats` (chosen from the bar plots above), we come to the second stage of feature selection: 1. recursive feature elimination (a backward stepwise selection), and 2. PCA.

In [78]:
# Recursive feature elimination with a class-balanced logistic regression as the ranking model:
# one feature is removed per step (step=1) until 4 features remain.
from sklearn.feature_selection import RFE
RFE_estimator = LogisticRegression(penalty='l2',fit_intercept=True, intercept_scaling=1, class_weight='balanced',solver='lbfgs', max_iter=500,verbose=0)
selector = RFE(RFE_estimator, n_features_to_select=4, step=1)
selector = selector.fit(X_train, y_train)

In [79]:
selector.support_

In [84]:
# Names of the features kept by RFE (support_ is a boolean mask over X's columns)
cand_feats2 = X.columns[selector.support_].tolist()
cand_feats2

## PCA Procedure

In [85]:
# Fit a PCA with all components on the scaled training features,
# only to inspect how much variance each component explains
from sklearn.decomposition import PCA
pca = PCA()
X_train_pca = pca.fit_transform(X_train)

In [88]:
# Cumulative explained variance against the number of principal components kept.
# The x values start at 1 so the axis reads directly as "number of components";
# plotting the bare array would place the first component at x = 0.
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(6,6))
plt.plot(np.arange(1, len(cum_var) + 1), cum_var, 'ro-')
plt.xlabel('number of principal components')
plt.ylabel('cumulative explained variance ratio')
plt.grid()

In [90]:
# Keep the first 4 principal components for prediction.
# NOTE(review): the original note said these components account for about 90% of the
# variation in the data — confirm against the cumulative-variance plot above.
pca_new = PCA(n_components=4)
pca_new.fit(X_train)

In [91]:
# Project both splits onto the 4 components fitted on the training split only
X_train_new = pca_new.transform(X_train)
X_test_new = pca_new.transform(X_test)

In [93]:
# Wrap the projected training data in a DataFrame with named component columns.
# NOTE(review): the models below are fit on X_train[cand_feats2]; these PCA features are
# not used anywhere in the visible notebook — confirm whether that is intended.
X_train_new = pd.DataFrame(X_train_new, columns = ['pc1', 'pc2', 'pc3', 'pc4'])

In [94]:
X_test_new = pd.DataFrame(X_test_new, columns = ['pc1', 'pc2', 'pc3', 'pc4'])

## Our training and testing data are now ready for the machine learning algorithms

## Decision Tree

In [121]:
# Baseline decision tree on the RFE-selected features.
# random_state is fixed (same seed as the train/test split) so the fitted tree and the
# scores printed below are reproducible from one run to the next.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = 413)
dt.fit(X_train[cand_feats2], y_train)
pred_dt = dt.predict(X_test[cand_feats2])

In [122]:
#Let's see how our model performed
print(classification_report(y_test, pred_dt))
print(accuracy_score(y_test, pred_dt))

### Random Forest Classifier

In [123]:
# Random forest of 200 trees on the RFE-selected features.
# random_state is fixed (same seed as the train/test split) so the reported scores are
# reproducible; without it every run bootstraps and splits differently.
rfc = RandomForestClassifier(n_estimators=200, random_state=413)
rfc.fit(X_train[cand_feats2], y_train)
pred_rfc = rfc.predict(X_test[cand_feats2])

In [124]:
#Let's see how our model performed
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))

#### Random forest gives the F1 of 80%

In [125]:
#Confusion matrix for the random forest classification
print(confusion_matrix(y_test, pred_rfc))



## Stochastic Gradient Descent Classifier

In [126]:
# Linear classifier trained with stochastic gradient descent and no regularization penalty.
# SGD shuffles the training data each epoch, so random_state is fixed (same seed as the
# train/test split) to make the reported scores reproducible.
sgd = SGDClassifier(penalty=None, random_state=413)
sgd.fit(X_train[cand_feats2], y_train)
pred_sgd = sgd.predict(X_test[cand_feats2])

In [127]:
print(classification_report(y_test, pred_sgd))

#### 66% F1 using stochastic gradient descent classifier

In [128]:
print(confusion_matrix(y_test, pred_sgd))

## Support Vector Classifier

In [129]:
# Support vector classifier with default hyper-parameters on the RFE-selected features
# (the same estimator object is tuned later with GridSearchCV)
svc = SVC()
svc.fit(X_train[cand_feats2], y_train)
pred_svc = svc.predict(X_test[cand_feats2])

In [130]:
print(classification_report(y_test, pred_svc))
print(accuracy_score(y_test, pred_svc))

#### Support vector classifier gets 73% F1

## Logistic Reg

In [131]:
# Logistic regression with default settings on the RFE-selected features
lr = LogisticRegression()
lr.fit(X_train[cand_feats2], y_train)
pred_lr = lr.predict(X_test[cand_feats2])

In [132]:
print(classification_report(y_test, pred_lr))

## NB

In [133]:
# Gaussian naive Bayes on the RFE-selected features
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train[cand_feats2], y_train)
pred_nb = nb.predict(X_test[cand_feats2])

In [134]:
print(classification_report(y_test, pred_nb))
print(accuracy_score(y_test, pred_nb))

In [135]:
# 5-fold cross-validation of an XGBoost classifier on the training split;
# prints the per-fold scores and their mean
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state =1)
cv = cross_val_score(xgb, X_train[cand_feats2], y_train, cv=5)
print(cv)
print(cv.mean())

In [138]:
# A voting classifier combines the predictions of several models.
# - "hard" voting: each classifier casts one vote for a class and the majority wins,
#   so an odd number of classifiers is generally preferred.
# - "soft" voting: the predicted class probabilities are averaged; if the average
#   confidence for class 1 is above 50%, the sample is labelled 1.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators = [('lr',lr),('rfc',rfc),('nb',nb),('xgb',xgb)], voting='soft')

In [139]:
# 5-fold cross-validation of the soft-voting ensemble on the training split
# (this rebinds `cv`, replacing the XGBoost scores stored under the same name)
cv = cross_val_score(voting_clf, X_train[cand_feats2], y_train, cv=5)
print(cv)
print(cv.mean())

## Let's try to increase our accuracy of models
## Grid Search CV

In [146]:
# Hyper-parameter grid for the SVC model, scored by accuracy with 3-fold CV.
# gamma has no effect on the linear kernel, so the grid is split per kernel: a single
# crossed grid would refit the linear model once per gamma value with identical results.
# Consequence: best_params_ contains 'gamma' only when the rbf kernel wins.
C_values = [0.1, 0.8, 0.9, 1.1, 1.3, 1.4]
param = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': [0.1, 0.8, 1, 1.1, 1.3, 1.4]},
]
grid_svc = GridSearchCV(svc, param_grid=param, refit=True, scoring='accuracy', cv=3)

In [147]:
grid_svc.fit(X_train[cand_feats2], y_train)

In [148]:
#Best parameters for our svc model
grid_svc.best_params_

In [150]:
# Evaluate the tuned SVC on the test split.
# grid_svc was created with refit=True, so grid_svc.predict uses the best parameter
# combination refit on the whole training split; no separate SVC has to be built by hand.
pred_svc2 = grid_svc.predict(X_test[cand_feats2])
print(classification_report(y_test, pred_svc2))

#### Support vector classifier gets 75% F1

# DL and its hyper-param tuning

In [151]:
class Eval:
    """Compare predicted labels with gold (true) labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels; singleton axes are removed, e.g. shape (n, 1) becomes (n,).
    gold : array-like
        True labels, handled the same way as ``pred``.
    """

    def __init__(self, pred, gold):
        # squeeze drops singleton axes; atleast_1d keeps a single sample as a length-1
        # vector instead of a 0-d scalar, which has no len() and broke Accuracy().
        self.pred = np.atleast_1d(np.squeeze(pred))
        self.gold = np.atleast_1d(np.squeeze(gold))

    def Accuracy(self):
        """Return the fraction of predictions equal to the gold labels.

        Returns NaN when there are no labels.

        Raises
        ------
        ValueError
            If ``pred`` and ``gold`` do not have the same shape after squeezing; without
            this check numpy would silently broadcast a length-1 array against the other.
        """
        if self.pred.shape != self.gold.shape:
            raise ValueError(
                "pred and gold must have the same shape, got {} and {}".format(
                    self.pred.shape, self.gold.shape))
        if len(self.gold) == 0:
            return float('nan')
        return np.sum(np.equal(self.pred, self.gold)) / float(len(self.gold))


In [153]:
!pip install -U skorch

In [163]:
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

In [164]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device('cuda', 0) if torch.cuda.is_available() else torch.device('cpu')

In [165]:
class ClassifierModule(nn.Module):
    """Feed-forward binary classifier with two hidden layers and a softmax output.

    Parameters
    ----------
    num_units : int
        Width of the first hidden layer.
    nonlin : callable
        Activation applied after the first hidden layer.
    dropout : float
        Dropout probability applied after the first activation.
    num_inputs : int
        Number of input features. Defaults to 20, the width that used to be
        hard-coded; pass the column count of the training matrix when it differs.
    """

    def __init__(
            self,
            num_units=10,
            nonlin=F.relu,
            dropout=0.5,
            num_inputs=20,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.num_inputs = num_inputs
        self.nonlin = nonlin

        self.dense0 = nn.Linear(num_inputs, num_units)
        # `dropout` is stored only as the layer; the raw probability is available as self.dropout.p
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 2)

    def forward(self, X, **kwargs):
        """Return class probabilities of shape (..., 2) for inputs of shape (..., num_inputs)."""
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = F.relu(self.dense1(X))
        # the last layers are a linear output followed by a softmax over the two classes
        X = F.softmax(self.output(X), dim=-1)
        return X

In [166]:
# skorch wraps the torch module in a scikit-learn compatible estimator; the module class
# itself (not an instance) is passed in and skorch instantiates it.
# NeuralNetClassifier was used without being imported anywhere, so it is imported here,
# after the cell that installs skorch.
from skorch import NeuralNetClassifier

net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    lr=0.1,
    device=device,
)

In [167]:
# skorch treats a pandas DataFrame as a dict of columns, which does not match
# forward(X); pass a float32 feature matrix and int64 class labels instead.
# NOTE(review): the matrix has X.shape[1] columns, which must equal the input width of
# ClassifierModule's first layer — confirm before running.
net.fit(X.values.astype(np.float32), y.values.astype(np.int64))

In [168]:
# Sanity-check predictions on the first 10 rows; the slice is converted to a float32
# array because skorch would treat a DataFrame as a dict of columns.
y_pred = net.predict(X[:10].values.astype(np.float32))
y_proba = net.predict_proba(X[:10].values.astype(np.float32))

In [169]:
# Fresh, unfitted estimator used as the base of the hyper-parameter search:
# silent training (verbose = 0) and no internal validation split (train_split = False),
# since RandomizedSearchCV does its own cross-validation.
# optimizer__momentum is forwarded to the optimizer's constructor.
# NOTE(review): unlike the first net, no device is passed here — confirm that is intended.
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs = 20,
    lr = 0.1,
    optimizer__momentum = 0.9,
    verbose = 0,
    train_split = False,
)

In [170]:
# Search space for the randomized search. Keys with a double underscore are routed to a
# sub-component: module__* to ClassifierModule's constructor, optimizer__* to the optimizer.
params = {
    'lr': [0.001, 0.01, 0.1],
    'max_epochs': [10, 20, 30],
    'module__num_units': [10, 20, 40],
    'module__dropout': [0, 0.5],
    'optimizer__nesterov': [False, True],
}

In [171]:
# Randomized search: 20 sampled configurations, 3-fold CV, scored by F1.
# refit=False means no final model is retrained on all data, so only the
# best score / parameters are available afterwards, not a best estimator.
rs = RandomizedSearchCV(net, params, refit=False, cv=3, scoring='f1', verbose=2, n_iter = 20)

In [172]:
# Run the search on a float32 feature matrix and int64 labels: skorch would treat a
# pandas DataFrame as a dict of columns, which does not match the module's forward(X).
rs.fit(X.values.astype(np.float32), y.values.astype(np.int64))

In [173]:
print(rs.best_score_, rs.best_params_)

In [174]:
# Create the data loader: shuffled mini-batches of 64 (features, labels) pairs.
# NOTE(review): train_X / train_Y are not defined in the visible notebook (the split above
# produced X_train / y_train) — confirm which arrays are meant here.

training_set = torch.utils.data.TensorDataset(torch.Tensor(train_X), torch.Tensor(train_Y))
training_loader = torch.utils.data.DataLoader(training_set, batch_size=64, shuffle=True)

In [175]:
def train(model, device, training_loader, optimizer, epoch):
    """Run one training epoch and record the mean loss per batch.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning one score per class for each input row.
    device : torch.device or None
        Device each batch is moved to before the forward pass (skipped when None).
    training_loader : iterable
        Yields (inputs, targets) batches; targets hold integer class ids.
    optimizer : torch.optim.Optimizer
        Optimizer holding the model's parameters.
    epoch : int
        Index of the current epoch (used for logging only), not the total number of epochs.

    Notes
    -----
    Uses the notebook-level ``criterion`` as loss function and appends the epoch's
    mean loss per batch to the notebook-level list ``train_loss_hist``.
    """
    model.train()  # training mode: layers such as dropout behave stochastically
    total_loss = 0
    for inputs, targets in training_loader:
        if device is not None:
            inputs = inputs.to(device)
            targets = targets.to(device)

        # 1. Forward
        outputs = model(inputs)

        # One-hot encode the class ids with a fixed width taken from the model output.
        # Fitting an encoder on each batch would yield a single column whenever a batch
        # happens to contain only one class, which no longer matches `outputs`.
        targets = torch.nn.functional.one_hot(
            targets.reshape(-1).long(), num_classes=outputs.shape[-1]
        ).to(outputs.dtype)

        # 2. Loss calculation; both tensors share dtype and shape
        loss = criterion(outputs, targets)

        # 3. Zero the parameter gradients
        optimizer.zero_grad()

        # 4. Compute gradients
        loss.backward()

        # 5. One optimization step
        optimizer.step()

        total_loss += loss.item()

    mean_loss = total_loss / len(training_loader)
    print("Train Epoch: {}, Loss per batch: {}".format(epoch, round(mean_loss, 4)))
    train_loss_hist.append(mean_loss)

In [176]:
def test(model, device, testing_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for idx, data in enumerate(testing_loader, 0):
            inputs, S_mat, C0 = data;
    
            # Forward
            outputs = model(inputs)
            
            # Loss
            loss = criterion(outputs * (S_mat[:,1:] - S_mat[:,0:1]) + C0, 
                             torch.max(S_mat[:,1:] - K, torch.zeros(batch_size, 1)))
            
            total_loss += loss.item() 

    print("Test loss per batch: {}".format(round(total_loss/len(testing_loader),4)))
    val_loss_hist.append(total_loss/len(testing_loader))

In [177]:
# Build the network, the Adam optimizer and the MSE loss, then train for 10 epochs;
# train() appends each epoch's mean loss per batch to train_loss_hist.
# NOTE(review): FFNN, num_features, num_hidden and num_classes are not defined in the
# visible notebook — confirm they come from a cell that is missing here.
model = FFNN(num_features, num_hidden, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
train_loss_hist = []
for epoch in range(10):
    train(model, device, training_loader, optimizer, epoch)

In [178]:
# Training-loss curve: mean loss per batch for each epoch.
# Only the training curve is drawn, so the legend gets exactly one label; passing two
# labels for a single handle leaves the legend inconsistent with the plot.
# (Add a second handle/label here once a validation history is recorded.)
plt.figure()
l1, = plt.plot(train_loss_hist)
plt.xlabel('epoch')
plt.ylabel('mean loss per batch')
plt.legend(handles=[l1], labels = ['train'], loc='best')

In [179]:
# eval() only switches layers such as dropout to inference behaviour; it does not stop
# gradient tracking, so the forward pass is wrapped in no_grad().
# The input is moved to the model's device and the result is brought back to the CPU so
# it can be converted to numpy afterwards.
model.eval()
with torch.no_grad():
    nn_predictions = model(torch.Tensor(train_X).to(device)).cpu()

In [180]:
# Predicted class = index of the largest output per row, compared against train_Y.
# The predictions were computed on train_X, so this measures accuracy on the data the
# model was trained on, not on a held-out set.
nn_eval = Eval(torch.argmax(nn_predictions, dim=1).detach().numpy(), train_Y)

In [181]:
nn_eval.Accuracy()