In [2]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline

We are going to use the wine quality dataset from Kaggle https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009 to classify the wine "quality" into high quality (=1) and low quality (=0) using Support Vector Machine (SVM).
The original dataset was modified by assigning quality scores below 7 as "0" and quality scores of 7 and 8 to "1"

In [4]:
# import the winequality-bin.csv dataset into a dataframe called data
data = pd.read_csv('/content/winequality-bin.csv')

In [5]:
# assign the features to the X and the labels to y
X = data.drop('quality', axis = 1)
y = data['quality']

In [6]:
# split X and y into X_train, X_test, y_train and y_test using a test_size of 30% and a random_state of 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Exercise 1
# training SVM models

In [7]:
# Task 1.1
# what is the size of y_train?
# what is the size of y_test?
# Check the size of y_train and y_test
print("Size of y_train:", y_train.size)
print("Size of y_test:", y_test.size)

Size of y_train: 1119
Size of y_test: 480


In [8]:
# Task 1.2
# create a SVM model called svm_linear_clf using the pipeline pattern shown in O'Reilly textbook page 156
# add a max_iter of 20000 to ensure the training converges
# What value of C did you choose?
"""
The C parameter in SVM is a regularization parameter. It controls the trade-off between
achieving a low training error and a large margin.

* A small C makes the margin wider but may allow more training errors (misclassifications).
* A large C aims to get fewer training errors, but the margin is narrower.

Choosing C=1 is a starting point or default value when building an SVM model.
It provides a reasonable balance between bias and variance.
"""
svm_linear_clf =  Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="linear", C=1, max_iter=20000))
])

In [9]:
# Task 1.3
# train the svm_linear_clf model using the fit function on X_train and y_train
# Train the SVM model
svm_linear_clf.fit(X_train, y_train)

In [10]:
# Task 1.4
# using the predict function, calculate the svm_linear_clf model predictions based on X_test
# hint: use the model.predict(X_test) pattern
y_pred_linear = svm_linear_clf.predict(X_test)

In [11]:
# Task 1.5
# Calculate the accuracy of svm_linear_clf by comparing y_pred to y_test (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
# what is the accuracy?
svm_linear_accuracy = accuracy_score(y_test, y_pred_linear)
print("Accuracy of SVM with linear kernel:", svm_linear_accuracy)

Accuracy of SVM with linear kernel: 0.8604166666666667


In [12]:
# Task 1.6
# create a SVM model called svm_poly_clf using the pipeline pattern shown in O'Reilly textbook page 158
# add a max_iter of 20000 to ensure the training converges
svm_poly_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", max_iter=20000))
])

In [13]:
# Task 1.7
# train the svm_poly_clf model using the fit function on X_train and y_train
svm_poly_clf.fit(X_train, y_train)

In [14]:
# Task 1.8
# using the predict function, calculate the svm_poly_clf model predictions based on X_test
# hint: use the model.predict(X_test) pattern
y_pred_poly  = svm_poly_clf.predict(X_test)

In [15]:
# Task 1.9
# Calculate the accuracy of svm_poly_clf by comparing y_pred to y_test (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
# what is the accuracy?
svm_poly_accuracy = accuracy_score(y_test, y_pred_poly)
print("Accuracy of SVM with polynomial kernel:", svm_poly_accuracy)

Accuracy of SVM with polynomial kernel: 0.8770833333333333


In [16]:
# Task 1.10
# create a SVM model called svm_rbf_clf using the pipeline pattern shown in O'Reilly textbook page 160
# add a max_iter of 20000 to ensure the training converges
svm_rbf_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="rbf", max_iter=20000))
])

In [17]:
# Task 1.11
# train the svm_rbf_clf model using the fit function  on X_train and y_train
svm_rbf_clf.fit(X_train, y_train)

In [18]:
# Task 1.12
# using the predict function, calculate the svm_rbf_clf model predictions based on X_test
# hint: use the model.predict(X_test) pattern
y_pred_rbf = svm_rbf_clf.predict(X_test)

In [19]:
# Task 1.13
# Calculate the accuracy of svm_rbf_clf by comparing y_pred to y_test (see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)
svm_rbf_accuracy = accuracy_score(y_test, y_pred_rbf)
print("Accuracy of SVM with RBF kernel:", svm_rbf_accuracy)

Accuracy of SVM with RBF kernel: 0.88125


In [20]:
# Exercise 2
# let's us grid search to find the best SVM model for this dataset

# task 2.1
# import the required sklearn library for performing a grid search
from sklearn.model_selection import GridSearchCV

In [36]:
# Task 2.2
# # Set the grid search parameters as follows to help find the best parameters for our SVC model
# param = {
#     'kernel':[ 'rbf'],
#     'C': [0.1, 1, 10],
#     'gamma' :[0.001, 0.1, 1]
# }

param2 = {
    'C': [0.5,1,1.3,1.5],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.5,1,1.3,1.5],
}

In [39]:
# Task 2.3
# Setup and run the Grid Search for the svm classifier model using 10-fold cross validation
svc = SVC()
grid_svc = GridSearchCV(svc, param_grid=param2, cv=10)

In [40]:
# Task 2.4
# use fit on X_train and y_train to train the grid_svc model
# This may take a while. If it did, try running it on Google Colab or Kaggle with GPU to make the training go faster
grid_svc.fit(X_train, y_train)

In [41]:
# Task 2.5
# use grid_svc.best_params_ to find the best model, print the best parameters

best_svc_model = grid_svc.best_estimator_
best_params = grid_svc.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'C': 1.3, 'gamma': 1, 'kernel': 'rbf'}


In [42]:
# Task 2.6
# what is the accuracy for the best model? Is it better than the previous two models?
# Hint: use accuracy_score function
# Report your Finding Here with as a comment

y_pred_best = best_svc_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Accuracy of the best SVM model:", accuracy_best)

# Finding: The accuracy of the best SVM model (0.90208) is better than the linear (0.86041) and polynomial (0.87708) SVM models.

Accuracy of the best SVM model: 0.9020833333333333


Please note: If you used param2 instead of param for tasks 2.2-2.6, you will notice that that the GridSearch do not converge or take very long time even if you ran the code of Google Colab.

If GridSearchCV is taking too long to converge or is not converging at all, there are a few steps you can take to address the issue including:

1- Reduce the Search Space: The parameter grid you defined might be too large, leading to a combinatorial explosion of possible combinations. Try reducing the number of parameter values or the range of values to search over.

2- Use a Smaller Dataset: If the dataset is large, training on the entire dataset for each parameter combination can be time-consuming. You can try using a smaller subset of the data for the grid search.

3- Use RandomizedSearchCV:
Instead of grid search, you can use RandomizedSearchCV, which randomly selects parameter values from the specified distributions. This can be faster than grid search and may provide comparable results.

For the sake of this assignment we will use option no 3.

In [43]:
# Task 2.7 : Repeart tasks 2.2 - 2.6 using RandomizedSearch instead of GridSearch to find the best model, reprot the accuracy.
# What do you notice regarding the speed of finding best model using RandomSearch?
"""
Yes, running Grid Search on a larger dataset with a wide range
of parameters can take a very long time or may not converge,
especially without GPU acceleration. Using a smaller dataset or
RandomizedSearchCV are good strategies to mitigate this issue.
"""
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'svm_clf__C': [0.1, 1, 10],
    'svm_clf__gamma': [0.01, 0.1, 1]
}

random_svc = RandomizedSearchCV(svm_rbf_clf, param_distributions, n_iter=5, cv=5)
random_svc.fit(X_train, y_train)

In [44]:
# Finding best parametrs and best model
best_svc_model = random_svc.best_estimator_
best_params = random_svc.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'svm_clf__gamma': 1, 'svm_clf__C': 10}


In [45]:
# computing the accuracy
y_pred_best = best_svc_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Accuracy of the best SVM model:", accuracy_best)

Accuracy of the best SVM model: 0.9041666666666667


In [46]:
# Exercise 3
# Let's run a random forest classifier on the same wine dataset

In [47]:
# Task 3.1
# import the required scikit-learn library for running random forest classification
from sklearn.ensemble import RandomForestClassifier

In [48]:
# Task 3.2
# define a model called rfc as a random forest classifier with 200 estimators
rfc = RandomForestClassifier(n_estimators=200, random_state=42)

In [49]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [50]:
# Task 3.3
# train the rfc random forest classifier on X_train and y_train
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [51]:
# Task 3.4
# calculate the predictions for the X_test values. Call the ouput y_pred
y_pred  = rfc.predict(X_test)

In [52]:
# Task 3.5
# what is the accuracy?
rfc_accuracy = accuracy_score(y_test, y_pred)
print(rfc_accuracy)

0.8770833333333333


In [53]:
# Task 3.6
#Let's see how our model performed by running print(classification_report(y_test, y_pred))
# What is the accuracy of the model?
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       413
           1       0.59      0.39      0.47        67

    accuracy                           0.88       480
   macro avg       0.75      0.67      0.70       480
weighted avg       0.86      0.88      0.87       480



In [None]:
# submit this notebook with all your embedded answers and script outputs as the only deliverable for this assignment