In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc, cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from drawdata import draw_line, draw_scatter, draw_histogram

Using the following cell, you can draw a scatter plot of data points. Try creating data points where the decision boundary is not linear.

In [None]:
# Show the drawdata scatter canvas so that data points can be drawn by hand.
draw_scatter()

In [None]:
# Fallback data source: if the drawing canvas does not show up or raises an
# error, read the prepared points from the CSV file on disk instead.
df = pd.read_csv(filepath_or_buffer='data/polynomial_data.csv')

We can see that the data is not linearly separable.

In [None]:
# You can experiment with the following too
# draw_line(), draw_histogram()

In [None]:
# From the data above, press "copy csv" and then run the following code to get the data into a pandas dataframe.
# df = pd.read_clipboard(sep=',')

In [None]:
# Display the loaded DataFrame (feature columns plus the 'z' label column).
df

In [None]:
# Class balance: number of points carrying each label in the 'z' column.
df.loc[:, 'z'].value_counts()

In [None]:
# Separate the target column 'z' from the remaining (feature) columns.
y = df['z']
X = df.drop(columns='z')

In [None]:
# Encode the class labels found in 'z' as consecutive integers.
# The encoder is fitted on the raw column rather than on `y` itself, so
# re-running this cell never re-encodes already-encoded labels and
# `label_encoder.classes_` always holds the original label values.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['z'])

In [None]:
# Convert the feature columns to a NumPy array so later cells can index them
# positionally (X[:, 0], X[:, 1]).
# The array is built from `df` instead of `X = X.values`: an ndarray has no
# `.values` attribute, so the self-referential form fails with AttributeError
# when the cell is run a second time.
X = df.drop('z', axis=1).to_numpy()

In [None]:
# Plot every point in feature space, coloured by its encoded class label.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y)
ax.set(xlabel='X1', ylabel='X2', title='Scatter plot of X1 vs X2')
plt.show()

In [None]:
# Hold out 20% of the points as a test set. The split is shuffled with a fixed
# seed and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=21,
)

In [None]:
# Baseline pipeline: quantile-transform the raw features, then fit a logistic
# regression directly on them (no polynomial expansion yet).
baseline_steps = [
    ('scaler', QuantileTransformer(n_quantiles=100)),
    ('model', LogisticRegression(solver='liblinear')),
]
model_pipeline = Pipeline(steps=baseline_steps)

In [None]:
# Hyper-parameter grid for the baseline pipeline. The `model__` prefix routes
# each entry to the pipeline step named 'model'.
params = dict(
    model__C=[0.01, 0.1, 1, 10, 100],
    model__penalty=['l1', 'l2'],
)

In [None]:
# Plot only the held-out test points, coloured by their true class.
fig, ax = plt.subplots()
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
ax.set(xlabel='X1', ylabel='X2', title='Scatter plot of X1 vs X2')
plt.show()

In [None]:
# 5-fold cross-validated grid search over `params`, scored by accuracy.
grid = GridSearchCV(
    estimator=model_pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train, y_train)

In [None]:
# Take the best pipeline found by the grid search ...
best_model = grid.best_estimator_

# ... and predict the classes of the held-out test points with it.
y_pred = best_model.predict(X_test)

In [None]:

# Test-set metrics for the baseline model.
# The confusion matrix and the classification report are multi-line strings,
# so each one starts on its own line to keep its rows and columns aligned.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Cohen's Kappa: {cohen_kappa_score(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

In [None]:
# Compare predicted and true classes on the test set.
# Both axes are the input features (X1, X2); the class is encoded by colour,
# so predictions and ground truth are drawn side by side on shared axes.
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
ax_pred.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
ax_pred.set(title='Predicted', xlabel='X1', ylabel='X2')
ax_true.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
ax_true.set(title='Actual', xlabel='X1', ylabel='X2')
fig.suptitle('Predicted vs Actual (test set)')
plt.show()

In [None]:
# Hyper-parameter grid for the polynomial-feature pipeline. Keys follow the
# `<step>__<param>` form and address the 'poly', 'scaler' and 'model' steps.
# Penalty and solver are not searched here.
poly_params = dict(
    poly__degree=[2, 3, 4, 5, 10],
    scaler__n_quantiles=[100, 1000],
    model__C=[0.01, 0.1, 1, 10, 100, 250, 500, 1000],
)

In [None]:
# Same idea as the baseline, but the features are first expanded into
# polynomial terms before being scaled and passed to logistic regression.
poly_steps = [
    ('poly', PolynomialFeatures()),
    ('scaler', QuantileTransformer()),
    ('model', LogisticRegression(max_iter=2000)),
]
model_pipeline = Pipeline(steps=poly_steps)

In [None]:
# Exhaustive 5-fold grid search over `poly_params`, scored by accuracy and
# run on all available cores with progress logging.
grid = GridSearchCV(
    estimator=model_pipeline,
    param_grid=poly_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

# Keep the best pipeline found by the search.
best_model = grid.best_estimator_

In [None]:
# Display the selected pipeline and its hyper-parameters.
best_model

In [None]:
# Predict the held-out test points with the current best pipeline.
y_pred = best_model.predict(X_test)

In [None]:
# Test-set metrics for the polynomial-feature model.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

# Compare predicted and true classes on the test set.
# Both axes are the input features (X1, X2); the class is encoded by colour,
# so predictions and ground truth are drawn side by side on shared axes.
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
ax_pred.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
ax_pred.set(title='Predicted', xlabel='X1', ylabel='X2')
ax_true.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
ax_true.set(title='Actual', xlabel='X1', ylabel='X2')
fig.suptitle('Predicted vs Actual (test set)')
plt.show()

In [None]:
# Predict on the entire dataset (train and test points together) for a visual
# check of the learned decision regions.
# The result is stored under its own name so the test-set `y_pred` is not
# overwritten; otherwise re-running a metrics cell afterwards would compare
# `y_test` against an array of a different length.
y_pred_all = best_model.predict(X)

fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(15, 5))
ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred_all)
ax_pred.set(title='Predicted', xlabel='X1', ylabel='X2')
ax_true.scatter(X[:, 0], X[:, 1], c=y)
ax_true.set(title='Actual', xlabel='X1', ylabel='X2')
plt.show()

---

Now let's compare it to SVC with a linear kernel and a polynomial kernel.

In [None]:
# Linear-kernel SVC on quantile-transformed features.
svc_linear_pipeline = Pipeline([
    ('scaler', QuantileTransformer()),
    ('model', SVC(kernel='linear'))
])

# Only C is searched. `gamma` is a kernel coefficient for the 'rbf', 'poly'
# and 'sigmoid' kernels and has no effect on a linear kernel, so including it
# in the grid would only multiply the number of fits without changing scores.
svc_linear_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],
}

# 5-fold grid search scored by accuracy, run on all available cores.
svcl_grid = GridSearchCV(svc_linear_pipeline, svc_linear_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
svcl_grid.fit(X_train, y_train)

In [None]:
# Keep the best linear-kernel pipeline and display it as the cell output.
best_model = svcl_grid.best_estimator_
best_model

In [None]:
# Predict the held-out test points with the current best pipeline.
y_pred = best_model.predict(X_test)

In [None]:
# Test-set metrics for the linear-kernel SVC.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

# Compare predicted and true classes on the test set.
# Both axes are the input features (X1, X2); the class is encoded by colour,
# so predictions and ground truth are drawn side by side on shared axes.
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
ax_pred.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
ax_pred.set(title='Predicted', xlabel='X1', ylabel='X2')
ax_true.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
ax_true.set(title='Actual', xlabel='X1', ylabel='X2')
fig.suptitle('Predicted vs Actual (test set)')
plt.show()

In [None]:
# Now let's try the polynomial kernel
svc_poly_pipeline = Pipeline([
    ('scaler', QuantileTransformer()),
    ('model', SVC(kernel='poly'))
])

# Search C, the kernel coefficient gamma and the polynomial degree.
svc_poly_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__gamma': [0.01, 0.1, 1, 10, 100],
    'model__degree': [2, 3, 4, 5]
}

# 5-fold grid search scored by accuracy, run on all available cores.
svp_grid = GridSearchCV(svc_poly_pipeline, svc_poly_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
svp_grid.fit(X_train, y_train)

# Keep the best polynomial-kernel pipeline and evaluate it on the test set.
best_model = svp_grid.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

# Compare predicted and true classes on the test set.
# Both axes are the input features (X1, X2); the class is encoded by colour,
# so predictions and ground truth are drawn side by side on shared axes.
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
ax_pred.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
ax_pred.set(title='Predicted', xlabel='X1', ylabel='X2')
ax_true.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
ax_true.set(title='Actual', xlabel='X1', ylabel='X2')
fig.suptitle('Predicted vs Actual (test set)')
plt.show()

So, as we can see, the linear kernel is not able to separate the data, but the polynomial kernel is able to do so. \
Whilst the polynomial kernel gives a better result, it is also more computationally expensive. \
To get the best of both worlds, we can use the RBF kernel, which is able to separate the data and is also computationally efficient, as we will see in the next cell.

In [None]:
# Now let's try the rbf kernel
svc_rbf_pipeline = Pipeline([
    ('scaler', QuantileTransformer()),
    ('model', SVC(kernel='rbf'))
])

# Search C and the kernel coefficient gamma.
svc_rbf_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__gamma': [0.01, 0.1, 1, 10, 100]
}

# 5-fold grid search scored by accuracy, run on all available cores.
svcr_grid = GridSearchCV(svc_rbf_pipeline, svc_rbf_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
svcr_grid.fit(X_train, y_train)

# Keep the best RBF-kernel pipeline and evaluate it on the test set.
best_model = svcr_grid.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

# Compare predicted and true classes on the test set.
# Both axes are the input features (X1, X2); the class is encoded by colour,
# so predictions and ground truth are drawn side by side on shared axes.
fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
ax_pred.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
ax_pred.set(title='Predicted', xlabel='X1', ylabel='X2')
ax_true.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
ax_true.set(title='Actual', xlabel='X1', ylabel='X2')
fig.suptitle('Predicted vs Actual (test set)')
plt.show()

As we can see, the RBF kernel is able to separate the data really well, and it is also computationally efficient.

In [None]:
# Predict every point in the dataset with the current `best_model` and show
# the predictions next to the true labels.
y_pred = best_model.predict(X)

fig, (ax_pred, ax_true) = plt.subplots(1, 2, figsize=(15, 5))
for ax, colours, panel_title in ((ax_pred, y_pred, 'Predicted'), (ax_true, y, 'Actual')):
    ax.scatter(X[:, 0], X[:, 1], c=colours)
    ax.set_title(panel_title)
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
plt.show()