In [None]:
import numpy as np

# ----- figure quality -----
# Raise matplotlib's default figure resolution to 150 dpi for every figure
# created after this point.
import matplotlib as mpl
mpl.rcParams["figure.dpi"] = 150
# --------------------------


In [None]:
# Read the perovskite table from a plain-text file (np.loadtxt defaults:
# whitespace-separated floats) and print its shape as a quick sanity check.
data = np.loadtxt(fname="perovskite_data.txt")
print(data.shape)

In [None]:
# Column 0 is the class label; columns 1-14 are the input features.
C = data[:, 0]
X = data[:, 1:15]

# The first N_TRAIN rows form the training set, the remaining rows the
# validation set.
N_TRAIN = 280
C_train, C_valid = C[:N_TRAIN], C[N_TRAIN:]
X_train_raw, X_valid_raw = X[:N_TRAIN, :], X[N_TRAIN:, :]

# Standardize every feature (column) on its own, using statistics computed
# from the training rows only so that nothing about the validation set leaks
# into the model. A single scalar mean/std taken over the whole matrix would
# not put the individual features on a common scale, and SVM kernels are
# sensitive to feature scale.
mu_X = np.mean(X_train_raw, axis=0)
sig_X = np.std(X_train_raw, axis=0)
# A feature that is constant over the training rows has zero spread; divide
# by 1 instead so it maps to 0 rather than NaN/inf.
sig_X[sig_X == 0] = 1.0

X_train = (X_train_raw - mu_X) / sig_X
X_valid = (X_valid_raw - mu_X) / sig_X

In [None]:
from sklearn.svm import SVC

# The `kernel` argument selects the kernel - here a linear one.
# No design matrix has to be formed: fit() takes the normalized features
# directly and returns the fitted estimator, so construction and training
# can be chained.
linear_svm_model = SVC(kernel="linear").fit(X_train, C_train)

# Predicted class for every validation row
C_valid_model = linear_svm_model.predict(X_valid)

In [None]:
# Score the validation predictions with the precision and recall helpers
# from sklearn.metrics.
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_true=C_valid, y_pred=C_valid_model)
recall = recall_score(y_true=C_valid, y_pred=C_valid_model)

print(f"Precision = {precision}")
print(f"Recall = {recall}")

In [None]:
# Degree-3 polynomial kernel. As before, no design matrix is needed: the
# model is built and trained on the normalized features in one step.
poly_svm_model = SVC(kernel="poly", degree=3).fit(X_train, C_train)

# Classify the validation rows and score the result
C_valid_model = poly_svm_model.predict(X_valid)
precision = precision_score(y_true=C_valid, y_pred=C_valid_model)
recall = recall_score(y_true=C_valid, y_pred=C_valid_model)

print(f"Precision = {precision}")
print(f"Recall = {recall}")

In [None]:
# Gaussian (RBF) kernel. No design matrix is needed: the model is built and
# trained on the normalized features in one step.
rbf_svm_model = SVC(kernel="rbf").fit(X_train, C_train)

# Classify the validation rows and score the result
C_valid_model = rbf_svm_model.predict(X_valid)
precision = precision_score(y_true=C_valid, y_pred=C_valid_model)
recall = recall_score(y_true=C_valid, y_pred=C_valid_model)

print(f"Precision = {precision}")
print(f"Recall = {recall}")

# Dealing with Class Imbalance

In [None]:
# Count the training labels equal to +1 and to -1 to see how unevenly the
# two classes are represented.
num_pos = (C_train == 1).sum()
num_neg = (C_train == -1).sum()

print(f"# Positively classified data in training set = {num_pos}")
print(f"# Negatively classified data in training set = {num_neg}")

In [None]:
# Gaussian (RBF) kernel again, this time with class_weight="balanced" so the
# classes are weighted inversely to their frequency in the training labels.
# No design matrix is needed: build and train in one step.
rbf_svm_model = SVC(kernel="rbf", class_weight="balanced").fit(X_train, C_train)

# Classify the validation rows and score the result
C_valid_model = rbf_svm_model.predict(X_valid)
precision = precision_score(y_true=C_valid, y_pred=C_valid_model)
recall = recall_score(y_true=C_valid, y_pred=C_valid_model)

print(f"Precision = {precision}")
print(f"Recall = {recall}")

In [None]:
# Degree-3 polynomial kernel with class_weight="balanced" so the classes are
# weighted inversely to their frequency in the training labels.
# No design matrix is needed: build and train in one step.
poly_svm_model = SVC(kernel="poly", degree=3, class_weight="balanced").fit(X_train, C_train)

# Classify the validation rows and score the result
C_valid_model = poly_svm_model.predict(X_valid)
precision = precision_score(y_true=C_valid, y_pred=C_valid_model)
recall = recall_score(y_true=C_valid, y_pred=C_valid_model)

print(f"Precision = {precision}")
print(f"Recall = {recall}")

## Data from lecture 15

In [None]:
# Load the lecture-15 data: columns 0-1 are the two input coordinates,
# column 2 is the class label.
data = np.loadtxt('lec15.txt')
num_points = data.shape[0]

x = data[:, 0:2]
c = data[:, 2]

# Use the same training data as past examples
num_train = int(num_points*0.8*0.8)
X_train = x[0:num_train]
C_train = c[0:num_train]

# Normalize with a single scalar mean/std taken over the training points.
# The plotting cell applies the same (x - mu_X)/sig_X transform to the grid,
# so mu_X and sig_X must stay available under these names.
mu_X = np.mean(X_train)
sig_X = np.std(X_train)
# Build the normalized training matrix once and share it between both models.
# (X_train_norm was previously used without being defined, which raised a
# NameError on a fresh kernel.)
X_train_norm = (X_train - mu_X)/sig_X

# RBF kernel with balanced class weights - no design matrix required
rbf_svm_model = SVC(kernel="rbf", class_weight="balanced")
rbf_svm_model.fit(X_train_norm, C_train)

# Degree-5 polynomial kernel with balanced class weights
poly_svm_model = SVC(kernel="poly", degree=5, class_weight="balanced")
poly_svm_model.fit(X_train_norm, C_train)

In [None]:
# --------- CLASSIFY ENTIRE PLANE AND PLOT --------
# Plumbing for the figures that follow; safe to skim.

import matplotlib.pyplot as plt

# Translate each 0/1 label into a hex color string (used only for plotting)
COLORS = ['#F00D2C', '#553C67']
c_color = [COLORS[int(c[i])] for i in range(num_points)]

# Regular 100 x 100 grid covering the plotted region of the plane
x1_plot = np.linspace(0, 1, 100)
x2_plot = np.linspace(0, 1.3, 100)
xx1, xx2 = np.meshgrid(x1_plot, x2_plot)

# One grid point per row: column 0 is x1, column 1 is x2
x_plot = np.array([xx1.ravel(), xx2.ravel()]).T

# Apply the training normalization to the grid, classify it with each model,
# then fold the flat predictions back into the grid's matrix shape
x_plot_norm = (x_plot - mu_X)/sig_X
c_plot_rbf = rbf_svm_model.predict(x_plot_norm).reshape(xx1.shape)
c_plot_poly = poly_svm_model.predict(x_plot_norm).reshape(xx1.shape)

In [None]:
# plot classification at each point as a colored region
from matplotlib.colors import ListedColormap
plt.pcolormesh(xx1, xx2, c_plot_rbf, cmap=ListedColormap(COLORS))
plt.scatter(x[:,0], x[:,1], marker='^', edgecolors='k', linewidth=0.5, c=c_color)
plt.xlabel('X1')
plt.ylabel('X2')

In [None]:
# Same figure for the polynomial-kernel model: predicted regions in the
# background, data points colored by their own label on top.
plt.pcolormesh(xx1, xx2, c_plot_poly, cmap=ListedColormap(COLORS))
plt.scatter(x[:, 0], x[:, 1], c=c_color, marker='^', edgecolors='k', linewidth=0.5)
plt.xlabel('X1')
plt.ylabel('X2')

# Motivation for tree-based classifiers.

In [None]:
# Load the lecture-18 data: the first two columns are the inputs and the
# third column is the class label.
data = np.loadtxt('lec18.txt')
num_points = len(data)
X = data[:, :2]
C = data[:, 2]

# Scalar mean and standard deviation over every entry of X; later cells use
# them to normalize both the data and the plotting grid.
mu_X = X.mean()
sig_X = X.std()

In [None]:
from cycler import cycler

# Two-color cycle: the first scatter drawn gets COLORS[0], the second COLORS[1]
COLORS = ['#553C67', '#F00D2C']
default_cycler = cycler(color=COLORS)
plt.rc('axes', prop_cycle=default_cycler)

# Render text through LaTeX, with the bm package loaded in the preamble
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{bm}')

# Boolean masks selecting each class
I_pos = (C == 1)
I_neg = (C == 0)

# Draw the negative class first and the positive class second, so each one
# picks up its color from the cycle in that order
for mask in (I_neg, I_pos):
    plt.scatter(X[mask, 0], X[mask, 1], edgecolors='k', linewidth=0.5)

plt.xlabel('$x_1$')
plt.ylabel('$x_2$')
plt.title('Data in X space')

In [None]:
# Gaussian (RBF) kernel with balanced class weights
rbf_svm_model = SVC(class_weight='balanced', kernel="rbf")

# Train on every point, normalized with the scalar mean/std computed earlier
rbf_svm_model.fit((X - mu_X) / sig_X, C)

In [None]:
# Regular 100 x 100 grid over the unit square
x1_plot, x2_plot = np.linspace(0, 1, 100), np.linspace(0, 1, 100)
xx1, xx2 = np.meshgrid(x1_plot, x2_plot)

# One grid point per row: column 0 is x1, column 1 is x2
x_plot = np.array([xx1.ravel(), xx2.ravel()]).T

# Normalize the grid like the training data, classify every point, and fold
# the flat predictions back into the grid's matrix shape
c_plot_rbf = rbf_svm_model.predict((x_plot - mu_X) / sig_X).reshape(xx1.shape)

# Predicted class regions as the background
plt.pcolormesh(xx1, xx2, c_plot_rbf, cmap=ListedColormap(COLORS))

# Overlay the data: negative class first, then positive class
plt.scatter(X[I_neg, 0], X[I_neg, 1], edgecolors='k', linewidth=0.5)
plt.scatter(X[I_pos, 0], X[I_pos, 1], edgecolors='k', linewidth=0.5)