In [None]:
### MLP CODE ###

#Code to set up two classification problems
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
def genGaussianSamples(N, m, C):
    """Draw N samples from a multivariate Gaussian with mean m and covariance C.

    Parameters
    ----------
    N : int
        Number of samples to draw.
    m : array-like, shape (d,)
        Mean vector.
    C : array-like, shape (d, d)
        Covariance matrix; must be symmetric positive definite so that the
        Cholesky factorisation exists.

    Returns
    -------
    ndarray, shape (N, d)
        One sample per row. Randomness comes from NumPy's global random state.

    Raises
    ------
    numpy.linalg.LinAlgError
        If C is not positive definite.
    """
    A = np.linalg.cholesky(np.asarray(C, dtype=float))
    # Take the dimension from the covariance instead of hard-coding 2, so the
    # helper works for any d (for d == 2 the random draws are unchanged).
    U = np.random.randn(N, A.shape[0])
    # Rows of U are i.i.d. standard normal; U @ A.T has covariance A @ A.T == C.
    return U @ A.T + np.asarray(m, dtype=float)
NClasses = 3
# Priors: random class proportions that sum to one
#
w = np.random.rand(NClasses)
w = w / np.sum(w)
N = 1000 # total data (Training = Test)
# Flooring means the per-class counts can add up to slightly less than N.
NPrior = np.floor(w * N).astype(int)
Scale = 10
Means = Scale*np.random.rand(NClasses, 2)
print(Means)
from sklearn.datasets import make_spd_matrix
# One random symmetric positive-definite covariance per class
CovMatrices = np.zeros((NClasses,2,2))
for j in range(NClasses):
    CovMatrices[j,:,:] = make_spd_matrix(2)

# Training set: draw each class ONCE and build the design matrix from those
# same draws, so the scatter plot below shows exactly what the models see.
AllData_train = [
    genGaussianSamples(NPrior[j], Means[j,:], CovMatrices[j,:,:])
    for j in range(NClasses)
]
X_train = np.concatenate(AllData_train, axis=0)
# Targets are the 1-based class ids, stored as a flat float vector.
y_train = np.concatenate([(j+1)*np.ones(NPrior[j]) for j in range(NClasses)])

# Test set: independent draws from the same class distributions.
AllData_test = [
    genGaussianSamples(NPrior[j], Means[j,:], CovMatrices[j,:,:])
    for j in range(NClasses)
]
X_test = np.concatenate(AllData_test, axis=0)
y_test = np.concatenate([(j+1)*np.ones(NPrior[j]) for j in range(NClasses)])

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12,4))
plt.subplots_adjust(wspace=0.3)
for Xplt in AllData_train:
    ax[0].scatter(Xplt[:,0], Xplt[:,1], s=3)
ax[0].grid(True)
ax[0].set_title("Training Data Distributions")
# The target curve is the same for every class, so draw it only once.
ax[1].plot(y_train)
ax[1].set_title("Training Targets")
for Xplt in AllData_test:
    ax[2].scatter(Xplt[:,0], Xplt[:,1], s=3)
ax[2].grid(True)
ax[2].set_title("Test Data Distributions")
# The option is bbox_inches='tight'; the original had keyword and value swapped.
plt.savefig('TrainTestDistcomp', bbox_inches='tight')

In [None]:
# Ten-fold cross-validation of both classifiers and a box plot of the per-fold accuracies
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# One-hot targets for the MLP. The encoder's default output is a SciPy sparse
# matrix; .toarray() makes it dense without relying on the `sparse` /
# `sparse_output` keyword, whose name differs between scikit-learn releases.
onehot_encoder = OneHotEncoder()
y_onehot_train = onehot_encoder.fit_transform(y_train.reshape(-1, 1)).toarray()#Onehot encoding
# Re-use the encoding learnt on the training labels instead of re-fitting on
# the test labels, so both matrices share the same column order.
y_onehot_test = onehot_encoder.transform(y_test.reshape(-1, 1)).toarray()
clf = MLPClassifier(learning_rate_init = 0.1)
gnb = GaussianNB()
# The samples are ordered by class, so shuffle before splitting, and hand the
# SAME splitter to both models so their fold accuracies are comparable.
kf = KFold(n_splits = 10, shuffle = True, random_state = 0)

X = np.concatenate((X_train,X_test))
Y_onehot = np.concatenate((y_onehot_train,y_onehot_test))
Y = np.concatenate((y_train, y_test))
models = [gnb, clf]
data = []
names = ["Naive Bayes", "MLP"]
for model_name, model in zip(names, models):
    # Naive Bayes takes the class ids; the MLP is trained on one-hot targets.
    targets = Y if model_name == 'Naive Bayes' else Y_onehot
    acc = cross_val_score(model, X, targets, cv=kf)#Storing the Ten-Fold crossvalidation accuracy
    data.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(acc)
    )

# Build the frame once, after all folds of all models have been collected.
df = pd.DataFrame(data, columns=['model_name_unigrams', 'fold_idx', 'accuracy'])

sns.boxplot(x='model_name_unigrams', y='accuracy', data=df)#Plotting the Boxplots
plt.savefig('Boxplotgnbmlpcomp',bbox_inches = 'tight')
plt.show()
print(data)

In [None]:
# clf = MLPClassifier(learning_rate_init = 0.1, alpha = 0.00001, random_state = 1) ##Code used to check convergence and to check test accuracy while changing alpha##
# clf.fit(X_train,y_onehot_train)
# fig, ax = plt.subplots(figsize=(6,4))
# print(clf.score(X_test, y_onehot_test))
# ax.set_title("Convergence")
# ax.set_xlabel("Iteration", fontsize = 14)
# ax.set_ylabel("Loss", fontsize = 14)
# ax.plot(clf.loss_curve_)
# plt.savefig('Convergence01', bbox_inches = 'tight')


#Code to plot the decision boundary
# cross_val_score only trains clones of the estimator, so clf itself is still
# unfitted at this point; fit it on the training split before predicting.
clf.fit(X_train, y_onehot_train)

h = 0.1  # grid step of the decision-surface mesh
x_min, x_max = X[:,0].min() - 1, X[:,0].max() + 1
y_min, y_max = X[:,1].min() - 1, X[:,1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print(x_min, x_max)
print(y_min, y_max)
# Take the arg-max over the per-class scores rather than over the hard 0/1
# predictions: a grid point for which no class fires would otherwise be
# silently assigned to the first class.
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired) #PLotting the decision boundary
plt.axis("tight")
colors = "bry"

# Plotting the training points, one labelled scatter per class so that the
# legend has entries to show.
for class_id, color in zip(np.unique(y_train), colors):
    mask = np.ravel(y_train) == class_id
    plt.scatter(
        X_train[mask, 0],
        X_train[mask, 1],
        c=color,
        edgecolor="black",
        s=20,
        label=f"Class {int(class_id)}",
    )
plt.title("Decision boundary for MLP classifier")
plt.axis("tight")
plt.legend()
plt.savefig("MLPDecisioncomp", bbox_inches = 'tight')
plt.show()

In [None]:
### K_MEANS CODE ###

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from scipy import spatial

def genGaussianSamples(N, m, C):
    """Return N two-dimensional Gaussian samples with mean m and covariance C.

    Standard-normal draws (one per row) are coloured with the lower Cholesky
    factor of C and then shifted by m. Uses NumPy's global random state.
    """
    chol_factor = np.linalg.cholesky(C)
    standard_draws = np.random.randn(N, 2)
    coloured = np.matmul(standard_draws, chol_factor.T)
    return coloured + m
# Component means, one row per Gaussian
#
Means = np.array([[0, 3], [3, 0], [4,4]])
# One random covariance per component; make_spd_matrix returns a symmetric
# positive-definite matrix, which the Cholesky step in the sampler needs
#
from sklearn.datasets import make_spd_matrix
CovMatrices = np.stack([make_spd_matrix(2) for _ in range(3)])
# Mixing proportions: random weights normalised to sum to one
#
w = np.random.rand(3)
w = w / w.sum()
# Sample count per component (about 1000 overall, rounded down per component)
#
nData = np.floor(w * 1000).astype(int)
# Sample every component in order
#
X0, X1, X2 = [
    genGaussianSamples(nData[c], Means[c,:], CovMatrices[c,:,:])
    for c in range(3)
]
# Stack the three sample sets row-wise into the data matrix
#
X = np.concatenate((X0, X1, X2), axis=0)


In [None]:
# Plotting the contours 
def gauss2D(x, m, C):
    """Evaluate the bivariate Gaussian density N(x; m, C) at a single point.

    Parameters
    ----------
    x : array-like, shape (2,)
        Point at which the density is evaluated.
    m : array-like, shape (2,)
        Mean vector.
    C : array-like, shape (2, 2)
        Covariance matrix (must be invertible).

    Returns
    -------
    float
        exp(-0.5 * (x-m)^T C^{-1} (x-m)) / (2 * pi * sqrt(det C)).
    """
    C = np.asarray(C, dtype=float)
    diff = np.asarray(x, dtype=float) - np.asarray(m, dtype=float)
    Ci = np.linalg.inv(C)
    num = np.exp(-0.5 * np.dot(diff, np.dot(Ci, diff)))
    # The normaliser uses det(C). The previous code used det(inv(C)), which
    # scaled the density by det(C) and broke the normalisation.
    den = 2 * np.pi * np.sqrt(np.linalg.det(C))

    return num/den

def twoDGaussianPlot(nx, ny, m, C):
    """Tabulate gauss2D on an nx-by-ny grid over the square [-8, 8] x [-8, 8].

    Returns the grid coordinates X and Y (matrix 'ij' indexing, each of shape
    (nx, ny)) and Z, where Z[i, j] is gauss2D evaluated at (X[i, j], Y[i, j])
    with mean m and covariance C.
    """
    x_axis = np.linspace(-8, 8, nx)
    y_axis = np.linspace(-8, 8, ny)
    X, Y = np.meshgrid(x_axis, y_axis, indexing = 'ij')
    Z = np.zeros([nx, ny])
    # Visit every grid cell once; ndindex yields (i, j) in row-major order.
    for cell in np.ndindex(nx, ny):
        point = np.array([X[cell], Y[cell]])
        Z[cell] = gauss2D(point, m, C)
    return X, Y, Z
# Grid resolution used to tabulate each component density
nx, ny = 50, 40
m1, m2, m3 = Means[0,:], Means[1,:], Means[2,:]
C1, C2, C3 = CovMatrices[0,:,:], CovMatrices[1,:,:], CovMatrices[2,:,:]
Xp1, Yp1, Zp1 = twoDGaussianPlot(nx, ny, m1, C1)
Xp2, Yp2, Zp2 = twoDGaussianPlot(nx, ny, m2, C2)
Xp3, Yp3, Zp3 = twoDGaussianPlot(nx, ny, m3, C3)

# A single figure: the earlier extra plt.figure(...) call only left an empty
# figure behind, because plt.subplots creates its own.
fig, ax = plt.subplots(figsize = (5,5))
for samples, colour in zip((X0, X1, X2), ("c", "m", "r")):
    ax.scatter(samples[:,0], samples[:,1], c = colour, s=4)
ax.set_xlim(-2, 8)
ax.set_ylim(-2, 8)
# Overlay five density contour levels per component on the same axes.
for Xp, Yp, Zp in ((Xp1, Yp1, Zp1), (Xp2, Yp2, Zp2), (Xp3, Yp3, Zp3)):
    ax.contour(Xp, Yp, Zp, 5)
# The option is bbox_inches='tight'; the original had keyword and value swapped.
plt.savefig('Contourplotofdata', bbox_inches = 'tight')

In [None]:
# K-Means implementation
K = 3  # number of clusters used by the cells below


def _nearest_centroid(X, centroids):
    """Return each row's nearest-centroid index and its distance to that centroid."""
    # Pairwise Euclidean distances, shape (n_samples, n_centroids).
    dists = np.sqrt(((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2))
    labels = np.argmin(dists, axis=1)
    return labels, dists[np.arange(len(X)), labels]


def Kmeans(X, k, num_of_iter):
    """Cluster the rows of X into k groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to cluster. The first two columns are used for the centroid plot.
    k : int
        Number of clusters, 1 <= k <= n_samples.
    num_of_iter : int
        Maximum number of centroid updates; iteration stops earlier once the
        assignments no longer change.

    Returns
    -------
    ndarray of int, shape (n_samples,)
        Index (0 .. k-1) of the cluster each row is assigned to.

    Raises
    ------
    ValueError
        If k is not between 1 and the number of samples.

    Side effects
    ------------
    Scatters the centroids of every iteration (black) and the final centroids
    (red) on the current Matplotlib axes, and prints the final centroids.
    Initial centroids are drawn using NumPy's global random state.
    """
    X = np.asarray(X, dtype=float)
    n_samples = len(X)
    if not 1 <= k <= n_samples:
        raise ValueError("k must be between 1 and the number of samples")

    # Initialise with k distinct data points chosen at random. The shape now
    # follows k and the data instead of a fixed (3, 2) reshape.
    ix = np.random.choice(n_samples, k, replace = False)
    centroids = X[ix, :].copy()
    Class, own_dist = _nearest_centroid(X, centroids)

    for _ in range(num_of_iter):
        reseeded = False
        for j in range(k):
            members = X[Class == j]
            if len(members) > 0:
                centroids[j] = members.mean(axis = 0)
            else:
                # An empty cluster would give a NaN mean. Re-seed it at the
                # point currently farthest from its own centroid, and mark
                # that point as used so two empty clusters never share a seed.
                far = int(np.argmax(own_dist))
                centroids[j] = X[far]
                own_dist[far] = -np.inf
                reseeded = True
        plt.scatter(centroids[:,0],centroids[:,1],c = 'black',s= 20)#Plotting centroids obtained in each iteration
        new_Class, own_dist = _nearest_centroid(X, centroids)
        # Unchanged assignments mean the next update would reproduce the same
        # centroids, so further iterations cannot change the result.
        stable = np.array_equal(new_Class, Class) and not reseeded
        Class = new_Class
        if stable:
            break
    plt.scatter(centroids[:,0], centroids[:,1], c='r',s = 50)# Plotting the centroid after iterations
    print(centroids)
    return Class
# Run the hand-written K-means and colour the points by assigned cluster.
plt.figure(figsize = (8,8))
Out = Kmeans(X, K ,300)

for i in range(K):
    # zorder=0 keeps the data underneath the centroid markers that Kmeans
    # has already drawn on this figure.
    plt.scatter(X[Out == i,0],X[Out == i,1],s = 20, zorder = 0)
# The option is bbox_inches='tight'; the original had keyword and value swapped.
plt.savefig('Centroidconvergence', bbox_inches = 'tight')
plt.show()


In [None]:
# K-Means using sklearn
from sklearn.cluster import KMeans
# Fit scikit-learn's K-means with the same cluster count the plotting loop uses.
kmeans = KMeans(n_clusters = K, random_state = 0).fit(X)
for i in range(K):
    plt.scatter(X[kmeans.labels_ == i,0],X[kmeans.labels_ == i,1],s = 6)
# The option is bbox_inches='tight'; the original had keyword and value swapped.
plt.savefig('SklearnKmeans', bbox_inches = 'tight')
plt.show()
# Last expression of the cell, so the notebook displays the fitted centres.
kmeans.cluster_centers_

In [None]:
#Clustering Seeds dataset using K-Means

from pandas import crosstab
from sklearn.preprocessing import StandardScaler
# NOTE(review): machine-specific absolute path; point this at your local copy.
# Raw string so the backslashes are never read as escape sequences (the value
# is unchanged).
SEEDS_PATH = r"D:\Msc Lab FML\seeds_dataset.txt"
# Split on any run of whitespace: this parses a tab-separated file, and also
# copes with rows that contain repeated tabs (presumably the case for the raw
# UCI file - confirm against your copy), which a single-tab separator misreads.
df = pd.read_csv(SEEDS_PATH, sep = r'\s+', header = None) #Reading the seeds dataset(UCI)
scaler = StandardScaler() #Standardization
labels = df.iloc[:,7] #Target values
print(labels)
X_train = df.drop(columns = [7])
print(X_train)
X_train = scaler.fit_transform(X_train)
print(X_train)
Output = KMeans(n_clusters = 3, random_state = 0).fit(X_train)
# A bare expression in the middle of a cell displays nothing, so print instead.
print(Output.cluster_centers_)
print(Output.labels_)
crosstab(np.array(labels), np.array(Output.labels_), rownames = ['label'], colnames = ['Cluster']) #Creating a table of labels and predicted clusters