In [1]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Task 11 Modules
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import BayesianGaussianMixture
from sklearn.metrics import accuracy_score

# import train_test_split
from sklearn.model_selection import train_test_split

In [2]:
DOWNLOAD_ROOT = "http://www.macs.hw.ac.uk/%7Eek19/data/"
CURRENT_PATH = os.path.join(os.getcwd(), "datasets")
X_FILE = "x_train_gr_smpl.csv"
Y_FILE ="y_train_smpl.csv"

def fetch_data(download_root=DOWNLOAD_ROOT, current_path=CURRENT_PATH):
    if os.path.isfile(os.path.join(current_path, X_FILE)) or os.path.isfile(os.path.join(current_path, Y_FILE)):
        return
    
    os.makedirs(current_path, exist_ok=True)
    
    urllib.request.urlretrieve(download_root + X_FILE, os.path.join(current_path, X_FILE))
    urllib.request.urlretrieve(download_root + Y_FILE, os.path.join(current_path, Y_FILE))

In [3]:
fetch_data()

In [4]:
def load_features_data(current_path=CURRENT_PATH):
    return pd.read_csv(os.path.join(current_path, X_FILE))

def load_labels_data(current_path=CURRENT_PATH):
    return pd.read_csv(os.path.join(current_path, Y_FILE))

In [5]:
features = load_features_data()

In [6]:
labels = load_labels_data()

In [7]:
# KMeans

In [8]:
accuracyValues = []
for i in range(1,16):
    kmeans = KMeans(n_clusters=i, random_state=42)
    
    # Splitting dataset into training set and test set, 70/30 training/test, seed = 10
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.3,random_state=10)
    
    y_pred = kmeans.fit(X_train, Y_train)
    y_pred = y_pred.predict(X_test)
    
    accuracy = accuracy_score(Y_test, y_pred)
    
    print("Cluster_count: " + str(i) + ". Accuracy: " + str(accuracy))
    accuracyValues.append(accuracy)


Cluster_count: 1. Accuracy: 0.02476780185758514
Cluster_count: 2. Accuracy: 0.14069487444100448
Cluster_count: 3. Accuracy: 0.18059855521155832
Cluster_count: 4. Accuracy: 0.11386308909528724


KeyboardInterrupt: 

In [None]:
# Produce accuracy graph
x = np.arange(1, len(accuracyValues)+1) 

plt.title("KMeans Cluster Accuracy")  
plt.xlabel("Cluster Count")  
plt.ylabel("Accuracy")
plt.plot(x, accuracyValues, color ="green")  
plt.show()


In [None]:
# INERTIA for KMeans

In [9]:
inertia = []
K = range(1,16)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(features)
    inertia.append(kmeanModel.inertia_)


KeyboardInterrupt: 

In [None]:
# Plot the elbow
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()


In [None]:
#BayesianGaussianMixture

In [14]:
accuracyValues = []
for i in range(1,16):
    clf = BayesianGaussianMixture(n_components=i, covariance_type='full', random_state=42)
    
    # Splitting dataset into training set and test set, 70/30 training/test, seed = 10
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.3, random_state=10)
    
    y_pred = clf.fit(X_train, Y_train).predict(X_test)
    
    accuracy = accuracy_score(Y_test, y_pred)
    
    print("Cluster_count: " + str(i) + ". Accuracy: " + str(accuracy))
    accuracyValues.append(accuracy)

Cluster_count: 1. Accuracy: 0.02476780185758514
Cluster_count: 2. Accuracy: 0.1585827313381493
Cluster_count: 3. Accuracy: 0.23460612315101478


KeyboardInterrupt: 

In [None]:
# Produce accuracy graph
x = np.arange(1, len(accuracyValues)+1)

plt.title("EM Accuracy")
plt.xlabel("Cluster Count")
plt.ylabel("Accuracy")
plt.plot(x, accuracyValues, color ="green")
plt.show()


In [None]:
#AgglomerativeClustering

In [None]:
accuracyValues = []
for i in range(1,16):
    clustering = AgglomerativeClustering(n_clusters=i)
    
    # Splitting dataset into training set and test set, 70/30 training/test, seed = 10
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.3, random_state=10)
    y_pred = clustering.fit(X_train, Y_train)
    y_pred = clustering.fit_predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    
    print("Cluster_count: " + str(i) + ". Accuracy: " + str(accuracy))
    accuracyValues.append(accuracy)


Cluster_count: 1. Accuracy: 0.02476780185758514


In [None]:
# Produce accuracy graph
x = np.arange(1, len(accuracyValues)+1)

plt.title("Agglomerative Cluster Accuracy")
plt.xlabel("Cluster Count")
plt.ylabel("Accuracy")
plt.plot(x, accuracyValues, color ="green")
plt.show()