In [21]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.cluster import KMeans

# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


In [17]:
# ==============================================
# FILES MANAGEMENT
# ==============================================

DOWNLOAD_ROOT = "http://www.macs.hw.ac.uk/%7Eek19/data/"
CURRENT_PATH = os.path.join(os.getcwd(), "datasets")
X_FILE = "x_train_gr_smpl.csv"
Y_FILE ="y_train_smpl.csv"

def fetch_file(file, download_root=DOWNLOAD_ROOT, current_path=CURRENT_PATH):
    if os.path.isfile(os.path.join(current_path, file)):
        return
    
    os.makedirs(current_path, exist_ok=True)
    urllib.request.urlretrieve(download_root + file, os.path.join(current_path, file))

In [18]:
# fetching all files needed in the task
fetch_file(X_FILE)
fetch_file(Y_FILE)

for i in range(10):
    fetch_file("y_train_smpl_" + str(i) + ".csv")

In [19]:
# Loading functions 
def load_features_data(current_path=CURRENT_PATH):
    return pd.read_csv(os.path.join(current_path, X_FILE)).rename(columns=lambda s: "pixel_"+s, index=lambda s: "sign_"+str(s))

def load_labels_data(labels_n=None):
    if labels_n is None:
        return pd.read_csv(os.path.join(CURRENT_PATH, Y_FILE)).rename(columns=lambda s:"class", index=lambda s: "sign_"+str(s))
    else:
        return pd.read_csv(os.path.join(CURRENT_PATH, "y_train_smpl_" + str(labels_n) + ".csv")).rename(columns=lambda s:"class_"+str(labels_n), index=lambda s: "sign_"+str(s))

In [33]:
# getting file with all labels
all_labels = load_labels_data()

# getting file with alla features
features = load_features_data()

In [34]:
k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
prediction = kmeans.fit_predict(all_labels)

In [37]:
prediction

array([7, 7, 7, ..., 6, 6, 6])