In [None]:
No outputs are shown in this section because the code was run on the HPC as regular Python scripts rather than in a notebook.

In [None]:
import os
import re
import time
from copy import copy
from itertools import combinations, permutations

import fasttext
import mmh3
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, \
    Birch, MiniBatchKMeans, SpectralClustering
from sklearn.mixture import GaussianMixture


def vectorise_text(review_list, fasttext_model, create_file=False, name='Train'):
    """
    Represent every review as the average of the fasttext vectors of its words.

    :param review_list:     sequence of [stars, review_text] entries. The text
                            may be a string (it is split on whitespace) or an
                            already tokenised sequence of words
    :param fasttext_model:  object exposing get_word_vector(word) and
                            get_dimension()
    :param create_file:     when True the result is also written to
                            '<name>_fasttext_vectors.csv'
    :param name:            prefix of the csv file
    :return:                list of [stars, average_vector], one per review
    """
    output = []
    for counter, review in enumerate(review_list):
        if counter % 1000 == 1:
            print(f'{np.round(100 * counter / len(review_list), 6)} % done')
        text = review[1]
        # Iterating over a str yields single characters, so a raw review
        # string has to be tokenised before the word vectors are looked up.
        words = text.split() if isinstance(text, str) else list(text)
        if words:
            word_vectors = [fasttext_model.get_word_vector(word) for word in words]
            average = np.sum(np.array(word_vectors), axis=0) / len(words)
        else:
            # A review without words would otherwise give 0 / 0 = nan.
            average = np.zeros(fasttext_model.get_dimension())
        output.append([review[0], average])
    if create_file:
        df = pd.DataFrame(output)
        df.to_csv(f'{name}_fasttext_vectors.csv', header=True, index=False)
    return output


def minhash_text(review_list, q=9, minhash_length=100, create_file=False, name=''):
    """
    Turn every review into a minhash signature that starts with its star rating.

    Each review text is cut into q-shingles, which are then minhashed into
    ``minhash_length`` values.

    :param review_list:     contains [stars, review_text]
    :param q:               shingle length handed to q_shingles
    :param minhash_length:  number of min-hashes per review
    :param create_file:     when True the signatures are also written to
                            '<name>_minhash_vectors.csv'
    :param name:            prefix of the csv file
    :return:                list with one signature (list of ints) per review
    """
    signatures = []
    for position, entry in enumerate(review_list):
        # Progress report once every 1000 reviews.
        if position % 1000 == 1:
            print(f'{np.round(100 * position / len(review_list), 6)} % done')
        signatures.append(
            minhash(q_shingles(entry[1], q), k=minhash_length, stars=int(entry[0])))
    if create_file:
        pd.DataFrame(signatures, dtype=int).to_csv(
            f'{name}_minhash_vectors.csv', header=True, index=False)
    return signatures


def minhash(shingles_list, k=1, stars=None):
    """
    Build the minhash signature of one text from its shingles.

    For every seed 0..k-1 the smallest unsigned murmur hash over all shingles
    is kept. When there are no shingles every position is filled with 0.

    :param shingles_list:   collection of shingles representing the text
    :param k:               number of min-hashes (one hash seed each)
    :param stars:           optional star rating, stored as the first element
    :return minh:           list [stars,] followed by k min-hash values
    """
    signature = [] if stars is None else [stars]

    for seed in range(k):  # every seed defines a different hash function
        if not shingles_list:
            signature.append(0)
            continue
        lowest = min(
            (mmh3.hash(shingle, seed=seed, signed=False) for shingle in shingles_list),
            default=np.inf,
        )
        signature.append(int(lowest))
    return signature


def q_shingles(string, q, characters=True):
    """
    Given string and length of shingles, returns a set of
    all shingles of characters/words in the string.

    :param string:      text to cut into shingles
    :param q:           number of characters (or words) per shingle
    :param characters:  True for character shingles (returned as substrings),
                        False for word shingles (returned as tuples of words,
                        the text being split on whitespace)
    :return output:     set of distinct shingles; empty when the text holds
                        fewer than q characters/words
    """
    tokens = string if characters else string.split()
    # A sequence of n tokens contains n - q + 1 windows of length q; stopping
    # at n - q would silently drop the last shingle.
    n_windows = len(tokens) - q + 1
    if characters:
        return {tokens[start:start + q] for start in range(n_windows)}
    # Slice the word list (not the raw string) so shingles are made of words.
    return {tuple(tokens[start:start + q]) for start in range(n_windows)}


Train fastText and save the model with the test accuracy in the file name

In [None]:
# Train a supervised fasttext classifier, letting the autotuner pick the
# hyper-parameters against the validation file (autotuneDuration=600).
model = fasttext.train_supervised(
    'train_fasttext.txt',
    autotuneValidationFile='test_fasttext.txt',
    autotuneDuration=600,
)
# NOTE(review): res[1] is presumably the precision returned by model.test;
# confirm against the fastText documentation.
res = model.test('test_fasttext.txt')
model.save_model(f'autotuned_apparel{res[1]}.bin')


Create Minhash vectors

In [None]:
# Load the raw [stars, review_text] lists from ./data.
# NOTE(review): load_review_lists is not defined in this part of the notebook;
# confirm it is in scope before running this cell.
t = time.time()
data_path = os.path.join(os.getcwd(), 'data')
train_list, test_list = load_review_lists(filepath=data_path)
print(f'Loading data took: {time.time() - t}')

# NOTE(review): these calls write '<name>_minhash_vectors.csv', i.e.
# 'testaaaa_minhash_vectors.csv' / 'trainaaaa_minhash_vectors.csv', while the
# later cells read 'test_minhash_vectors.csv' / 'train_minhash_vectors.csv'.
# Confirm whether the files are renamed by hand.
t = time.time()
minhash_text(test_list, create_file=True, name='testaaaa')
print(f'Vectorising test data took: {time.time() - t}')

# Restart the timer so the training figure does not include the test run.
t = time.time()
minhash_text(train_list, create_file=True, name='trainaaaa')
print(f'Vectorising training data took: {time.time() - t}')


Create Fasttext vectors

In [None]:
# Load the raw [stars, review_text] lists and the trained fasttext model.
# NOTE(review): load_review_lists is not defined in this part of the notebook;
# confirm it is in scope before running this cell.
t = time.time()
data_path = os.path.join(os.getcwd(), 'data')
train_list, test_list = load_review_lists(filepath=data_path)
# NOTE(review): the training cell saves to f'autotuned_apparel{res[1]}.bin',
# which has no '1_' in it; confirm this file name matches the saved model.
model_path = 'autotuned_apparel1_0.552265.bin'
model = fasttext.load_model(model_path)
print(f'Loading data took: {time.time() - t}')

# NOTE(review): these calls write 'train1_fasttext_vectors.csv' and
# 'test1_fasttext_vectors.csv', while the later cells read
# 'train_fasttext_vectors.csv' / 'test_fasttext_vectors.csv'.
t = time.time()
vectorise_text(train_list, fasttext_model=model, create_file=True, name='train1')
print(f'Vectorising training data took: {time.time() - t}')

# Restart the timer so the test figure does not include the training run.
t = time.time()
vectorise_text(test_list, fasttext_model=model, create_file=True, name='test1')
print(f'Vectorising test data took: {time.time() - t}')


Run random classification

In [None]:
# Baseline: guess a star rating uniformly at random for every test review and
# measure how often the guess matches the true rating.
filename_list = ['train_fasttext', 'train_minhash', 'test_fasttext', 'test_minhash']
path_list = [
    os.path.join(os.getcwd(), f'{name}_vectors.csv')
    for name in filename_list
]

n_test = None

# Loading Test data
# NOTE(review): load_review_vectors is not defined in this part of the
# notebook; it is assumed to return (vectors, ratings) as numpy arrays.
t = time.time()
ft_test_v, ft_test_r = load_review_vectors(path_list[2], no_reviews=n_test)
mh_test_v, mh_test_r = load_review_vectors(path_list[3], no_reviews=n_test)
print(f'Loading test data took: {time.time() - t}')
print(f'Shape of test data:\nft: {ft_test_v.shape}\nmh: {mh_test_v.shape}')

# Fixed seed so the baseline is reproducible; ratings are drawn from 1..5
# (the upper bound of randint is exclusive).
np.random.seed(42)
ft_guesses = np.random.randint(1, 6, size=ft_test_r.shape)
mh_guesses = np.random.randint(1, 6, size=mh_test_r.shape)
# A difference of zero means the random guess equals the true rating.
ft_true = sum((ft_test_r - ft_guesses) == 0) / len(ft_test_r)
mh_true = sum((mh_test_r - mh_guesses) == 0) / len(mh_test_r)
print(f'Proportion correct on Fasttext: {ft_true}')
print(f'Proportion correct on Minhash:  {mh_true}')

Define functions for clustering

In [None]:
def _homemade_k_means(points, k, centroids, tol):
    """Iterate L1 nearest-centroid assignment and mean update until centroids move less than tol."""
    if centroids is None:
        random_indices = np.random.randint(0, len(points), k)
        centroids = points[random_indices]
    # Work on a float copy: the caller's array must not be modified and integer
    # centroids would truncate the cluster means.
    centroids = np.array(centroids, dtype=float)

    labels = np.zeros(len(points), dtype=int)
    while True:
        previous = centroids.copy()
        # Manhattan distance of every point to every centroid, one column per centroid.
        distances = np.stack(
            [np.abs(points - centroid).sum(axis=1) for centroid in centroids], axis=1)
        labels = np.argmin(distances, axis=1)
        for i in range(len(centroids)):
            members = points[labels == i]
            # An empty cluster keeps its previous centroid instead of becoming nan (0 / 0).
            if len(members):
                centroids[i] = members.mean(axis=0)
        if not (np.abs(centroids - previous) > tol).any():
            return labels


def clustering(points, method='k_means', homemade=False, k=5, centroids=None, tol=1e-5, show_cluster=False,
               title='Clusters Shown', birch_thresh=0.01):
    """
    Cluster ``points`` with the algorithm selected by the first letter of
    ``method`` (case-insensitive):

        'k' K-means (sk-learn's, or the homemade implementation)
        'a' Agglomerative clustering
        'b' Birch
        'm' Mini-batch K-means
        's' Spectral clustering
        'g' Gaussian mixture

    It can show a 2D plot of the first two dimensions of the clusters.

    :param points:          array-like of shape (n_points, n_dimensions)
    :param method:          name of the algorithm, only its first letter is used
    :param homemade:        use the homemade K-means (only for method 'k')
    :param k:               number of clusters / mixture components
    :param centroids:       optional starting centroids for the homemade
                            K-means; random points are drawn when omitted
    :param tol:             homemade K-means stops once no centroid
                            coordinate moves by more than tol
    :param show_cluster:    plot the result with show_clustering
    :param title:           title of that plot
    :param birch_thresh:    threshold passed to Birch
    :return:                (cluster_assignments, model); model is None for
                            the homemade K-means, which has no fitted estimator
    :raises ValueError:     when method does not name a supported algorithm
    """
    points = np.array(points)
    if not method:
        raise ValueError('method must be a non-empty string')
    key = method[0].lower()

    model = None
    if key == 'k':
        if homemade:
            cluster_assignments = _homemade_k_means(points, k, centroids, tol)
        else:
            model = KMeans(n_clusters=k, init='k-means++').fit(points)
            cluster_assignments = model.labels_
    elif key == 'a':
        model = AgglomerativeClustering(n_clusters=k).fit(points)
        cluster_assignments = model.labels_
    elif key == 'b':
        model = Birch(n_clusters=k, threshold=birch_thresh).fit(points)
        cluster_assignments = model.labels_
    elif key == 'm':
        model = MiniBatchKMeans(n_clusters=k, init='k-means++').fit(points)
        cluster_assignments = model.labels_
    elif key == 's':
        model = SpectralClustering(n_clusters=k).fit(points)
        cluster_assignments = model.labels_
    elif key == 'g':
        model = GaussianMixture(n_components=k, init_params='k-means++').fit(points)
        cluster_assignments = model.predict(points)
    else:
        raise ValueError(f'Unknown clustering method: {method!r}')

    if show_cluster:
        show_clustering(points, cluster_assignments, title=title)

    return cluster_assignments, model


def show_clustering(points, assignments, title='Clusters shown'):
    """
    Scatter-plot the first two dimensions of ``points``, coloured by cluster.

    :param points:          array-like of shape (n_points, n_dimensions)
    :param assignments:     cluster id (0-6) of every point
    :param title:           title of the plot
    :raises ValueError:     when a cluster id has no colour assigned
    """
    points = np.array(points)
    if points.shape[1] > 2:
        print('Show_cluster will only show the first 2 dimensions of points!!')
    cmap = {0: 'b', 1: 'y', 2: 'g', 3: 'r', 4: 'm', 5: 'c', 6: 'k'}
    try:
        color_list = [cmap[el] for el in assignments]
    except KeyError as err:
        # A failed dict lookup raises KeyError, so that is what has to be
        # caught to report the intended ValueError.
        raise ValueError('Only 7 colours are available, plot failed') from err
    plt.scatter(points[:, 0], points[:, 1], c=color_list)
    plt.title(title)
    plt.show()


def cluster_closeness_matrix(true_labels, clusters, decimals=3):
    """
    Calculate the proportion of points carrying each true label in each
    cluster.

    With k the number of distinct true labels, labels are expected to be
    1..k and cluster ids 0..k-1.

    :param true_labels:     true label of every point (array-like)
    :param clusters:        cluster id of every point (array-like)
    :param decimals:        number of decimals the proportions are rounded to
    :return:                (matrix, weights): matrix[i, j] is the share of
                            cluster i whose true label is j + 1 (an empty
                            cluster gives a row of zeros); weights[j] is the
                            number of points whose true label is j + 1
    """
    # Accept plain lists as well as arrays.
    true_labels = np.asarray(true_labels)
    clusters = np.asarray(clusters)

    k = len(np.unique(true_labels))  # number of labels
    label_values = range(1, k + 1)
    weights = [int(np.count_nonzero(true_labels == label)) for label in label_values]

    rows = []
    for cluster in range(k):
        members = true_labels[clusters == cluster]
        counts = np.array(
            [np.count_nonzero(members == label) for label in label_values], dtype=float)
        total = counts.sum()
        # avoid division by 0 for clusters without any counted point
        rows.append(counts / total if total else np.zeros(k))
    cluster_closeness_mat = np.round(np.array(rows), decimals)
    return cluster_closeness_mat, weights


def assign_clusters(m, label_counts=None):
    """
    Map every cluster to a distinct star rating by exhaustive search.

    Every one-to-one mapping cluster -> label is scored with
    sum(m[cluster, label] * label_counts[cluster]) and the first mapping with
    the highest score is kept. If no mapping scores above 0, cluster i is
    mapped to rating i + 1.

    The search visits every mapping, so its cost grows factorially with the
    number of clusters.

    NOTE(review): the weight is indexed by cluster row, while the clustering
    cell passes the per-label counts from cluster_closeness_matrix; confirm
    which weighting is intended.

    :param m:               matrix with one row per cluster and one column
                            per label (as built by cluster_closeness_matrix)
    :param label_counts:    one weight per row of m, all ones when omitted
    :return:                dict {cluster index: star rating}, ratings are
                            the 1-based column indices
    :raises ValueError:     when m is not 2-dimensional or has more rows
                            than columns
    """
    m = np.asarray(m)
    if m.ndim != 2 or m.shape[0] > m.shape[1]:
        raise ValueError('m must be 2-dimensional with at least as many columns as rows')
    n_clusters, n_labels = m.shape
    if label_counts is None:
        label_counts = np.ones(n_clusters)

    # Permutations come out in lexicographic order, identity first.
    best_mapping, best_score = tuple(range(n_clusters)), 0
    for mapping in permutations(range(n_labels), n_clusters):
        score = 0
        for cluster, label in enumerate(mapping):
            score += m[cluster, label] * label_counts[cluster]
        if score > best_score:  # strict: the first best mapping wins ties
            best_mapping, best_score = mapping, score

    # Index is cluster, element is star rating
    return {cluster: label + 1 for cluster, label in enumerate(best_mapping)}


def p_correct_clusters(true_labels, test_vectors, cluster_map, assigned_labels=None, train_vectors=None, knn=15,
                       model=None):
    """
    Return the proportion of test points that receive their true label.

    With a ``model`` the cluster predicted for every test vector is translated
    to a label through ``cluster_map``. Without one, each test point takes the
    majority label of its ``knn`` nearest training vectors (Manhattan
    distance); ``assigned_labels`` are compared to the true labels directly
    and ``cluster_map`` is not used.

    :param true_labels:     true label of every test vector
    :param test_vectors:    vectors to classify
    :param cluster_map:     dict {cluster index: label}, used with ``model``
    :param assigned_labels: non-negative integer label of every training
                            vector, required without ``model``
    :param train_vectors:   vectors the labels were assigned to, required
                            without ``model``
    :param knn:             number of neighbours that vote; all training
                            vectors vote when there are no more than knn
    :param model:           fitted estimator exposing predict(test_vectors)
    :return:                proportion of correctly labelled test points
    :raises ValueError:     when neither a model nor training data is given
    """
    true_labels = np.asarray(true_labels)

    if model is not None:
        predicted = np.array([cluster_map[el] for el in model.predict(test_vectors)])
        incorrect = np.count_nonzero(true_labels - predicted)
        return 1 - incorrect / len(true_labels)

    if train_vectors is None or assigned_labels is None:
        raise ValueError('train_vectors and assigned_labels are required when no model is given')
    train_vectors = np.asarray(train_vectors)
    assigned_labels = np.asarray(assigned_labels)
    n_train = len(train_vectors)

    correct = 0
    for idx, point in enumerate(test_vectors):
        distances = np.sum(np.abs(point - train_vectors), axis=1)
        if knn < n_train:
            nearest = np.argpartition(distances, knn)[:knn]
        else:
            # argpartition needs kth < n_train; with this few points all of them vote.
            nearest = np.arange(n_train)
        label = np.bincount(assigned_labels[nearest]).argmax()  # chose lowest if tied
        correct += int(label == true_labels[idx])
    # Returned directly: a score of 0 correct must stay 0 and not turn into 1.
    return correct / len(true_labels)


Run clustering

In [None]:
# Configuration: number of clusters and size of the train/test subsets.
knn, k = 9, 5
n_train, n_test = 1000, 100
cluster_types = ['K-means', 'Minibatch-Kmeans']     # , 'Gaussian-Mixture' , 'Agglomerative', 'Birch', 'Spectral']


filename_list = ['train_fasttext', 'train_minhash', 'test_fasttext', 'test_minhash']
path_list = [os.path.join(os.getcwd(), f'{name}_vectors.csv') for name in filename_list]

# NOTE(review): load_review_vectors is not defined in this part of the
# notebook; it is assumed to return (vectors, ratings) as numpy arrays.
# Loading Training Data
t = time.time()
ft_train_v, ft_train_r = load_review_vectors(path_list[0], no_reviews=n_train)
mh_train_v, mh_train_r = load_review_vectors(path_list[1], no_reviews=n_train)
print(f'Loading training data took: {time.time() - t}')
print(f'Shape of training data:\nft: {ft_train_v.shape}\nmh: {mh_train_v.shape}')
# Loading Test data
t = time.time()
ft_test_v, ft_test_r = load_review_vectors(path_list[2], no_reviews=n_test)
mh_test_v, mh_test_r = load_review_vectors(path_list[3], no_reviews=n_test)
print(f'Loading test data took: {time.time() - t}')
print(f'Shape of test data:\nft: {ft_test_v.shape}\nmh: {mh_test_v.shape}')


# Every list below holds one (fasttext, minhash) tuple per clustering method.
proportion_correct, cluster_assignments, cc_mats, models, weights = [], [], [], [], []
for name in cluster_types:
    # Run the clustering, timing the two vector types separately
    t = time.time()
    labels_ft, model_ft = clustering(ft_train_v, method=name, k=k)
    print(f'{name} took: {time.time() - t} seconds on fasttext')
    t = time.time()
    labels_mh, model_mh = clustering(mh_train_v, method=name, k=k)
    print(f'{name} took: {time.time() - t} seconds on minhash')

    t = time.time()
    # Proportion of each class in the clusters, each row is a cluster column is star rating
    m1, w1 = cluster_closeness_matrix(ft_train_r, labels_ft, decimals=4)
    m2, w2 = cluster_closeness_matrix(mh_train_r, labels_mh, decimals=4)

    # Using the maximum proportions assign each cluster a star rating, creates a dict:
    label_map_ft, label_map_mh = assign_clusters(m1, w1), assign_clusters(m2, w2)
    # Use predict method and compare to the assigned clusters
    correct_proportion_ft = p_correct_clusters(ft_test_r, ft_test_v, label_map_ft, model=model_ft)
    correct_proportion_mh = p_correct_clusters(mh_test_r, mh_test_v, label_map_mh, model=model_mh)
    print(f'Calculating the correct proportion took {time.time()-t} seconds')

    # Append all desired data to corresponding lists
    cluster_assignments.append((label_map_ft, label_map_mh))
    cc_mats.append((m1, m2))
    models.append((model_ft, model_mh))
    weights.append((w1, w2))
    proportion_correct.append((correct_proportion_ft, correct_proportion_mh))

print(proportion_correct)
# cc_mats holds the per-cluster class proportions; weights holds the number of
# training reviews per star rating.
print(f'The proportion of each class in each cluster is: {cc_mats}')
print(f'The number of training reviews per star rating is: {weights}')
