In [None]:
# Import libraries:
import os
import umap 
import re
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from text_preprocessing import *
import sklearn.cluster as cluster
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

In [None]:
# Load data:
# The annotation files are used to collect all the words of each document.
directory = 'dataset/training_data/annotations/'
print('Found {} files for training in the given directory'.format(len(os.listdir(directory))))

list_of_documents = []
# os.listdir returns entries in arbitrary order; sorting keeps document indices
# (e.g. the example printed further down) stable from run to run.
for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(path):
        continue
    # The context manager closes the handle even if JSON parsing fails.
    # NOTE(review): assumes the annotation files are UTF-8 encoded JSON - confirm.
    with open(path, encoding='utf-8') as annotation_file:
        data = json.load(annotation_file)
    # One document = every 'text' field of the 'form' entries, each preceded by a space.
    document = ''.join(' ' + entry['text'] for entry in data['form'])
    list_of_documents.append(document)

In [None]:
# The text pulled out of the annotation files is fairly noisy, so each document
# is run through a cleaning pipeline; afterwards every word of 1 to 3
# characters (plus any non-word characters right before it) is removed.
shortword = re.compile(r'\W*\b\w{1,3}\b')
cleaning_steps = [to_lower, remove_punctuation, remove_number, remove_whitespace, remove_stopword]
DOCS = [
    shortword.sub('', preprocess_text(raw_document, cleaning_steps))
    for raw_document in list_of_documents
]
# Show one document before and after cleaning for comparison:
example_index = 10
print(list_of_documents[example_index])
print('\n')
print(DOCS[example_index])

In [None]:
# Turn the cleaned documents into a TF-IDF matrix. Terms that occur in fewer
# than 3 documents, as well as English stop words, are dropped.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=3,
)
X = tfidf_vectorizer.fit_transform(DOCS)

In [None]:
# First try: K-means clustering. We fit one model per candidate number of
# clusters k and plot the sum of the squared distances of each sample to its
# cluster centre (the model's inertia) against k.
k_values = range(1, 50)
RSS = []
for k in k_values:
    # random_state pins the centroid initialisation so the curve is
    # reproducible when the notebook is re-run.
    km = cluster.KMeans(n_clusters=k, init="k-means++", max_iter=100, n_init=20, random_state=0)
    km.fit(X)
    RSS.append(km.inertia_)

sns.set()
plt.plot(k_values, RSS, 'bx-', linewidth=3)
plt.xlabel('k')
plt.ylabel('RSS')
plt.show()

The RSS decreases as the number of clusters increases, as expected, but there is no "elbow" in the curve, i.e. we don't see a steep drop in the RSS at a small number of clusters. So it seems K-means is failing to cluster the data.

In light of this, I tried to see if hierarchical clustering could perform better. I tried different linkages and affinity metrics.

In [None]:
# Function to plot the dendrogram:
def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    Builds a four-column linkage matrix (the two merged children, the merge
    distance, and the number of samples under the new node) from the model's
    attributes and passes it to ``dendrogram``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded unchanged to ``dendrogram``.
    """
    n_leaves = len(model.labels_)
    n_merges = model.children_.shape[0]

    # samples_below[j] = number of original samples contained in merge node j
    samples_below = np.zeros(n_merges)
    for merge_idx, children in enumerate(model.children_):
        # Indices below n_leaves are original samples (count 1); larger ones
        # refer to an earlier merge whose size has already been computed.
        samples_below[merge_idx] = sum(
            1 if child < n_leaves else samples_below[child - n_leaves]
            for child in children
        )

    columns = [model.children_, model.distances_, samples_below]
    linkage_matrix = np.column_stack(columns).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Fit a full agglomerative tree for every affinity metric / linkage pair and
# plot its dendrogram. Ward linkage is appended separately because it only
# accepts euclidean as metric.
# NOTE(review): newer scikit-learn releases rename the `affinity` argument to
# `metric`; the original keyword is kept here - confirm against the installed version.
X_dense = X.toarray()  # loop-invariant: convert the sparse matrix only once
combinations = [
    (link, metric)
    for metric in ["cosine", "euclidean", "cityblock"]
    for link in ["complete", "average", "single"]
]
combinations.append(("ward", "euclidean"))

for link, metric in combinations:
    # n_clusters=None with distance_threshold=0 builds the complete tree.
    model = AgglomerativeClustering(n_clusters=None, distance_threshold=0, compute_full_tree=True, linkage=link, affinity=metric)
    model.fit(X_dense)
    fig, ax = plt.subplots()
    # The dendrogram is drawn on the figure that was just created.
    plot_dendrogram(model, truncate_mode="level", p=3)
    ax.set_title("Linkage = {}    Metric = {}".format(link,metric))
    ax.set_xlabel("Number of points in node (or index of point if no parenthesis).")

# A single pyplot-level show renders every figure created above; it replaces
# the per-figure fig.show() calls, which should warn under the non-GUI inline
# backend that notebooks use.
plt.show()
        

By inspecting the dendrogram plots I could not identify a clear number of clusters.

I have then decided to reduce the dimensionality of the vectorized text in order to be able to plot and visualize the training data.

In [None]:
# Project the TF-IDF matrix onto two PCA components so that the documents
# can be drawn in a plane.
pca = PCA(n_components=2)
two_dim = pca.fit_transform(X.toarray())
# Column 0 is the first principal component, column 1 the second.
scatter_x, scatter_y = two_dim[:, 0], two_dim[:, 1]

# Plot the 2D dataset:
fig, ax = plt.subplots()
ax.scatter(scatter_x, scatter_y)
ax.set_xlabel("PCA First component")
ax.set_ylabel("PCA Second component")
plt.show()

Looking at the scatterplot, the clusters in our data are not clearly visible, except for 2 small clusters on the bottom right and top left. Given this information, I have decided to re-run the hierarchical clustering, this time specifying the number of clusters to be 3.

For each metric and linkage I replicate the scatterplot to see whether the clustering algorithm has been able to distinguish the 3 clusters. 

In [None]:
# Re-run the hierarchical clustering with 3 clusters for every affinity
# metric / linkage pair and colour the PCA scatterplot by cluster. Ward linkage
# is appended separately because it only accepts euclidean as metric.
# NOTE(review): newer scikit-learn releases rename the `affinity` argument to
# `metric`; the original keyword is kept here - confirm against the installed version.
X_dense = X.toarray()  # loop-invariant: convert the sparse matrix only once
cmap = {0: 'yellow', 1: 'blue', 2: 'red'}
combinations = [
    (link, metric)
    for metric in ["cosine", "euclidean", "cityblock"]
    for link in ["complete", "average", "single"]
]
combinations.append(("ward", "euclidean"))

for link, metric in combinations:
    model = AgglomerativeClustering(n_clusters=3, distance_threshold=None, compute_full_tree=True, linkage=link, affinity=metric)
    # fit_predict already fits the model, so no separate fit() call is needed.
    clusters = model.fit_predict(X_dense)
    fig, ax = plt.subplots()
    # scatter every cluster with a different colour:
    for group in np.unique(clusters):
        ix = np.where(clusters == group)
        ax.scatter(scatter_x[ix], scatter_y[ix], c=cmap[group], label=group)
    ax.set_title("Linkage = {}    Metric = {}".format(link,metric))
    ax.set_xlabel("PCA First component")
    ax.set_ylabel("PCA Second component")
    # The scatter calls pass labels, so show them.
    ax.legend()

plt.show()

As visible from the last graph, the Ward linkage with a Euclidean metric is able to distinguish the 3 clusters.

As a **classifier**, I wanted to use the pre-trained VGG net. For that, I need labels for the images in the test data, so I can evaluate the performance of the classifier. I also organized the training set with the newly found labels, so that they could come in handy to partially re-train the VGG net (i.e., do transfer learning).

I could not manually label the images in the test data, so I labelled them by running the same clustering configuration found before (Ward linkage + Euclidean metric) on the test documents, and scatter-plotted them against the training data.

In [None]:
# Load the test data:
directory = 'dataset/testing_data/annotations/'
print('Found {} files in the given directory'.format(len(os.listdir(directory))))

list_of_documents = []
# Iterate over the files in that directory. os.listdir returns entries in
# arbitrary order; sorting keeps document indices stable from run to run.
for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(path):
        continue
    # The context manager closes the handle even if JSON parsing fails.
    # NOTE(review): assumes the annotation files are UTF-8 encoded JSON - confirm.
    with open(path, encoding='utf-8') as annotation_file:
        data = json.load(annotation_file)
    # One document = every 'text' field of the 'form' entries, each preceded by a space.
    document = ''.join(' ' + entry['text'] for entry in data['form'])
    list_of_documents.append(document)

# Clean the test documents: run each one through the cleaning pipeline, then
# strip every word of 1 to 3 characters (plus any non-word characters right
# before it).
shortword = re.compile(r'\W*\b\w{1,3}\b')
cleaning_steps = [to_lower, remove_punctuation, remove_number, remove_whitespace, remove_stopword]
DOCS = [
    shortword.sub('', preprocess_text(raw_document, cleaning_steps))
    for raw_document in list_of_documents
]

# Embed the test documents in the SAME feature space and PCA plane as the
# training data: re-use the vectorizer and PCA fitted on the training set
# (transform only). Re-fitting them on the test set would give different axes,
# so the two point clouds could not be overlaid meaningfully. It would also
# overwrite the training-fitted objects, making the cell non-repeatable.
# From here on X holds the test matrix.
X = tfidf_vectorizer.transform(DOCS)
two_dim = pca.transform(X.toarray())
scatter_x2 = two_dim[:, 0] # first principal component
scatter_y2 = two_dim[:, 1] # second principal component

fig, ax = plt.subplots()
ax.scatter(scatter_x, scatter_y, label='training')
ax.scatter(scatter_x2, scatter_y2, c='black', label='test')
ax.set_xlabel("PCA First component")
ax.set_ylabel("PCA Second component")
ax.legend()
ax.set_title('Training data + test data (in black)')


# Cluster the test documents with the configuration selected on the training
# data (ward linkage + euclidean metric) and colour the test scatterplot.
# NOTE(review): this fits a new model on the test matrix, so the cluster ids
# are not guaranteed to match the ids obtained on the training set - verify
# the correspondence before using them as class labels.
# NOTE(review): newer scikit-learn releases rename the `affinity` argument to
# `metric`; the original keyword is kept here - confirm against the installed version.
metric = "euclidean"
link = "ward"
model = AgglomerativeClustering(n_clusters=3, distance_threshold=None, compute_full_tree=True, linkage=link, affinity=metric)
# fit_predict already fits the model, so no separate fit() call is needed.
clusters = model.fit_predict(X.toarray())

fig, ax = plt.subplots()
# scatter every cluster with a different colour:
cmap = {0: 'yellow', 1: 'blue', 2: 'red'}
for group in np.unique(clusters):
    ix = np.where(clusters == group)
    ax.scatter(scatter_x2[ix], scatter_y2[ix], c=cmap[group], label=group)
ax.legend()
ax.set_xlabel("PCA First component")
ax.set_ylabel("PCA Second component")
ax.set_title('Predictions on the test data')

plt.show()

(Classification using VGG net is in the other notebook file).