In [5]:
import pandas as pd

In [6]:
# Load the term table. The df.head() output below shows a 'word' column, one
# count column per document ("Document N") and a 'df' column, plus an
# 'Unnamed: 0' column (presumably a saved index — confirm against the CSV).
df = pd.read_csv('tf-df.csv')

In [7]:
df.head()

Unnamed: 0,word,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,...,Document 15,Document 16,Document 17,Document 18,Document 19,Document 20,Document 21,Document 22,Document 23,df
0,administr,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,repres,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2
2,of,4,2,0,0,0,0,1,0,6,...,0,0,6,13,4,3,4,0,3,14
3,sole,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,necessarili,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
# Columns holding per-document term counts: everything except the term itself
# ('word'), the document-frequency column ('df') and 'Unnamed: 0'. The
# df.head() output lists 'Unnamed: 0' as a column; it is not a document, so it
# must not be clustered or counted in the number of documents N.
doc_headers = [column for column in df.columns if column not in ['word', 'df', 'Unnamed: 0']]

In [9]:
# Work on an independent copy: the following cells overwrite the document
# columns of `vecs`, and pd.DataFrame(df) does not guarantee that the new
# frame's data is separate from `df`.
vecs = df.copy()

In [10]:
# ntc weighting, idf part: log(N / df), where N is the number of document
# columns (len(doc_headers)) and df is the per-term 'df' column of `vecs`.
import numpy as np
norm_doc_freq = np.log(len(doc_headers)/vecs['df'])

In [11]:
# tf-idf: scale every document's term counts by the per-term idf factor.
# NOTE: this overwrites the columns of `vecs`, so re-running the cell applies
# the weighting a second time.
for doc in doc_headers:
    vecs[doc] = vecs[doc].mul(norm_doc_freq)

In [12]:
# Cosine normalization: scale every document vector to unit Euclidean length
# by dividing it by its L2 norm, i.e. the square ROOT of the sum of squares.
# (Dividing by the sum of squares itself does not produce unit vectors.)
for doc in doc_headers:
    sum_sq = np.sum(vecs[doc]**2)
    norm = np.sqrt(sum_sq)
    # A document with an all-zero vector has norm 0; leave it untouched
    # instead of filling the column with NaN.
    if norm > 0:
        vecs[doc] = vecs[doc]/norm

In [13]:
# K-means step a: seed the centroids. Note that nothing here is random: the
# first 5 document vectors (positions 0-4 of doc_headers) are used as the
# initial cluster centroids.
cluster_centroids = [vecs[doc_headers[no]] for no in range(5)]

In [14]:
# One row per document, one column per centroid; starts out as all zeros.
dist_matrix = np.zeros((len(doc_headers), len(cluster_centroids)), dtype=float)

In [15]:
# Fill the distance matrix: entry [doc, centroid] is the squared Euclidean
# distance (sum of squared component differences) between the two vectors.
for doc, header in enumerate(doc_headers):
    for centroid, centroid_vec in enumerate(cluster_centroids):
        dist_matrix[doc, centroid] = np.sum((vecs[header] - centroid_vec) ** 2)

In [16]:
# Despite the name, this holds for every document (row) the column index of
# its nearest centroid, not the distance itself.
distances = np.argmin(dist_matrix, axis=1)

In [17]:
# Start with an empty mapping of cluster id -> member documents.
clusters = {}

In [18]:
# One empty member list per centroid. The count is taken from
# cluster_centroids rather than repeating the literal 5, so the two cannot
# drift apart when the number of seed centroids changes.
for i in range(len(cluster_centroids)):
    clusters[i] = []

In [19]:
# Put every document position into the member list of its nearest centroid.
for doc, nearest in enumerate(distances):
    clusters[nearest].append(doc)

In [20]:
# Collect the document vectors (columns of `vecs`) that belong to cluster 1.
cluster_vectors = [vecs[doc_headers[position]] for position in clusters[1]]

In [21]:
def get_centroids(clusters):
    """Return one centroid per cluster, in the iteration order of `clusters`.

    `clusters` maps a cluster id to a list of document positions (indices into
    the notebook-level `doc_headers`). The member vectors are read from the
    notebook-level `vecs` frame and averaged component-wise.
    """
    centroids = []
    for cluster_id in clusters:
        members = clusters[cluster_id]
        member_vectors = [vecs[doc_headers[position]] for position in members]
        # Component-wise mean of the member vectors.
        centroids.append(np.sum(member_vectors, axis=0) / len(members))
    return centroids

In [22]:
# Get a matrix of shape(docs x clusters) containing the distance of doc from each cluster centroid.
def get_absolute_distances(doc_headers, cluster_centroids):
    """Build the (documents x centroids) matrix of squared Euclidean distances.

    Row r corresponds to `doc_headers[r]` (a column of the notebook-level
    `vecs` frame) and column c to `cluster_centroids[c]`. Each entry is the sum
    of squared component differences between the two vectors.
    """
    matrix = np.zeros((len(doc_headers), len(cluster_centroids)), dtype=float)
    for row, header in enumerate(doc_headers):
        for col, centroid in enumerate(cluster_centroids):
            matrix[row, col] = np.sum((vecs[header] - centroid) ** 2)
    return matrix

In [23]:
def get_clusters(dist_matrix):
    """Assign every document (row) to the cluster (column) with the smallest distance.

    Returns a dict with one key per column of `dist_matrix`; each value is the
    list of row indices whose minimum lies in that column. Columns that win no
    row map to an empty list.
    """
    n_clusters = len(dist_matrix[0])
    assignment = {cluster_id: [] for cluster_id in range(n_clusters)}
    nearest_per_row = np.argmin(dist_matrix, axis=1)
    for row, winner in enumerate(nearest_per_row):
        assignment[winner].append(row)
    return assignment

In [24]:
class KNN:
    """Centroid-based clustering of document vectors.

    Despite the class name, the loop implemented here is K-means style
    ("assign every document to its nearest centroid, then recompute the
    centroids, until the assignment stops changing"), not k-nearest-neighbours.
    The name is kept so existing callers keep working.

    Documents are identified by their position in `doc_headers`; the vector of
    document p is the column `vecs[doc_headers[p]]`.

    Parameters
    ----------
    N : int
        Number of clusters.
    random_state : int or None, optional
        Seed used to pick the initial seed documents. With None (the default)
        NumPy's global random state is used, exactly as before.
    """

    def __init__(self, N, random_state=None):
        self.N = N
        self.random_state = random_state

    def get_empty_clusters(self, N):
        """Return a dict {0: [], ..., N-1: []}."""
        return {i: [] for i in range(N)}

    def initialize_state(self, doc_headers, vecs):
        """Store the data and seed each cluster with one randomly chosen document.

        Raises
        ------
        ValueError
            If N is smaller than 1 or larger than the number of documents
            (there would not be one distinct seed document per cluster).
        """
        n_docs = len(doc_headers)
        if self.N < 1 or self.N > n_docs:
            raise ValueError(
                "N must be between 1 and the number of documents "
                "(%d), got %r" % (n_docs, self.N)
            )

        self.doc_headers = doc_headers
        self.vecs = vecs
        self.clusters = self.get_empty_clusters(self.N)

        indices = np.arange(n_docs)
        if self.random_state is None:
            np.random.shuffle(indices)
        else:
            np.random.RandomState(self.random_state).shuffle(indices)

        # The first N shuffled positions become the single-member seed clusters.
        for i in range(self.N):
            self.clusters[i].append(int(indices[i]))

    def _doc_matrix(self, doc_headers):
        """Return the selected document columns as a float array (terms x documents)."""
        return np.asarray(self.vecs[list(doc_headers)], dtype=float)

    # Calculates the centroids given clusters
    def get_centroids(self, clusters, previous_centroids=None):
        """Return one centroid (mean member vector) per cluster, in iteration order.

        Uses the vectors stored on the instance (`self.vecs`, `self.doc_headers`)
        rather than notebook-level globals.

        An empty cluster has no mean. If `previous_centroids` is given, the
        centroid at the same position is reused for it; otherwise a ValueError
        is raised instead of silently returning a NaN centroid.
        """
        doc_matrix = self._doc_matrix(self.doc_headers)
        cluster_centroids = []
        for position, i in enumerate(clusters):
            members = clusters[i]
            if len(members) > 0:
                centroid = doc_matrix[:, members].sum(axis=1) / len(members)
            elif previous_centroids is not None:
                centroid = np.asarray(previous_centroids[position], dtype=float)
            else:
                raise ValueError("cannot compute the centroid of empty cluster %r" % (i,))
            cluster_centroids.append(centroid)
        return cluster_centroids

    # Get a matrix of shape(docs x clusters) containing the distance of doc from each cluster centroid.
    def get_absolute_distances(self, doc_headers, cluster_centroids):
        """Return the (documents x centroids) matrix of squared Euclidean distances.

        The values are squared distances (no square root is taken); that does
        not change which centroid is nearest.
        """
        doc_matrix = self._doc_matrix(doc_headers)
        dist_matrix = np.zeros(shape=(len(doc_headers), len(cluster_centroids)), dtype=float)
        for column, centroid in enumerate(cluster_centroids):
            difference = doc_matrix - np.asarray(centroid, dtype=float)[:, np.newaxis]
            # nansum skips NaN components, as the pandas Series sum used by the
            # earlier per-pair loop did.
            dist_matrix[:, column] = np.nansum(difference ** 2, axis=0)
        return dist_matrix

    def get_clusters(self, dist_matrix):
        """Assign every document (row) to the cluster whose centroid is nearest."""
        clusters = self.get_empty_clusters(self.N)
        distances = np.apply_along_axis(np.argmin, 1, dist_matrix)
        for doc in range(len(distances)):
            nearest = distances[doc]
            clusters[nearest].append(doc)
        return clusters

    def cluster_hierarchy(self):
        """Split every cluster into a leader and its followers.

        The leader is the member closest to the cluster centroid according to
        `self.dist_matrix`; all other members are followers. An empty cluster
        gets leader None and no followers. Must be called after `cluster`.
        """
        clust_hier = dict()
        for i, members in self.clusters.items():
            if len(members) > 0:
                dist_from_centroid = self.dist_matrix[members, i]
                leader = members[int(np.argmin(dist_from_centroid))]
                followers = [doc for doc in members if doc != leader]
            else:
                leader, followers = None, []
            clust_hier[i] = {'leader': leader, 'followers': followers}
        return clust_hier

    def cluster(self, doc_headers, vecs):
        """Cluster the documents and return {cluster id: [document positions]}.

        Repeats "recompute centroids, reassign documents" until the assignment
        is identical in two consecutive rounds. `self.clusters` and
        `self.dist_matrix` are left describing the final assignment.
        """
        self.initialize_state(doc_headers, vecs)
        calculated_clusters = self.clusters
        cluster_centroids = None
        converged = False

        while not converged:
            self.clusters = calculated_clusters
            # Passing the previous centroids lets a cluster that lost all its
            # members keep its old centroid instead of turning into NaN.
            cluster_centroids = self.get_centroids(
                calculated_clusters, previous_centroids=cluster_centroids
            )
            self.dist_matrix = self.get_absolute_distances(doc_headers, cluster_centroids)
            calculated_clusters = self.get_clusters(self.dist_matrix)
            converged = self.clusters == calculated_clusters
        return calculated_clusters

In [25]:
# Five clusters, the same count as the five seed centroids of the prototype cells.
knn = KNN(5)

In [26]:
# Run the clustering. The result maps cluster id -> list of document positions
# (indices into doc_headers). The seed documents are picked with
# np.random.shuffle, so the output can differ between runs.
knn.cluster(doc_headers, vecs)

{0: [12],
 1: [21],
 2: [5],
 3: [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 22],
 4: [8]}

In [30]:
# Per cluster: the member closest to the centroid ('leader') and the remaining
# members ('followers').
hier = knn.cluster_hierarchy()

In [32]:
hier

{0: {'followers': [], 'leader': 12},
 1: {'followers': [], 'leader': 21},
 2: {'followers': [], 'leader': 5},
 3: {'followers': [0,
   2,
   3,
   4,
   6,
   7,
   9,
   10,
   11,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   22],
  'leader': 1},
 4: {'followers': [], 'leader': 8}}

In [50]:
# Leader of every cluster and the follower list of every cluster, in the
# iteration order of `hier`.
leaders = [entry['leader'] for entry in hier.values()]
followers = [entry['followers'] for entry in hier.values()]

In [59]:
# NOTE: this rebinds `clusters` (a dict of member lists in the earlier cells)
# to a list of {'leader', 'followers'} dicts.
clusters = [hier[cluster_id] for cluster_id in hier]

In [64]:
# Same information as `leaders` / `followers`, read from the list form.
leader_ids = [entry['leader'] for entry in clusters]
followers_ids = [entry['followers'] for entry in clusters]

In [66]:
# NOTE(review): `scores` is not assigned in any visible cell, so this cell
# raises the NameError shown below. Define it first or remove the cell.
scores

NameError: name 'scores' is not defined