In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('tf-df.csv')

In [4]:
df.head()

Unnamed: 0,word,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,...,Document 28,Document 29,Document 30,Document 31,Document 32,Document 33,Document 34,Document 35,Document 36,df
0,opinion,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,phd,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,latest,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,contact,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,inform,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [5]:
doc_headers = [column for column in df.columns if column not in ['word', 'df']]

In [6]:
vecs = pd.DataFrame(df)

In [7]:
# ntc
import numpy as np
norm_doc_freq = np.log(len(doc_headers)/vecs['df'])

In [8]:
# tf.df
for doc in doc_headers:
    vecs[doc] = vecs[doc] * norm_doc_freq

In [9]:
# Cosine normalization:
for doc in doc_headers:
    sum_sq = np.sum(vecs[doc]**2)
    vecs[doc] = vecs[doc]/sum_sq

In [10]:
# KNN step a: randomly assign docs to clusters
cluster_centroids = [vecs[doc_headers[no]] for no in range(5)]

In [11]:
dist_matrix = np.zeros(shape=(len(doc_headers),len(cluster_centroids)), dtype=float)

In [12]:
# Populate the distance-matrix
for doc in range(len(doc_headers)):
    for centroid in range(len(cluster_centroids)):
        vec_difference = vecs[doc_headers[doc]] - cluster_centroids[centroid]
        magnitude = np.sum(vec_difference**2)
        dist_matrix[doc][centroid] = magnitude 

In [13]:
distances = np.apply_along_axis(np.argmin, 1, dist_matrix)

In [14]:
# initialize the clusters
clusters = dict()

In [15]:
for i in range(5):
    clusters[i] = []

In [16]:
for doc in range(len(distances)):
    nearest = distances[doc]
    clusters[nearest].append(doc)

In [17]:
# Get the vectors in cluster 1
cluster_vectors = [vecs[doc_headers[doc_nos]] for doc_nos in clusters[1]]

In [18]:
def get_centroids(clusters):
    cluster_centroids = []
    for i in clusters:
        cluster_vectors = [vecs[doc_headers[doc_nos]] for doc_nos in clusters[i]]
        centroid = np.sum(cluster_vectors, axis=0)/len(clusters[i])
        cluster_centroids.append(centroid)
    return cluster_centroids

In [19]:
# Get a matrix of shape(docs x clusters) containing the distance of doc from each cluster centroid.
def get_absolute_distances(doc_headers, cluster_centroids):
    dist_matrix = np.zeros(shape=(len(doc_headers),len(cluster_centroids)), dtype=float)
    for vec in range(len(doc_headers)):
        for centroid in range(len(cluster_centroids)):
            vec_difference = vecs[doc_headers[vec]] - cluster_centroids[centroid]
            magnitude = np.sum(vec_difference**2)
            dist_matrix[vec][centroid] = magnitude
    return dist_matrix

In [20]:
def get_clusters(dist_matrix):
    clusters = dict()
    for i in range(len(dist_matrix[0])):
        clusters[i] = []
    distances = np.apply_along_axis(np.argmin, 1, dist_matrix)
    for doc in range(len(distances)):
        nearest = distances[doc]
        clusters[nearest].append(doc)
    return clusters

In [21]:
class KNN:
    def __init__(self, N):
        self.N = N
    
    def get_empty_clusters(self, N):
        clusters = dict()
        for i in range(N):
            clusters[i] = []
        return clusters
    
    def initialize_state(self, doc_headers, vecs):
        self.doc_headers = doc_headers
        self.vecs = vecs
        self.clusters = self.get_empty_clusters(self.N)
        
        #indices = np.arange(len(doc_headers))
        # np.random.shuffle(indices)
        doc_count = len(doc_headers)
        indices = np.arange(start=0, stop=doc_count, step=doc_count/self.N, dtype=np.int)
        
        for i in range(self.N):
            self.clusters[i].append(indices[i])
    
    # Calculates the centroids given clusters
    def get_centroids(self, clusters):
        cluster_centroids = []
        for i in clusters:
            cluster_vectors = [self.vecs[self.doc_headers[doc_nos]] for doc_nos in clusters[i]]
            #centroid = np.sum(cluster_vectors, axis=0)/len(clusters[i])
            centroid = np.mean(cluster_vectors, axis=0)
            cluster_centroids.append(centroid)
        return cluster_centroids

    # Get a matrix of shape(docs x clusters) containing the distance of doc from each cluster centroid.
    def get_absolute_distances(self, doc_headers, cluster_centroids):
        dist_matrix = np.zeros(shape=(len(doc_headers),len(cluster_centroids)), dtype=float)
        for vec in range(len(doc_headers)):
            for centroid in range(len(cluster_centroids)):
                vec_difference = self.vecs[doc_headers[vec]] - cluster_centroids[centroid]
                magnitude = np.sum(vec_difference**2)
                dist_matrix[vec][centroid] = np.sqrt(magnitude)
        return dist_matrix
    
    def get_clusters(self, dist_matrix):
        clusters = self.get_empty_clusters(self.N)
        distances = np.apply_along_axis(np.argmin, 1, dist_matrix)
        for doc in range(len(distances)):
            nearest = distances[doc]
            clusters[nearest].append(doc)
        return clusters
    
    def cluster_hierarchy(self):
        clust_hier = dict()
        clusters = self.clusters
        for i in clusters:
            clust_hier[i] = dict()
            dist_from_centroid = self.dist_matrix[clusters[i], i]
            closest = np.argmin(dist_from_centroid)
            leader = clusters[i][closest]
            clust_hier[i]['leader'] = leader
            clust_hier[i]['followers'] = [doc for doc in clusters[i] if doc!=leader]
        return clust_hier
    
    def cluster(self, doc_headers, vecs):
        self.initialize_state(doc_headers, vecs)
        self.clusters = self.iteration(self.clusters)
        converged = False
        
        while(converged == False):
            calculated_clusters = self.iteration(self.clusters)
            converged = self.clusters == calculated_clusters
            self.clusters = calculated_clusters
        return calculated_clusters
    
    def iteration(self, clusters):
        cluster_centroids = self.get_centroids(clusters)
        self.dist_matrix = self.get_absolute_distances(self.doc_headers, cluster_centroids)
        return self.get_clusters(self.dist_matrix)

In [22]:
knn = KNN(5)

In [23]:
import scorer

In [24]:
vecs = df.copy()

In [25]:
knn.cluster(doc_headers, scorer.nnc_normalize(vecs[doc_headers], vecs['df']))

{0: [0, 1, 5, 6, 34, 35],
 1: [3, 7, 15],
 2: [8, 9, 11, 13, 14, 31],
 3: [2, 12, 19, 21, 23, 24, 29, 33],
 4: [4, 10, 16, 17, 18, 20, 22, 25, 26, 27, 28, 30, 32]}

In [27]:
make_legible()

NameError: name 'make_legible' is not defined

In [28]:
titles_previews = pd.read_csv('title_preview.csv')

In [29]:
tp = titles_previews.copy()

In [30]:
cluster_hierarchy = knn.cluster_hierarchy()

In [31]:
knn.cluster_hierarchy()

{0: {'followers': [0, 1, 6, 34, 35], 'leader': 5},
 1: {'followers': [7, 15], 'leader': 3},
 2: {'followers': [8, 11, 13, 14, 31], 'leader': 9},
 3: {'followers': [12, 19, 21, 23, 24, 29, 33], 'leader': 2},
 4: {'followers': [4, 10, 17, 18, 20, 22, 25, 26, 27, 28, 30, 32],
  'leader': 16}}

In [32]:
cluster_hierarchy

{0: {'followers': [0, 1, 6, 34, 35], 'leader': 5},
 1: {'followers': [7, 15], 'leader': 3},
 2: {'followers': [8, 11, 13, 14, 31], 'leader': 9},
 3: {'followers': [12, 19, 21, 23, 24, 29, 33], 'leader': 2},
 4: {'followers': [4, 10, 17, 18, 20, 22, 25, 26, 27, 28, 30, 32],
  'leader': 16}}

In [33]:
leaders_idx = [cluster_hierarchy[cluster]['leader'] for cluster in cluster_hierarchy]
leaders = [scorer.doc_headers[idx] for idx in leaders_idx]
followers_idx = [cluster_hierarchy[cluster]['followers'] for cluster in cluster_hierarchy]

In [34]:
def make_legible():
    legible_hier = dict()
    def get_title_by_idx(idx):
        header = doc_headers[idx]
        table = tp
        table = table[table['Document'] == header]
        return table['title'].values[0]
    for cluster in range(len(leaders_idx)):
        leader = get_title_by_idx(leaders_idx[cluster])
        followers = [get_title_by_idx(follower_id) for follower_id in followers_idx[cluster]]
        legible_hier[cluster] = dict()
        legible_hier[cluster]['leader'] = str(leader)
        legible_hier[cluster]['follower'] = [str(follower) for follower in followers]
    return legible_hier

In [35]:
make_legible()

{0: {'follower': ['Freeman Moore - SMU Spring 2018',
   'SMU CSE 5337/7337 Spring 2018 Schedule',
   'building1',
   'CSE 7337 Spring 2018 distance students exam 1 location',
   'Porter Stemmer Online'],
  'leader': 'building2'},
 1: {'follower': ['This is the magic file', 'CSE 5337/7337 User-Agent'],
  'leader': 'SMU CSE 5/7337 Spring 2018 text files"'},
 2: {'follower': ['Mockingbird part 5',
   'Mockingbird part 3',
   'Mockingbird part 2',
   'Mockingbird novel part 1',
   'golf3'],
  'leader': 'Mockingbird part 4'},
 3: {'follower': ['basketball2',
   'football2',
   'baseball5',
   'basketball4',
   'baseball4',
   'golf5',
   'golf1'],
  'leader': 'SMU CSE 5/7337 Spring 2018 Textfiles'},
 4: {'follower': ['football5',
   'baseball1',
   'baseball2',
   'football3',
   'football1',
   'basketball5',
   'Levenshtein Distance demo',
   'basketball3',
   'baseball3',
   'basketball1',
   'golf4',
   'golf2'],
  'leader': 'football4'}}

In [36]:
legible = make_legible()

In [38]:
docs = vecs.loc[:4, doc_headers[:7]]

In [39]:
docs

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7
0,0.003689,0.0,0.0,0.0,0.0,0.0,0.0
1,0.003689,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002976,0.000332,0.0,0.0,0.0,0.0,0.0
3,0.003689,0.0,0.0,0.0,0.0,0.0,0.0
4,0.002558,0.000285,0.0,0.0,0.0,0.0,0.0


In [40]:
docs.columns = list(map(lambda x: 'd'+str(x), np.arange(7)))

In [41]:
docs['d1'] = [1, 9, 4, 7.375, 8]
docs['d2'] = [1, 9, 4, 5.875, 8]
docs['d3'] = [1, 8, 3, 6, 8]
docs['d4'] = [2, 13, 9, 10.83333, 6]
docs['d5'] = [2, 12, 2, 7.333333, 6]
docs['d6'] = [2, 14, 4, 8.25, 8]
docs['d7'] = [3, 8, 4, 6, 4]

In [43]:
docs['d1']

0    1.000
1    9.000
2    4.000
3    7.375
4    8.000
Name: d1, dtype: float64

In [44]:
docs

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7
0,0.003689,1.0,1.0,1,2.0,2.0,2.0,3
1,0.003689,9.0,9.0,8,13.0,12.0,14.0,8
2,0.002976,4.0,4.0,3,9.0,2.0,4.0,4
3,0.003689,7.375,5.875,6,10.83333,7.333333,8.25,6
4,0.002558,8.0,8.0,8,6.0,6.0,8.0,4


In [45]:
knn = KNN(2)

In [46]:
knn.cluster(docs.columns, docs)

{0: [0], 1: [1, 2, 3, 4, 5, 6, 7]}

In [47]:
test_data = pd.read_csv('test_data.csv')

In [48]:
td = test_data.iloc[:, :-1].copy()

In [49]:
td

Unnamed: 0,tf doc1,tf doc2,tf doc3,tf doc4,tf doc5,tf doc6,tf doc7,tf doc8,tf doc9,tf doc10,tf doc11
0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0
2,1,1,0,1,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,1,1,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,1,1,0,1,1,0


In [50]:
knn_test = KNN(2)

In [51]:
knn_test.cluster(td.columns, td)

{0: [0, 1, 2, 3], 1: [4, 5, 6, 7, 8, 9, 10]}

In [53]:
chier = knn.cluster_hierarchy()

In [83]:
scores = dict()
for cluster in chier:
    scores[cluster] = list()
    followers_idx = chier[cluster]['followers']
    leader_idx = chier[cluster]['leader']
    def get_doc_by_id(id):
        header = doc_headers[id]
        return df[header].copy()
    for i in range(len(followers_idx)):
        vec_sub = get_doc_by_id(leader_idx) - get_doc_by_id(followers_idx[i])
        score = np.sqrt(np.sum(vec_sub**2))
        scores[cluster].append(score)

In [84]:
scores

{0: [],
 1: [0.04714280508689833,
  0.04663348760679046,
  0.04741592493844627,
  0.20003664307438732,
  0.20003664307438732,
  0.09575834286466348]}

In [81]:
scores

[0.04714280508689833,
 0.04663348760679046,
 0.04741592493844627,
 0.20003664307438732,
 0.20003664307438732,
 0.09575834286466348]

In [54]:
followers_idx = chier[1]['followers']

In [55]:
followers_idx

[2, 3, 4, 5, 6, 7]

In [59]:
def get_doc_by_id(id):
    header = doc_headers[id]
    return df[header].copy()

In [65]:
fols = list(map(get_doc_by_id, followers_idx))