In [76]:
import pandas as pd

In [303]:
df = pd.read_csv('tf-df.csv')

In [78]:
df.head()

Unnamed: 0,word,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,...,Document 23,Document 24,Document 25,Document 26,Document 27,Document 28,Document 29,Document 30,Document 31,df
0,caruth,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,the,2,1,0,0,2,0,0,0,2,...,0,1,0,1,1,0,0,0,0,17
2,content,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
3,respons,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,2
4,smu,1,2,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [79]:
doc_headers = [column for column in df.columns if column not in ['word', 'df']]

In [80]:
vecs = pd.DataFrame(df)

In [81]:
# ntc
import numpy as np
norm_doc_freq = np.log(len(doc_headers)/vecs['df'])

In [82]:
# tf.df
for doc in doc_headers:
    vecs[doc] = vecs[doc] * norm_doc_freq

In [83]:
# Cosine normalization:
for doc in doc_headers:
    sum_sq = np.sum(vecs[doc]**2)
    vecs[doc] = vecs[doc]/sum_sq

In [84]:
# KNN step a: randomly assign docs to clusters
cluster_centroids = [vecs[doc_headers[no]] for no in range(5)]

In [85]:
dist_matrix = np.zeros(shape=(len(doc_headers),len(cluster_centroids)), dtype=float)

In [86]:
# Populate the distance-matrix
for doc in range(len(doc_headers)):
    for centroid in range(len(cluster_centroids)):
        vec_difference = vecs[doc_headers[doc]] - cluster_centroids[centroid]
        magnitude = np.sum(vec_difference**2)
        dist_matrix[doc][centroid] = magnitude 

In [87]:
distances = np.apply_along_axis(np.argmin, 1, dist_matrix)

In [88]:
# initialize the clusters
clusters = dict()

In [89]:
for i in range(5):
    clusters[i] = []

In [90]:
for doc in range(len(distances)):
    nearest = distances[doc]
    clusters[nearest].append(doc)

In [91]:
# Get the vectors in cluster 1
cluster_vectors = [vecs[doc_headers[doc_nos]] for doc_nos in clusters[1]]

In [92]:
def get_centroids(clusters):
    cluster_centroids = []
    for i in clusters:
        cluster_vectors = [vecs[doc_headers[doc_nos]] for doc_nos in clusters[i]]
        centroid = np.sum(cluster_vectors, axis=0)/len(clusters[i])
        cluster_centroids.append(centroid)
    return cluster_centroids

In [93]:
# Get a matrix of shape(docs x clusters) containing the distance of doc from each cluster centroid.
def get_absolute_distances(doc_headers, cluster_centroids):
    dist_matrix = np.zeros(shape=(len(doc_headers),len(cluster_centroids)), dtype=float)
    for vec in range(len(doc_headers)):
        for centroid in range(len(cluster_centroids)):
            vec_difference = vecs[doc_headers[vec]] - cluster_centroids[centroid]
            magnitude = np.sum(vec_difference**2)
            dist_matrix[vec][centroid] = magnitude
    return dist_matrix

In [94]:
def get_clusters(dist_matrix):
    clusters = dict()
    for i in range(len(dist_matrix[0])):
        clusters[i] = []
    distances = np.apply_along_axis(np.argmin, 1, dist_matrix)
    for doc in range(len(distances)):
        nearest = distances[doc]
        clusters[nearest].append(doc)
    return clusters

In [302]:
class KNN:
    def __init__(self, N):
        self.N = N
    
    def get_empty_clusters(self, N):
        clusters = dict()
        for i in range(N):
            clusters[i] = []
        return clusters
    
    def initialize_state(self, doc_headers, vecs):
        self.doc_headers = doc_headers
        self.vecs = vecs
        self.clusters = self.get_empty_clusters(self.N)
        
        #indices = np.arange(len(doc_headers))
        # np.random.shuffle(indices)
        doc_count = len(doc_headers)
        indices = np.arange(start=0, stop=doc_count, step=doc_count/self.N, dtype=np.int)
        
        for i in range(self.N):
            self.clusters[i].append(indices[i])
    
    # Calculates the centroids given clusters
    def get_centroids(self, clusters):
        cluster_centroids = []
        for i in clusters:
            cluster_vectors = [self.vecs[self.doc_headers[doc_nos]] for doc_nos in clusters[i]]
            #centroid = np.sum(cluster_vectors, axis=0)/len(clusters[i])
            centroid = np.mean(cluster_vectors, axis=0)
            cluster_centroids.append(centroid)
        return cluster_centroids

    # Get a matrix of shape(docs x clusters) containing the distance of doc from each cluster centroid.
    def get_absolute_distances(self, doc_headers, cluster_centroids):
        dist_matrix = np.zeros(shape=(len(doc_headers),len(cluster_centroids)), dtype=float)
        for vec in range(len(doc_headers)):
            for centroid in range(len(cluster_centroids)):
                vec_difference = self.vecs[doc_headers[vec]] - cluster_centroids[centroid]
                magnitude = np.sum(vec_difference**2)
                dist_matrix[vec][centroid] = np.sqrt(magnitude)
        return dist_matrix
    
    def get_clusters(self, dist_matrix):
        clusters = self.get_empty_clusters(self.N)
        distances = np.apply_along_axis(np.argmin, 1, dist_matrix)
        for doc in range(len(distances)):
            nearest = distances[doc]
            clusters[nearest].append(doc)
        return clusters
    
    def cluster_hierarchy(self):
        clust_hier = dict()
        clusters = self.clusters
        for i in clusters:
            clust_hier[i] = dict()
            dist_from_centroid = self.dist_matrix[clusters[i], i]
            closest = np.argmin(dist_from_centroid)
            leader = clusters[i][closest]
            clust_hier[i]['leader'] = leader
            clust_hier[i]['followers'] = [doc for doc in clusters[i] if doc!=leader]
        return clust_hier
    
    def cluster(self, doc_headers, vecs):
        self.initialize_state(doc_headers, vecs)
        self.clusters = self.iteration(self.clusters)
        converged = False
        
        while(converged == False):
            calculated_clusters = self.iteration(self.clusters)
            converged = self.clusters == calculated_clusters
            self.clusters = calculated_clusters
        return calculated_clusters
    
    def iteration(self, clusters):
        cluster_centroids = self.get_centroids(clusters)
        self.dist_matrix = self.get_absolute_distances(self.doc_headers, cluster_centroids)
        return self.get_clusters(self.dist_matrix)

In [306]:
knn = KNN(5)

In [226]:
import scorer

In [304]:
vecs = df.copy()

In [307]:
knn.cluster(doc_headers, scorer.nnc_normalize(vecs[doc_headers], vecs['df']))

{0: [0, 1, 3, 7, 11, 14, 27, 28, 30],
 1: [5, 6],
 2: [8, 12, 15, 16, 20, 21, 22, 26, 29],
 3: [2, 4, 9, 13, 17, 18, 23, 25],
 4: [10, 19, 24]}

In [308]:
make_legible()

{0: {'follower': ['SMU CSE 5337/7337 Spring 2018 Schedule',
   'SMU CSE 5/7337 Spring 2018 Textfiles',
   'This is the magic file',
   'baseball4',
   'golf2',
   'CSE 7337 Spring 2018 distance students exam 1 location',
   'CSE 5337/7337 User-Agent',
   'Porter Stemmer Online'],
  'leader': 'Freeman Moore - SMU Spring 2018'},
 1: {'follower': ['building1'], 'leader': 'building2'},
 2: {'follower': ['SMU CSE 5/7337 Spring 2018 text files"',
   'baseball5',
   'Mockingbird part 5',
   'Mockingbird part 3',
   'Mockingbird part 2',
   'Mockingbird novel part 1'],
  'leader': 'Mockingbird part 4'},
 3: {'follower': ['basketball1',
   'baseball3',
   'basketball5',
   'basketball4',
   'basketball2',
   'golf4',
   'golf1',
   'Levenshtein Distance demo'],
  'leader': 'basketball3'},
 4: {'follower': ['baseball2', 'golf3', 'baseball1'], 'leader': 'golf5'}}

In [235]:
titles_previews = pd.read_csv('title_preview.csv')

In [236]:
tp = titles_previews.copy()

In [237]:
cluster_hierarchy = knn.cluster_hierarchy()

In [267]:
knn.cluster_hierarchy()

{0: {'followers': [0, 1, 7, 11, 14, 27, 28, 30], 'leader': 3},
 1: {'followers': [6], 'leader': 5},
 2: {'followers': [8, 12, 16, 20, 21, 22, 26, 29], 'leader': 15},
 3: {'followers': [2, 4, 13, 17, 18, 23, 25], 'leader': 9},
 4: {'followers': [10, 24], 'leader': 19}}

In [266]:
cluster_hierarchy

{0: {'followers': [1, 2, 7, 14, 25, 27, 28, 30], 'leader': 0},
 1: {'followers': [6], 'leader': 5},
 2: {'followers': [3, 4, 8, 10, 11, 13], 'leader': 9},
 3: {'followers': [12, 15, 17, 18, 22, 24, 26, 29], 'leader': 21},
 4: {'followers': [16, 19, 20], 'leader': 23}}

In [240]:
leaders_idx = [cluster_hierarchy[cluster]['leader'] for cluster in cluster_hierarchy]
leaders = [scorer.doc_headers[idx] for idx in leaders_idx]
followers_idx = [cluster_hierarchy[cluster]['followers'] for cluster in cluster_hierarchy]

In [103]:
def make_legible():
    legible_hier = dict()
    def get_title_by_idx(idx):
        header = doc_headers[idx]
        table = tp
        table = table[table['Document'] == header]
        return table['title'].values[0]
    for cluster in range(len(leaders_idx)):
        leader = get_title_by_idx(leaders_idx[cluster])
        followers = [get_title_by_idx(follower_id) for follower_id in followers_idx[cluster]]
        legible_hier[cluster] = dict()
        legible_hier[cluster]['leader'] = str(leader)
        legible_hier[cluster]['follower'] = [str(follower) for follower in followers]
    return legible_hier

In [244]:
make_legible()

{0: {'follower': ['SMU CSE 5337/7337 Spring 2018 Schedule',
   'SMU CSE 5/7337 Spring 2018 Textfiles',
   'This is the magic file',
   'baseball4',
   'golf2',
   'CSE 7337 Spring 2018 distance students exam 1 location',
   'CSE 5337/7337 User-Agent',
   'Porter Stemmer Online'],
  'leader': 'Freeman Moore - SMU Spring 2018'},
 1: {'follower': ['building1'], 'leader': 'building2'},
 2: {'follower': ['SMU CSE 5/7337 Spring 2018 text files"',
   'baseball5',
   'Mockingbird part 5',
   'Mockingbird part 3',
   'Mockingbird part 2',
   'Mockingbird novel part 1'],
  'leader': 'Mockingbird part 4'},
 3: {'follower': ['basketball1',
   'baseball3',
   'basketball5',
   'basketball4',
   'basketball2',
   'golf4',
   'golf1',
   'Levenshtein Distance demo'],
  'leader': 'basketball3'},
 4: {'follower': ['baseball2', 'golf3', 'baseball1'], 'leader': 'golf5'}}

In [268]:
legible = make_legible()

In [274]:
scores

NameError: name 'scores' is not defined

In [273]:
a

cluster 0
leader: Freeman Moore - SMU Spring 2018
followers: ['SMU CSE 5337/7337 Spring 2018 Schedule', 'SMU CSE 5/7337 Spring 2018 Textfiles', 'This is the magic file', 'baseball4', 'golf2', 'CSE 7337 Spring 2018 distance students exam 1 location', 'CSE 5337/7337 User-Agent', 'Porter Stemmer Online']

cluster 1
leader: building2
followers: ['building1']

cluster 2
leader: Mockingbird part 4
followers: ['SMU CSE 5/7337 Spring 2018 text files"', 'baseball5', 'Mockingbird part 5', 'Mockingbird part 3', 'Mockingbird part 2', 'Mockingbird novel part 1']

cluster 3
leader: basketball3
followers: ['basketball1', 'baseball3', 'basketball5', 'basketball4', 'basketball2', 'golf4', 'golf1', 'Levenshtein Distance demo']

cluster 4
leader: golf5
followers: ['baseball2', 'golf3', 'baseball1']



In [287]:
docs = vecs.loc[:4, doc_headers[:7]]

In [288]:
docs

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7
0,1,0,0,0,0,0,0
1,2,1,0,0,2,0,0
2,1,1,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,2,1,1,0,0,0


In [289]:
docs.columns = list(map(lambda x: 'd'+str(x), np.arange(7)))

In [290]:
docs['d1'] = [1, 9, 4, 7.375, 8]
docs['d2'] = [1, 9, 4, 5.875, 8]
docs['d3'] = [1, 8, 3, 6, 8]
docs['d4'] = [2, 13, 9, 10.83333, 6]
docs['d5'] = [2, 12, 2, 7.333333, 6]
docs['d6'] = [2, 14, 4, 8.25, 8]
docs['d7'] = [3, 8, 4, 6, 4]

In [133]:
docs.columns = ['d1', 'd2', 'd3', 'd4']

In [168]:
docs['d1']

0    1
1    1
2    1
3    1
Name: d1, dtype: object

In [291]:
docs

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7
0,1,1.0,1.0,1,2.0,2.0,2.0,3
1,2,9.0,9.0,8,13.0,12.0,14.0,8
2,1,4.0,4.0,3,9.0,2.0,4.0,4
3,1,7.375,5.875,6,10.83333,7.333333,8.25,6
4,1,8.0,8.0,8,6.0,6.0,8.0,4


In [295]:
knn = KNN(2)

In [296]:
knn.cluster(docs.columns, docs)

{0: [0], 1: [1, 2, 3, 4, 5, 6, 7]}

In [297]:
test_data = pd.read_csv('test_data.csv')

In [298]:
td = test_data.iloc[:, :-1].copy()

In [299]:
td

Unnamed: 0,tf doc1,tf doc2,tf doc3,tf doc4,tf doc5,tf doc6,tf doc7,tf doc8,tf doc9,tf doc10,tf doc11
0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0
2,1,1,0,1,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,1,1,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,1,1,0,1,1,0


In [300]:
knn_test = KNN(2)

In [301]:
knn_test.cluster(td.columns, td)

{0: [0, 1, 2, 3], 1: [4, 5, 6, 7, 8, 9, 10]}