# Density-Based Clustering
Alex Gajowski

In [20]:
# Module-level numeric constants.
# NOTE(review): none of these three names is read by the code visible in this
# notebook export — confirm whether a later cell still depends on them.
epsilon = 0.6
eta = 0.7
mu = 12

In [21]:
import csv

In [22]:
class Vert:
    """A graph vertex holding two per-instance dictionaries.

    ``edges`` and ``attrs`` both start out empty and are created fresh for
    every instance, so vertices never share state.
    """

    def __init__(self):
        # Neighbour bookkeeping for this vertex (filled in by the caller).
        self.edges = {}
        # Free-form attributes attached to this vertex.
        self.attrs = {}

    def __repr__(self):
        """Return ``edges:<edges dict>, attrs:<attrs dict>``."""
        return f'edges:{self.edges}, attrs:{self.attrs}'

In [23]:
# Adjacency map: vertex id (as read from the CSV) -> Vert.
G = {}
# Squared weight of every accepted CSV row, in file order.
edgeWeights = []

with open('cora_edges.csv') as edgesFile:
    coraEdges = csv.reader(edgesFile)
    next(coraEdges)  # discard the first row
    for row in coraEdges:
        src, dst = row[0], row[1]
        weight = float(row[2]) ** 2

        # Rows whose squared weight reaches 1 are dropped entirely.
        if weight >= 1:
            continue
        edgeWeights.append(weight)

        # Make sure both endpoints exist, source first.
        G.setdefault(src, Vert())
        G.setdefault(dst, Vert())

        # Record the edge in both directions; when a pair repeats, the
        # weight stored first is kept.
        G[src].edges.setdefault(dst, weight)
        G[dst].edges.setdefault(src, weight)


#G


In [24]:
def commonNeighbors(u, v):
    """Return the Jaccard similarity of the neighbourhoods of ``u`` and ``v``.

    Both vertices are looked up in the module-level graph ``G``. The result is
    the number of neighbours they share divided by the number of distinct
    neighbours they have between them.

    Raises:
        KeyError: if ``u`` or ``v`` is not a key of ``G``.
        ZeroDivisionError: if neither vertex has any neighbour.
    """
    neighborsU = G[u].edges.keys()
    neighborsV = G[v].edges.keys()

    shared = neighborsU & neighborsV
    combined = neighborsU | neighborsV
    return len(shared) / len(combined)

In [25]:
def getNextNode(cluster, clNeighbors):
    """Choose the frontier vertex that is most strongly tied to ``cluster``.

    Reads the module-level graph ``G``.

    Selection rule:
      1. For every candidate, add up the weights of its edges that lead into
         ``cluster``.
      2. Keep the candidates whose sum equals the largest sum.
      3. Among those, prefer the one with the most neighbours that are outside
         ``cluster`` but inside ``clNeighbors``.
    Remaining ties go to the candidate met first when iterating
    ``clNeighbors``.

    Args:
        cluster: collection of vertex ids already in the cluster.
        clNeighbors: collection of candidate vertex ids; it is iterated more
            than once, so it must not be a one-shot iterator.

    Returns:
        A tuple ``(vertex, weightSum, frontierLinks)`` for the chosen vertex.

    Raises:
        ValueError: if ``clNeighbors`` is empty.
        KeyError: if a candidate is not a key of ``G``.
    """
    if not clNeighbors:
        raise ValueError('getNextNode() needs at least one candidate in clNeighbors')

    # Built once instead of once per candidate.
    members = set(cluster)

    # Step 1: weight of each candidate's connection to the cluster.
    weighted = []
    for candidate in clNeighbors:
        adjacency = G[candidate].edges
        linked = members.intersection(set(adjacency.keys()))
        weighted.append((candidate, sum(adjacency[m] for m in linked)))

    # Step 2: exact equality is intentional — only true ties with the maximum
    # go on to the secondary criterion.
    bestWeight = max(total for _, total in weighted)
    strongest = [(candidate, total) for candidate, total in weighted if total == bestWeight]

    # Step 3: count each finalist's neighbours that are still on the frontier.
    ranked = []
    for candidate, total in strongest:
        outside = set(G[candidate].edges.keys()).difference(members)
        ranked.append((candidate, total, len(outside.intersection(clNeighbors))))

    # max() returns the first maximal item, the same element a stable
    # descending sort would place at index 0.
    return max(ranked, key=lambda entry: entry[2])

In [26]:
#cl = set()
#cl.add('35')
#cl.add('1050679')

#clN = set()
#clN.update([x for x in G['35'].edges.keys()])
#clN.update([x for x in G['1050679'].edges.keys()])

#getNextNode(cl, clN)

In [27]:
#sorted([(x[0], len(x[1].edges)) for x in G.items()],key=lambda y: y[1], reverse=True)

In [28]:
def densityCluster(t_w):
    """Greedily grow clusters over the module-level graph ``G``.

    Vertices are visited from highest to lowest degree (ties keep the
    iteration order of ``G``). Every vertex not yet claimed by a cluster seeds
    a new one. The cluster's frontier starts as the seed's neighbours;
    ``getNextNode`` repeatedly picks a frontier vertex, which joins the cluster
    when its score reaches ``t_w`` and then contributes its own unclaimed
    neighbours to the frontier.

    NOTE(review): the previous revision carried bare step numbers (5-12) as
    comments, presumably pointing at a pseudocode listing that is not part of
    this file.

    Args:
        t_w: admission threshold. A candidate joins when the summed weight of
            its edges into the cluster plus the number of its neighbours that
            are outside the cluster and on the frontier is ``>= t_w``.

    Returns:
        A list of sets of vertex ids, one set per cluster, in the order the
        clusters were seeded. Every key of ``G`` appears in exactly one set.
    """
    # Vertices that already belong to some cluster.
    claimed = set()
    clusters = []

    # Highest-degree vertices get the first chance to seed a cluster.
    seeds = sorted(G, key=lambda vertex: len(G[vertex].edges), reverse=True)

    for seed in seeds:
        if seed in claimed:
            continue

        claimed.add(seed)
        cluster = {seed}
        frontier = set(G[seed].edges.keys())

        while frontier:
            candidate, edge_weight, no_neighbors = getNextNode(cluster, frontier)
            frontier.remove(candidate)

            # The seed's initial frontier may contain vertices that an earlier
            # cluster has already taken.
            if candidate in claimed:
                continue

            if (edge_weight + no_neighbors) >= t_w:
                cluster.add(candidate)
                claimed.add(candidate)
                # A rejected vertex is not claimed, so it can re-enter the
                # frontier here once another of its neighbours is admitted.
                frontier.update(n for n in G[candidate].edges.keys() if n not in claimed)

        clusters.append(cluster)

    return clusters


In [71]:
# Admission threshold handed to densityCluster.
T_W = 1.5
# Clusters ordered from largest to smallest; equal sizes keep the order in
# which densityCluster produced them.
cs = densityCluster(T_W)
cs.sort(key=len, reverse=True)

In [72]:
#[len(c) for c in cs]

In [73]:
import mysql.connector

# Connect to the 'cora' database on localhost as user 'alex'; no password
# argument is supplied here.
mydb = mysql.connector.connect(
    host='localhost',
    user='alex',
    database='cora'
)

# Cursor used for the queries issued further down in this notebook.
cursor = mydb.cursor()

In [74]:
# Display colour assigned to each topic label.
colorMap = dict(
    Genetic_Algorithms='red',
    Reinforcement_Learning='orange',
    Theory='yellow',
    Rule_Learning='green',
    Case_Based='blue',
    Probabilistic_Methods='purple',
    Neural_Networks='pink',
)

In [75]:
# For every cluster with at least five members: look up each member's
# class_label in the `paper` table, print the label distribution, and colour
# all members of the cluster by its most frequent label.
coloredVerts = {}
for cl in cs:
    if len(cl) < 5:
        continue

    # class_label -> number of cluster members carrying it.
    topicCounts = {}
    for pid in cl:
        # pid is bound as a query parameter instead of being concatenated into
        # the statement, so its content can never be interpreted as SQL.
        cursor.execute("SELECT class_label FROM paper WHERE paper_id=%s", (pid,))
        row = cursor.fetchone()
        if row is None:
            raise LookupError('no row in paper for paper_id=' + repr(pid))
        topic = row[0]

        topicCounts[topic] = topicCounts.get(topic, 0) + 1

    # Most frequent label first; equal counts keep first-seen order.
    cTopics = sorted(topicCounts.items(), key=lambda item: item[1], reverse=True)

    print(f'Size: {len(cl)}')
    for label, count in cTopics:
        percent = round(count / len(cl) * 100, 1)
        print(f'{percent}% {label} ({count})')

    # Raises KeyError if the dominant label has no entry in colorMap.
    dominantColor = colorMap[cTopics[0][0]]
    for u in cl:
        coloredVerts[u] = dominantColor

    print()

Size: 49
100.0% Genetic_Algorithms (49)

Size: 43
93.0% Reinforcement_Learning (40)
2.3% Genetic_Algorithms (1)
2.3% Case_Based (1)
2.3% Theory (1)

Size: 20
100.0% Theory (20)

Size: 9
100.0% Rule_Learning (9)

Size: 9
100.0% Neural_Networks (9)

Size: 9
100.0% Probabilistic_Methods (9)

Size: 8
75.0% Theory (6)
25.0% Case_Based (2)

Size: 8
100.0% Neural_Networks (8)

Size: 8
100.0% Neural_Networks (8)

Size: 7
100.0% Probabilistic_Methods (7)

Size: 6
100.0% Probabilistic_Methods (6)

Size: 6
100.0% Probabilistic_Methods (6)

Size: 5
100.0% Neural_Networks (5)

Size: 5
100.0% Probabilistic_Methods (5)



In [76]:
#coloredVerts

In [77]:
#for color, verts in coloredVerts.items():
#    print(color + ' = ' + str(verts).replace('[', 'c(').replace(']', ')'))

In [78]:
#"""
from pyvis.network import Network
import pandas as pd

# Network sized to 100% height and width, with a dark (#222222) background and
# white font colour.
cora_net = Network(height="100%", width="100%", bgcolor="#222222", font_color="white")
#cora_net.prep_notebook()

#cora_net.set_options(" ""
#{
#  "nodes": {
#    "fixed": {
#      "x": true,
#      "y": true
#    }
#  }
#}
#" "")

#cora_net.toggle_physics(False)

# set the physics layout of the network
#cora_net.barnes_hut()
#print(cora_data)

# Mirror G into the pyvis network. Vertices that belong to a reported cluster
# take that cluster's colour from coloredVerts; all others are gray.
for src, vert in G.items():
    src_color = coloredVerts.get(src, 'gray')
    cora_net.add_node(src, label=" ", title=src, color=src_color)

    for dst, w in vert.edges.items():
        dst_color = coloredVerts.get(dst, 'gray')

        # The neighbour is added before the edge so that both endpoints exist.
        # It must carry its OWN colour: the earlier code passed src_color
        # here, so a vertex first reached as a neighbour was drawn in its
        # neighbour's colour.
        # NOTE(review): pyvis appears to ignore add_node for an id that is
        # already present, which is why the first colour sticks — confirm for
        # the installed version.
        cora_net.add_node(dst, label=" ", title=dst, color=dst_color)

        cora_net.add_edge(src, dst, value=w)

# Copy every edge's 'value' (the weight passed to add_edge) into its 'title'.
# NOTE(review): presumably so the weight shows as the hover text — confirm
# against the pyvis documentation.
for e in cora_net.edges:
    e['title'] = e['value']
            
# Request the configuration buttons, restricted to the 'nodes' filter.
cora_net.show_buttons(filter_=['nodes'])

# The output file name embeds the threshold, e.g. "cora1.5.html" for T_W = 1.5.
cora_net.save_graph("cora" + str(T_W) + ".html")
#"""

In [37]:
#coloredVerts.keys()

In [38]:
# Notebook display expression: the contents of edgeWeights, largest first.
sorted(edgeWeights, reverse=True)

[0.9111570247933886,
 0.9111570247933886,
 0.8975069252077561,
 0.8975069252077561,
 0.7744,
 0.7159763313609467,
 0.7055999999999999,
 0.7055999999999999,
 0.6944444444444445,
 0.6824196597353497,
 0.6824196597353497,
 0.655328798185941,
 0.6523668639053255,
 0.626736111111111,
 0.626736111111111,
 0.5804988662131518,
 0.5804988662131518,
 0.5340236686390532,
 0.5017361111111112,
 0.48999999999999994,
 0.4792899408284023,
 0.4792899408284023,
 0.4444444444444444,
 0.4444444444444444,
 0.42925089179548154,
 0.42533081285444235,
 0.41326530612244905,
 0.41326530612244905,
 0.4049586776859504,
 0.390625,
 0.38525564803804996,
 0.37869822485207105,
 0.37051039697542537,
 0.3686224489795918,
 0.3686224489795918,
 0.34027777777777785,
 0.33284023668639046,
 0.33284023668639046,
 0.308641975308642,
 0.28994082840236685,
 0.2722117202268431,
 0.2688614540466392,
 0.25,
 0.25,
 0.25,
 0.25,
 0.25,
 0.25,
 0.25,
 0.25,
 0.2330558858501784,
 0.2304,
 0.2304,
 0.2304,
 0.2177777777777778,
 0.2177