# Density-Based Clustering
Alex Gajowski

In [1]:
epsilon = 0.6
eta = 0.7
mu = 12

In [2]:
import csv

In [3]:
G = dict()
with open('cora_edges.csv') as edgesFile:
    coraEdges = csv.reader(edgesFile)
    
    for e in coraEdges:
        if not e[0] in G:
            G[e[0]] = list()
        if not e[1] in G:
            G[e[1]] = list()
        G[e[0]].append( (e[1], e[2]) )
        G[e[1]].append( (e[0], e[2]) )

G

{'1033': [('35', '0.047619047619047616'),
  ('1034', '0.42424242424242425'),
  ('1107062', '0.16216216216216217'),
  ('41714', '0.07692307692307693'),
  ('45605', '0.037037037037037035')],
 '35': [('1033', '0.047619047619047616'),
  ('1688', '0.07692307692307693'),
  ('8865', '0.05'),
  ('12576', '0.08571428571428572'),
  ('15670', '0.09090909090909091'),
  ('18582', '0.046511627906976744'),
  ('28290', '0.023809523809523808'),
  ('28851', '0.08108108108108109'),
  ('33904', '0.08571428571428572'),
  ('33907', '0.11428571428571428'),
  ('35061', '0.13043478260869565'),
  ('41714', '0.02702702702702703'),
  ('44368', '0.05263157894736842'),
  ('45599', '0.18421052631578946'),
  ('46079', '0.08108108108108109'),
  ('46431', '0.024390243902439025'),
  ('48766', '0.15625'),
  ('54129', '0.07692307692307693'),
  ('54131', '0.05128205128205128'),
  ('56119', '0.045454545454545456'),
  ('66556', '0.05714285714285714'),
  ('66563', '0.027777777777777776'),
  ('66805', '0.07894736842105263'),
 

In [4]:
def commonNeighbors(u, v):
    n_u = set([x[0] for x in G[u]])
    n_v = set([x[0] for x in G[v]])

    return len(n_u.intersection(n_v)) / len(n_u.union(n_v))

In [9]:
def densityCluster(t_w):
    # 5 - 7
    nodeExists = dict()
    for node, edges_node in G.items():
        nodeExists[node] = True
    
    # 8
    clusters = list()
    
    # 9
    nodeDegree = dict()
    # 10 - 12
    for node, edges_node in G.items():
        nodeDegree[node] = len(edges_node)
        
    
    for nId in sorted([int(x) for x in G.keys()], reverse=True):
        node = str(nId)
        if nodeExists[node]:
            c = list()
            maxDegree = node
            nodeDegree[maxDegree] = -1
            nodeExists[maxDegree] = False
            neighbors = list(G[maxDegree])
            c.append(maxDegree)
            
            for neighbor, edge_weight in neighbors:
                #print(float(edge_weight) + commonNeighbors(node, neighbor))
                
                if (float(edge_weight) + commonNeighbors(node, neighbor)) >= t_w:
                    c.append(neighbor)
                    nodeExists[neighbor] = False
                    nodeDegree[neighbor] = -1
                    
                    neighbors.extend( filter(lambda x: nodeExists[x[0]], G[neighbor]) )
                                    
            #print(neighbors)
            
            clusters.append(c)
            
    return clusters


In [47]:
cs = sorted(densityCluster(.3), key=lambda x: len(x), reverse=True)

In [48]:
import mysql.connector

mydb = mysql.connector.connect(
    host='localhost',
    user='alex',
    database='cora'
)

cursor = mydb.cursor()

In [49]:
for cl in cs:
    if len(cl) < 20:
        continue
    
    topicCounts = dict()
    for pid in cl:
        cursor.execute("SELECT class_label FROM paper WHERE paper_id="+pid)
        topic = next(cursor)[0]
        if not topic in topicCounts:
            topicCounts[topic] = 0
        topicCounts[topic] += 1
    
    cTopics = sorted(topicCounts.items(), key=lambda x: x[1], reverse=True)

    print('Size: ' + str(len(cl)))
    for cTopic in cTopics:
        topicPercent = cTopic[1]/len(cl) * 100
        percent = str(round(topicPercent, 1))
        print(percent + '% ' + cTopic[0] + ' (' + str(cTopic[1]) + ')')

    print()

Size: 43
95.3% Neural_Networks (41)
4.7% Theory (2)

Size: 38
81.6% Genetic_Algorithms (31)
15.8% Case_Based (6)
2.6% Reinforcement_Learning (1)

Size: 31
100.0% Theory (31)

Size: 30
100.0% Case_Based (30)

Size: 28
100.0% Neural_Networks (28)

Size: 28
100.0% Neural_Networks (28)

Size: 26
88.5% Neural_Networks (23)
11.5% Probabilistic_Methods (3)

Size: 25
56.0% Neural_Networks (14)
44.0% Probabilistic_Methods (11)

Size: 25
100.0% Neural_Networks (25)

Size: 24
100.0% Rule_Learning (24)

Size: 21
100.0% Probabilistic_Methods (21)

Size: 20
100.0% Probabilistic_Methods (20)

