## Function: Uses the d3 library to visualize clusters of articles in a force-directed graph with each color corresponding to a different topic

## Create JSON object for article-article correlation

In [1]:
#Create JSON object for article-article correlation 

import pandas as pd
import numpy as np
import scipy.spatial
from IPython.display import Javascript

# Document/topic table: 100 topic-weight columns, read without a header row.
column_names = ["Topic"+str(i) for i in range(100)]
# Hand the path to pandas so read_csv opens *and closes* the file itself
# (the previous bare open() handle was never closed).
doc_vs_topic = pd.read_csv('doc_vs_topic.csv',header=None,names=column_names)

# Only the first N_GRAPH_ARTICLES rows take part in the article-article graph.
# NOTE(review): other cells iterate over every row of doc_vs_topic; confirm
# that restricting the graph to 63 articles is intentional.
N_GRAPH_ARTICLES = 63

# Pairwise cosine *distance* (0 = same direction, larger = less similar),
# computed in a single vectorised call instead of a Python double loop.
article_vectors = doc_vs_topic.iloc[:N_GRAPH_ARTICLES].values
article_sim_matrix = scipy.spatial.distance.cdist(article_vectors, article_vectors, metric='cosine')

df = pd.DataFrame(article_sim_matrix)

# Expose the matrix to the browser as window.artVsart ({column: {row: distance}}).
Javascript("""
           window.artVsart={};
           """.format(df.to_json()))


<IPython.core.display.Javascript object>

## Create JSON object for article names 

In [2]:
#JSON object for article names 

from openpyxl import Workbook, load_workbook
import json

# Read the per-article metadata (columns A-D, rows 1-172) from the
# 'DocumentOrder' sheet of the workbook.
wb = load_workbook(filename = 'GoogleNews_18July.xlsx')
# sheet_list1 = wb['Links']
sheet_list1 = wb['DocumentOrder']


def _ascii_cell(sheet, column, row):
    """Return the value of cell `column`+`row` encoded to ASCII, dropping unencodable characters."""
    return sheet[column + str(row)].value.encode(encoding='ascii',errors='ignore')


names, titles, urls, labels = [], [], [], []
for row in range(1,173):
    names.append(_ascii_cell(sheet_list1, 'A', row))
    titles.append(_ascii_cell(sheet_list1, 'B', row))
    # Column C is stored as-is (no ASCII encoding).
    labels.append(sheet_list1['C' + str(row)].value)
    urls.append(_ascii_cell(sheet_list1, 'D', row))

# One single-column frame per attribute; to_json() on these yields {"0": {...}}.
namesdf = pd.DataFrame(names)
titlesdf = pd.DataFrame(titles)
urlsdf = pd.DataFrame(urls)
labelsdf = pd.DataFrame(labels)

# Expose the article names to the browser as window.names.
names_json = namesdf.to_json()
Javascript("""
           window.names={};
           """.format(names_json))



<IPython.core.display.Javascript object>

## Calculate centroids for each known topic and find the 6 articles most similar to each centroid

In [None]:
import numpy as np
import math

no_of_topics = 15
# How many nearest articles are reported for each cluster centroid.
TOP_DOCS_PER_CLUSTER = 6

# Fixed group for every name prefix other than 'A'.
PREFIX_GROUPS = {'F': 14, 'E': 7, 'S': 14, 'D': 1}


def _group_for(index, name):
    """Return the known-topic group id for an article, or None for an unrecognised name prefix.

    'A' articles are grouped by position (article 94 is special-cased into
    group 12); every other recognised prefix maps to a fixed group.
    """
    prefix = name[0]
    if prefix=='A':
        return 12 if index==94 else int(math.floor((index+1)/8) + 1)
    return PREFIX_GROUPS.get(prefix)


clustercentroids=np.zeros((no_of_topics,100))
grouplen = [0]*no_of_topics

for i in range(len(doc_vs_topic)):
    group = _group_for(i, names[i])
    if group is None:
        # Previously an unknown prefix fell through the if/elif chain and the
        # article was silently added to the *previous* article's group.
        continue
    clustercentroids[group] += doc_vs_topic.loc[i,:].values
    grouplen[group] += 1

# Turn the per-group sums into means; group 0 is never assigned by _group_for.
for i in range(1,no_of_topics):
    if grouplen[i]:
        clustercentroids[i]/=grouplen[i]


# Now get the TOP_DOCS_PER_CLUSTER closest docs to each cluster centroid
# (cosine distance, so smaller means more similar).
sim_docs_all=[]
for i in range(1,no_of_topics):
    if not grouplen[i]:
        # An empty group has no centroid; keep the slot so that list position
        # still corresponds to cluster id - 1, but avoid 0/0 NaN distances.
        sim_docs_all.append([])
        continue

    centroid = clustercentroids[i]
    cluster_sim_list = [
        (j, names[j], scipy.spatial.distance.cosine(doc_vs_topic.loc[j,:].values, centroid))
        for j in range(len(doc_vs_topic))
    ]

    sorted_docs_by_average = sorted(cluster_sim_list, key=lambda x: x[2])
    # Slicing copes with fewer than TOP_DOCS_PER_CLUSTER documents.
    sim_docs_all.append(sorted_docs_by_average[:TOP_DOCS_PER_CLUSTER])

# Print JSON object for the most similar documents of clusters
print( json.dumps( dict(clusters=sim_docs_all) ) )

In [4]:
# Expose the article titles to the browser as window.titles.
titles_json = titlesdf.to_json()
Javascript("""
           window.titles={};
           """.format(titles_json))

<IPython.core.display.Javascript object>

In [6]:
# Expose the article URLs to the browser as window.urls.
urls_json = urlsdf.to_json()
Javascript("""
           window.urls={};
           """.format(urls_json))

<IPython.core.display.Javascript object>

## Prepare data into graph of nodes and links

In [7]:
%%javascript

// Each window.* frame is a single-column DataFrame dump, so the data sits under key "0".
var names0 = names["0"];
var titles0 = titles["0"];
var urls0 = urls["0"];
var nodes = [];
var labels = [];

// Fixed topic group for every name prefix other than 'A'.
var prefixGroups = { F: 14, E: 7, S: 14, D: 1 };

// Build one graph node per article present in the distance matrix.
Object.keys(artVsart).forEach(function (k) {
    var idx = parseInt(k);
    var node = { name:"ArticleID "+ k +" : "+ titles0[k], url:urls0[k]  };
    var prefix = names0[k].charAt(0);

    if (prefix === 'A') {
        // 'A' articles are grouped by position; article 94 is special-cased.
        node.group = k==94 ? 12 : Math.floor((idx+1)/8) + 1;
    } else if (Object.prototype.hasOwnProperty.call(prefixGroups, prefix)) {
        node.group = prefixGroups[prefix];
    }
    // Any other prefix leaves node.group unset.

    labels.push(node.group);
    nodes.push(node);
});



var links = []

/**
 * Turn an object into an array of [key, value] pairs ordered by ascending
 * numeric value. Only the object's own properties are included.
 */
function sortProperties(obj)
{
    var pairs = [];
    for (var prop in obj) {
        if (!obj.hasOwnProperty(prop)) {
            continue; // skip anything inherited through the prototype chain
        }
        pairs.push([prop, obj[prop]]);
    }

    // sort() orders the array in place and returns that same array
    return pairs.sort((lhs, rhs) => lhs[1] - rhs[1]);
}



// Each node is linked to its LINKS_PER_NODE nearest neighbours (smallest
// cosine distance), never to itself.
var LINKS_PER_NODE = 5;

Object.keys(artVsart).forEach( k => {
    var k1 = parseInt(k, 10);
    var row = artVsart[k];

    // Drop the node itself explicitly instead of assuming it sorts first:
    // with ties or floating-point noise another article can sort ahead of
    // it, which used to discard a real neighbour and emit a self-link.
    // Also drop null distances (how pandas serialises NaN); they would
    // coerce to 0 in the comparator and rank as perfect matches.
    var candidates = {};
    Object.keys(row).forEach( other => {
        if (other !== k && row[other] !== null) {
            candidates[other] = row[other];
        }
    });

    sortProperties(candidates).slice(0, LINKS_PER_NODE).forEach( match => {
        // match is [articleKey, distance]; the link value is the distance * 10.
        links.push({ "source": k1, "target": parseInt(match[0], 10), "value": match[1]*10 });
    });
});

//var result = $.grep(links, function(e){ return e; });
//console.log(links);

// Bundle nodes and links, serialise once, and publish the JSON string on
// window so that other cells can read it.
var final = { nodes: nodes, links: links };
var graph = JSON.stringify(final);
window.graph = graph;
console.log(graph);

<IPython.core.display.Javascript object>

In [8]:
%%javascript
// Debug aid: echo the serialised graph JSON stored on window.graph to the browser console.
console.log(window.graph)

<IPython.core.display.Javascript object>

### Dump graph JSON object

In [None]:
# import json
# with open('check.json', 'w') as outfile:
#     json.dump(window.graph, outfile)

## Build force directed graph using d3

In [9]:
from IPython.display import HTML

# HTML + d3 (v3 API) markup that renders window.graph as a force-directed
# graph inside the #cmsvg element, with a d3-tip tooltip, double-click to open
# the article URL, and a jQuery-UI autocomplete search box.
# Fixes relative to the previous version of this string:
#   * the jQuery-UI stylesheet is loaded with <link rel="stylesheet"> instead
#     of a <script> tag (a script tag cannot apply CSS);
#   * the autocomplete source now includes the last node (the loop stopped at
#     length - 1);
#   * `json` is declared with var instead of being an implicit global.
# NOTE(review): this markup does not load d3 itself; it presumably relies on
# d3 v3 already being available on the page - confirm.
force = """
<button id='advance' title='Advance Layout One Increment'>
    <i class='fa fa-step-forward'></i>
</button>
<button id='slow' title='Run Layout in Slow Motion'>
    <i class='fa fa-play'></i>
</button>
<button id='play' title='Run Layout at Full Speed'>
    <i class='fa fa-fast-forward'></i>
</button>
<button id='reset' title='Reset Layout to Beginning'>
    <i class='fa fa-undo'></i>
</button>

<div class="ui-widget">
   <input id="search">
    <button type="button" onclick="searchNode()">Search</button>
</div>

<script type='text/javascript' src="http://code.jquery.com/ui/1.11.0/jquery-ui.min.js"> </script>
<link rel="stylesheet" type="text/css" href="http://code.jquery.com/ui/1.11.0/themes/smoothness/jquery-ui.css">
<script src="http://labratrevenge.com/d3-tip/javascripts/d3.tip.v0.6.3.js"></script>
<svg height="700" width="960" id="cmsvg"></svg>
<style>
    .link {
        stroke: #ccc;
        stroke-width: 1.5px;
    }
    
    .node text {
        pointer-events: none;
        font: 10px sans-serif;
    }
    .textClass {
        stroke: #323232;
        font-family: "Lucida Grande", "Droid Sans", Arial, Helvetica, sans-serif;
        font-weight: normal;
        stroke-width: .5;
        font-size: 14px;
    }
    
    .d3-tip {
        line-height: 1;
        font-weight: bold;
        padding: 12px;
        background: rgba(0, 0, 0, 0.8);
        color: #fff;
        border-radius: 2px;
    }
    
</style>

<script type="text/Javascript">

var svg = d3.select('#cmsvg');

var width = svg.attr("width") || 1,
    height = svg.attr("height") || 1;

//Set up tooltip
var tip = d3.tip()
    .attr("class", "d3-tip")
    .offset([-10, 0])
    .html(function (d) { return d.name; });

var force = d3.layout.force()
    .charge(-300)
    .gravity(0.1)
    .size([width, height]); 

force.linkDistance(function(d) { return d.value*20; } );
force.linkStrength(1);


//Data for vis - nodes and links
        
//d3.json("graph.json", function(error, json) {
//  if (error) throw error;

  var json = JSON.parse( window.graph );
  
  force
      .nodes(json.nodes)
      .links(json.links)
      .start();
      
  var link = svg.selectAll(".link")
      .data(json.links)
    .enter().append("line")
      .attr("class", "link");

//  var node = svg.selectAll(".node")
//      .data(json.nodes)
//    .enter().append("g")
//      .attr("class", "node")
//      .call(force.drag);


  var color = d3.scale.category20();
  
  svg.call(tip);
  
  
//  function dblclick(d){
//    window.location.assign(d.url, '_blank');
//  }
  
  var node = svg.selectAll(".node")
    .data(json.nodes)
    .enter().append("circle")
    .attr("class", "node")
    .attr("r", 8)
    .attr("target", "_blank")
    .style("fill", function (d) {
    return color(d.group);
})
    .call(force.drag)
    .on('mouseover', tip.show)
    .on('mouseout', tip.hide)
    .on('dblclick',function (d) {window.open(d.url,'_blank');});
  
  
  
    var optArray = [];
    for (var i = 0; i < json.nodes.length; i++) {
        optArray.push(json.nodes[i].name);
    }
    optArray = optArray.sort();
    $(function () {
        $("#search").autocomplete({
            source: optArray
        });
    });
    function searchNode() {
        //find the node
        var selectedVal = document.getElementById('search').value;
        var node = svg.selectAll(".node");
        if (selectedVal == "none") {
            node.style("stroke", "white").style("stroke-width", "1");
        } else {
            var selected = node.filter(function (d, i) {
                return d.name != selectedVal;
            });
            selected.style("opacity", "0");
            var link = svg.selectAll(".link")
            link.style("opacity", "0");
            d3.selectAll(".node, .link").transition()
                .duration(5000)
                .style("opacity", 1);
        }
    }
  
  
  
//  node.append("image")
//      .attr("xlink:href", "https://www.google.com.sg/intl/en-GB/docs/about/favicon.ico")
//      .attr("x", -8)
//      .attr("y", -8)
//      .attr("width", 16)
//      .attr("height", 16);    
      
//  node.append("text")
//      .attr("dx", 12)
//      .attr("dy", ".35em")
//     .text(function(d) { return d.name });
    
  force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });
  });
    
//Force layout object
//Principle is simple: The layout moves the graph 
//nodes around a little bit at a time. 
//Eventually (usually) the nodes settle into a comfortable
//location and the layout stops.


</script>

"""
HTML(force)

## Test KMeans and Agglomerative clustering on the set of documents 

In [None]:
#Clustering - KMeans and Agglomerative

import matplotlib
matplotlib.use('Agg')
import numpy as np
from scipy import ndimage
from sklearn import manifold, metrics
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.neighbors import kneighbors_graph
from matplotlib import pyplot as plt

# Number of clusters requested from both clustering algorithms.
m=15
# Fixed seed so the KMeans initialisation and the shuffled baseline give the
# same scores on every run.
RANDOM_SEED = 0

# DataFrame.values replaces DataFrame.as_matrix(), which newer pandas removed.
X = doc_vs_topic.values
y = labelsdf.values.ravel()

#KMeans
k_means = KMeans(init='k-means++', n_clusters=m, n_init=100, random_state=RANDOM_SEED)
k_means.fit(X,y)
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
print(y)
print(k_means_labels)

# Agreement between the known labels and the KMeans assignment, plus the
# label-free silhouette score.
print("KMEANS SCORES:")
print(metrics.adjusted_mutual_info_score(y, k_means_labels))
print(metrics.homogeneity_score(y, k_means_labels))
print(metrics.completeness_score(y, k_means_labels))
print(metrics.silhouette_score(X, k_means_labels, metric='euclidean'))

# Baseline: the same metrics for a random permutation of the true labels.
y1=np.copy(y)
np.random.RandomState(RANDOM_SEED).shuffle(y1)
print(y1)
print("Random Scores:")
print(metrics.adjusted_mutual_info_score(y,y1))
print(metrics.homogeneity_score(y,y1))
print(metrics.completeness_score(y,y1))


#Plot KMeans
# Scatter of the first two topic dimensions, one colour per KMeans cluster,
# with each cluster centre drawn as a larger black-edged circle.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)
# matplotlib renamed the 'spectral' colormap to 'nipy_spectral' and later
# removed the old name; use whichever this installation provides.
spectral_cmap = plt.cm.nipy_spectral if hasattr(plt.cm, 'nipy_spectral') else plt.cm.spectral
color=spectral_cmap(np.linspace(0, 1, m))
# DataFrame.values replaces DataFrame.as_matrix(), which newer pandas removed.
points = doc_vs_topic.values

for k in range(m):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    # The per-cluster colour was computed but never applied, so every point
    # was drawn white; use it as the marker face colour.
    ax.plot(points[my_members, 0], points[my_members, 1], 'w',
            markerfacecolor=color[k], marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=color[k],
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())

plt.show()



#Agglomerative Clustering
# Ward-linkage clustering constrained by a 30-nearest-neighbour connectivity graph.
# DataFrame.values replaces DataFrame.as_matrix(), which newer pandas removed.
doc_matrix = doc_vs_topic.values
knn_graph = kneighbors_graph(doc_matrix, 30, include_self=False)
model = AgglomerativeClustering(linkage='ward',
                                connectivity=knn_graph,
                                n_clusters=m)

model.fit(doc_matrix)
print(model.labels_)
# scikit-learn renamed n_components_ to n_connected_components_; print
# whichever attribute this version exposes.
if hasattr(model, 'n_connected_components_'):
    print(model.n_connected_components_)
else:
    print(model.n_components_)
print(metrics.adjusted_mutual_info_score(y,model.labels_ ))
print(metrics.homogeneity_score(y,model.labels_ ))
print(metrics.completeness_score(y,model.labels_ ))
print(metrics.silhouette_score(X, model.labels_, metric='euclidean'))



def plot_clustering(X, labels, title=None, point_labels=None):
    """Draw each sample as a text glyph at its first two (min-max scaled) features.

    Parameters
    ----------
    X : 2-D array with one row per sample; only columns 0 and 1 are plotted.
    labels : cluster id per sample; selects the glyph colour.
    title : optional figure title.
    point_labels : text drawn for each sample. Defaults to the module-level
        ground-truth labels ``y``, which is what the function always used
        before this parameter existed.
    """
    if point_labels is None:
        point_labels = y

    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    span = x_max - x_min
    # A constant column has zero range; divide by 1 instead of producing NaN/inf.
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    # matplotlib renamed the 'spectral' colormap to 'nipy_spectral' and later
    # removed the old name; use whichever this installation provides.
    cmap = plt.cm.nipy_spectral if hasattr(plt.cm, 'nipy_spectral') else plt.cm.spectral
    # Scale by the real number of clusters. The previous fixed divisor of 10
    # pushed every cluster id >= 10 past the end of the colormap, so those
    # clusters all got the same colour.
    n_clusters = float(np.max(labels)) + 1

    plt.figure(figsize=(6, 4))
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(point_labels[i]),
                 color=cmap(labels[i] / n_clusters),
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout()
    
# DataFrame.values replaces DataFrame.as_matrix(), which newer pandas removed.
plot_clustering(doc_vs_topic.values,model.labels_,"Plt")


## Misc code not used in the implementation beyond this point

In [68]:

// Unused scratch code: a step-by-step / slow-motion animation of a d3 force layout.
// This is JavaScript that was run in a Python cell, which is why the recorded
// output below is a Python SyntaxError.
// It refers to `svg`, `links`, `nodes`, `width` and `force`, none of which are
// defined in this cell; they must already exist in the surrounding script scope.

// Draw one <line> per link, positioned from the current coordinates of its end nodes.
var link = svg.selectAll('.link')
    .data(links)
    .enter().append('line')
    .attr('class', 'link')
    .attr('x1', function(d) { return nodes[d.source].x; })
    .attr('y1', function(d) { return nodes[d.source].y; })
    .attr('x2', function(d) { return nodes[d.target].x; })
    .attr('y2', function(d) { return nodes[d.target].y; });
    
// Draw one <circle> per node; the radius is 1% of the layout width.
var node = svg.selectAll('.node')
    .data(nodes)
    .enter().append('circle')
    .attr('class', 'node')
    .attr('r', width/100)
    .attr('cx', function(d) { return d.x; })
    .attr('cy', function(d) { return d.y; });
    
// `animating` is set to true by the #slow button; `animationStep` is the
// duration in ms of each transition and of the pause between layout steps.
var animating = false;
var animationStep = 400;


//The force layout, through its iterations, will try to arrange the 
//nodes so that all links are approximately = link distance, 
//but that won’t always be possible.

force.on('tick', function() {

    // Because we want to emphasize how the nodes and
    // links move, we use a transition to move them to
    // their positions instead of simply setting the
    // values abruptly.

    node.transition().ease('linear').duration(animationStep)
        .attr('cx', function(d) { return d.x; })
        .attr('cy', function(d) { return d.y; });

    link.transition().ease('linear').duration(animationStep)
        .attr('x1', function(d) { return d.source.x; })
        .attr('y1', function(d) { return d.source.y; })
        .attr('x2', function(d) { return d.target.x; })
        .attr('y2', function(d) { return d.target.y; });

    // Halt the layout after every tick so that it advances one increment at a time.
    force.stop();

    // In slow-motion mode, schedule the next increment once the transition has had time to play.
    if (animating) {
        setTimeout(
            function() { force.start(); },
            animationStep
        );
    }

});


// #advance: run a single layout increment (the tick handler stops the layout again).
d3.select('#advance').on('click', force.start);

// #slow: disable every button and keep stepping the layout automatically.
d3.select('#slow').on('click', function() {

    d3.selectAll('button').attr('disabled','disabled');
    animating = true;
    force.start();

});


//Time to turn things over to the force layout
force.start()

SyntaxError: invalid syntax (<ipython-input-68-82fb44856771>, line 2)

In [10]:
from IPython.display import HTML

# Stand-alone d3 v3 demo of a clustered force layout, kept as an unused experiment.
# It does not read any article data: the script generates n = 200 circles with
# random radii, assigns each to one of m = 15 clusters at random, and on every
# tick pulls each circle towards the largest circle of its cluster (cluster())
# while pushing overlapping circles apart (collide()).
# The <svg> is appended to the page <body>, not to a specific element.
clusteringd3  = """
<script src="//d3js.org/d3.v3.min.js"></script>
<script>

var width = 960,
    height = 500,
    padding = 1.5, // separation between same-color circles
    clusterPadding = 6, // separation between different-color circles
    maxRadius = 12;

var n = 200, // total number of circles
    m = 15; // number of distinct clusters

var color = d3.scale.category10()
    .domain(d3.range(m));

// The largest node for each cluster.
var clusters = new Array(m);

var nodes = d3.range(n).map(function() {
  var i = Math.floor(Math.random() * m),
      r = Math.sqrt((i + 1) / m * -Math.log(Math.random())) * maxRadius,
      d = {cluster: i, radius: r};
  if (!clusters[i] || (r > clusters[i].radius)) clusters[i] = d;
  return d;
});

var force = d3.layout.force()
    .nodes(nodes)
    .size([width, height])
    .gravity(.02)
    .charge(0)
    .on("tick", tick)
    .start();

var svg = d3.select("body").append("svg")
    .attr("width", width)
    .attr("height", height);

var circle = svg.selectAll("circle")
    .data(nodes)
  .enter().append("circle")
    .attr("r", function(d) { return d.radius; })
    .style("fill", function(d) { return color(d.cluster); })
    .call(force.drag);

function tick(e) {
  circle
      .each(cluster(10 * e.alpha * e.alpha))
      .each(collide(.5))
      .attr("cx", function(d) { return d.x; })
      .attr("cy", function(d) { return d.y; });
}

// Move d to be adjacent to the cluster node.
function cluster(alpha) {
  return function(d) {
    var cluster = clusters[d.cluster];
    if (cluster === d) return;
    var x = d.x - cluster.x,
        y = d.y - cluster.y,
        l = Math.sqrt(x * x + y * y),
        r = d.radius + cluster.radius;
    if (l != r) {
      l = (l - r) / l * alpha;
      d.x -= x *= l;
      d.y -= y *= l;
      cluster.x += x;
      cluster.y += y;
    }
  };
}

// Resolves collisions between d and all other circles.
function collide(alpha) {
  var quadtree = d3.geom.quadtree(nodes);
  return function(d) {
    var r = d.radius + maxRadius + Math.max(padding, clusterPadding),
        nx1 = d.x - r,
        nx2 = d.x + r,
        ny1 = d.y - r,
        ny2 = d.y + r;
    quadtree.visit(function(quad, x1, y1, x2, y2) {
      if (quad.point && (quad.point !== d)) {
        var x = d.x - quad.point.x,
            y = d.y - quad.point.y,
            l = Math.sqrt(x * x + y * y),
            r = d.radius + quad.point.radius + (d.cluster === quad.point.cluster ? padding : clusterPadding);
        if (l < r) {
          l = (l - r) / l * alpha;
          d.x -= x *= l;
          d.y -= y *= l;
          quad.point.x += x;
          quad.point.y += y;
        }
      }
      return x1 > nx2 || x2 < nx1 || y1 > ny2 || y2 < ny1;
    });
  };
}

</script>
"""
# Render the demo markup as the cell output.
HTML(clusteringd3)

In [1]:
import pycurl
from StringIO import StringIO

# Fetch the MITIE entity endpoint with a plain GET and print the raw response body.
# NOTE(review): the recorded output of this cell is "405 Method Not Allowed",
# so the endpoint presumably expects a different HTTP method - confirm.
buffer = StringIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://fastner.sage:3020/entities/mitie')
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
finally:
    # Always release the curl handle, even when setopt()/perform() raises.
    c.close()

body = buffer.getvalue()
# Body is a string in some encoding.
# In Python 2, we can print it without knowing what the encoding is.
print(body)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>405 Method Not Allowed</title>
<h1>Method Not Allowed</h1>
<p>The method is not allowed for the requested URL.</p>

