<img src="housess6.jpg">
# <center> Five Families of GoT </center>
## <center>A word embedding case-study</center>
This notebook contains the code used in the [Protagonist Technology](https://protagonist.io) blog post analyzing the five big families in Game of Thrones (GoT).  
This is a fun experiment to demonstrate and examine so-called "human-machine learning", in which different machine learning algorithms, through both natural language processing (NLP) methods and visualization, help a human domain expert reach a deeper understanding of a large corpus efficiently.  
Now check out the notebook, run it, and let us know what new insights you gain into the world of GoT!  

In [1]:
from __future__ import unicode_literals
from itertools import tee, izip
import gensim
from gensim.models import Word2Vec
import pandas as pd
import networkx as nx
import numpy as np
from bokeh.plotting import show,figure
from bokeh.io import output_notebook
from bokeh.models import HoverTool,TapTool,BoxSelectTool
from bokeh.models.graphs import NodesAndLinkedEdges
from bokeh.models import ColumnDataSource
from IPython.display import display
import community # python-louvain
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from bokeh.palettes import Spectral11
# Send Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

In [2]:
# Resolves a (possibly two-word) phrase to a single word of one trained model's vocabulary.
# It is a hack that we used to get around training gensim with bi/tri-grams, mostly because
# we were not happy with the bi/tri-gram models' performances! Work in progress on that front.
def get_most_similar_to_phrase(phrase, model):
    """
    Map a phrase onto one word that exists in the model's vocabulary.

    Parameters
    ----------
    phrase : a single word, or words separated by single spaces. Only the
        first two words of a multi-word phrase are considered.
    model : trained gensim Word2Vec model (accessed through ``model.wv``).

    Returns
    -------
    The word to use as a stand-in for the phrase:
      * both of the first two words known -> the single word most similar
        to their combination,
      * only one of them known -> that word,
      * single-word phrase -> the phrase itself.

    Raises
    ------
    KeyError
        If no usable word of the phrase is in the vocabulary. (The function
        used to fail with an UnboundLocalError in this situation.)
    """
    vocab = model.wv.vocab
    if ' ' in phrase:
        keyword_list = phrase.split(' ')
        first, second = keyword_list[0], keyword_list[1]
        if first in vocab and second in vocab:
            # Query through model.wv: the method on the model object itself is deprecated.
            keyword_similar = model.wv.most_similar(positive=[first, second], topn=1)
            return keyword_similar[0][0]
        if first in vocab:
            return first
        if second in vocab:
            return second
    elif phrase in vocab:
        return phrase
    raise KeyError("no word of phrase {!r} is in the model vocabulary".format(phrase))
        
# Returns similarities between keyword and the word-list across 5 GoT trained models.
def get_similarities(keyword, word_list, models):
    """
    Get similarities between a keyword and a list of words over several models.

    Parameters
    ----------
    keyword : word or phrase; resolved to a vocabulary word of each model with
        get_most_similar_to_phrase.
    word_list : iterable of words to compare the keyword against.
    models : sequence of trained gensim Word2Vec models.

    Returns
    -------
    pandas.DataFrame with one column per word and one row per model. A cell is
    None when the word is not in that model's vocabulary.
    """
    resolved_keywords = {}

    def keyword_for(index, model):
        # Resolving the phrase can trigger a most_similar query, so do it at
        # most once per model and only when a comparison is actually needed.
        if index not in resolved_keywords:
            resolved_keywords[index] = get_most_similar_to_phrase(keyword, model)
        return resolved_keywords[index]

    my_dict = {}
    for word in word_list:
        column = []
        for index, model in enumerate(models):
            if word in model.wv.vocab:
                column.append(model.wv.similarity(keyword_for(index, model), word))
            else:
                column.append(None)
        my_dict[word] = column
    return pd.DataFrame(my_dict)

In [3]:
# We didn't use a typical list of stopwords (nothing wrong with that though!) but just removed the words that
# interfered with the experts' understanding of the resulting graph.
stopw = ["(",")","-the","-by","another","both","then","again","almost","somehow","-ser","father","mother","young","son","daughter","-lord","-her","-his","-their","'m",'ca','-PRON-','.','"','yes','oh','pyp','page',"n't","'","m'lord","'ll","m'lady",".","}","{",":","-lady","m."]

def get_n_most_similar(word, model, n, stopwords=stopw):
    """
    Returns up to N (word, similarity) pairs most similar to `word` in one trained model.

    Stopwords are filtered out of the result. n + len(stopwords) candidates are
    requested so that N results remain even if every stopword ranks among the
    nearest neighbours. Returns an empty list when `word` is not in the
    model's vocabulary.
    """
    if word not in model.wv.vocab:
        return []
    blocked = set(stopwords)
    # Query through model.wv: the method on the model object itself is deprecated.
    candidates = model.wv.most_similar(word, topn=n + len(stopwords))
    return [(neighbour, score) for neighbour, score in candidates if neighbour not in blocked][:n]

def add_to_graph(g, word, model, n, iterations, min_similarity=0, stopwords=stopw):
    """
    Helper function for recursive traversal of most_similar.

    Adds an edge (weighted by similarity) from `word` to each of its top-n
    similar words, then repeats from every added neighbour until `iterations`
    levels have been expanded.

    Parameters
    ----------
    g : graph object exposing add_edge(u, v, weight=...); modified in place.
    word : word to expand.
    model : trained gensim Word2Vec model.
    n : number of similar words requested per expanded word.
    iterations : remaining depth; nothing is done when it is below 1.
    min_similarity : only neighbours scoring strictly above this value are
        added; None disables the threshold. Applied at every depth.
    stopwords : words never added as neighbours. Applied at every depth.
    """
    if iterations < 1:
        # Nothing would be added at this depth, so skip the costly similarity query.
        return
    similar_words = get_n_most_similar(word, model, n=n, stopwords=stopwords)
    for similar_word, score in similar_words:
        if min_similarity is None or score > min_similarity:
            g.add_edge(word, similar_word, weight=score)
            # Propagate the caller's threshold and stopword list to deeper levels.
            add_to_graph(g, similar_word, model, n, iterations - 1,
                         min_similarity=min_similarity, stopwords=stopwords)

# Create a graph from similar words
def generate_graph(word, model, n, iterations, min_similarity= 0, stopwords=stopw):
    """
    Build an undirected similarity graph around a word.

    Starts from an empty networkx Graph and lets add_to_graph populate it
    from `word`, forwarding n, iterations, min_similarity and stopwords
    unchanged. Returns the populated graph.
    """
    similarity_graph = nx.Graph()
    add_to_graph(
        similarity_graph, word, model,
        n=n,
        iterations=iterations,
        min_similarity=min_similarity,
        stopwords=stopwords
    )
    return similarity_graph

def get_edges_specs(_network, _layout):
    """
    Returns the spec of edges in the graph including origin, destination and weight.

    Parameters
    ----------
    _network : graph whose edges(data=True) yields (u, v, attrs) with a
        'weight' entry in attrs,
        e.g. { ..., ('user47', 'da_bjoerni', {'weight': 3}), ... }
    _layout : mapping node -> (x, y) position.

    Returns
    -------
    dict with three parallel lists, one entry per edge:
      xs     -- [x of u, x of v]
      ys     -- [y of u, y of v]
      alphas -- 0.1 + 0.6 * weight / max weight (0.1 for every edge when the
                maximum weight is zero)
    All lists are empty when the graph has no edges.
    """
    d = dict(xs=[], ys=[], alphas=[])
    edges = list(_network.edges(data=True))
    if not edges:
        # max() of an empty sequence would raise; an edgeless graph has no specs.
        return d

    # float() keeps the ratio a true division even for integer weights.
    max_weight = float(max(data['weight'] for _, _, data in edges))
    for u, v, data in edges:
        d['xs'].append([_layout[u][0], _layout[v][0]])
        d['ys'].append([_layout[u][1], _layout[v][1]])
        ratio = data['weight'] / max_weight if max_weight else 0.0
        d['alphas'].append(0.1 + 0.6 * ratio)
    return d

In [4]:
# Thanks to the generosity of Stack Overflow contributors, we managed to adapt the layout of our graph to the
# communities found by the Louvain algorithm! Thank you!
def community_layout(g, partition):
    """
    Compute the layout for a modular graph.

    Each node is placed at the position of its community plus its own
    position inside that community.

    Arguments:
    ----------
    g -- networkx.Graph or networkx.DiGraph instance
        graph to plot

    partition -- dict mapping node -> community
        graph partitions

    Returns:
    --------
    pos -- dict mapping node -> position
        node positions (community position + within-community position)

    """
    # coarse placement: one anchor per community, spread on a larger scale
    community_anchor = _position_communities(g, partition, scale=3.)
    # fine placement: nodes arranged around their community anchor
    local_offset = _position_nodes(g, partition, scale=1.)

    return {node: community_anchor[node] + local_offset[node] for node in g.nodes()}

def _position_communities(g, partition, **kwargs):
    """
    Positions the communities relative to each other.

    Builds a directed graph with one node per community, where an edge weight
    is the number of graph edges recorded between the two communities, lays it
    out with nx.spring_layout (kwargs are forwarded), and returns a dict
    mapping every node of `partition` to the position of its community.
    """
    crossing_edges = _find_between_community_edges(g, partition)

    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(set(partition.values()))
    for (source_community, target_community), members in crossing_edges.items():
        hypergraph.add_edge(source_community, target_community, weight=len(members))

    # layout of the communities themselves
    community_positions = nx.spring_layout(hypergraph, **kwargs)

    # every node inherits the position of the community it belongs to
    return {node: community_positions[community_id]
            for node, community_id in partition.items()}

def _find_between_community_edges(g, partition):

    edges = dict()

    for (ni, nj) in g.edges():
        ci = partition[ni]
        cj = partition[nj]

        if ci != cj:
            try:
                edges[(ci, cj)] += [(ni, nj)]
            except KeyError:
                edges[(ci, cj)] = [(ni, nj)]

    return edges

def _position_nodes(g, partition, **kwargs):
    """
    Positions nodes within communities.
    """

    communities = dict()
    for node, community in partition.items():
        try:
            communities[community] += [node]
        except KeyError:
            communities[community] = [node]

    pos = dict()
    for ci, nodes in communities.items():
        subgraph = g.subgraph(nodes)
        pos_subgraph = nx.spring_layout(subgraph, **kwargs)
        pos.update(pos_subgraph)

    return pos

In [5]:

# Set to 1 to use the models trained on tri/bigrams of the five volumes of GoT. (Experiment with them,
# but we didn't like the performance! Work in progress.)
# With 0, the models trained on single words of the five volumes are used.
phrase_graph = 0

def load_got_models(use_phrase_models, model_dir='models', n_volumes=5):
    """
    Load one trained Word2Vec model per GoT volume.

    Files are read from '<model_dir>/GoT<volume>' for the word models and from
    '<model_dir>/GoT<volume>_p' for the tri/bigram (phrase) models.

    Returns the list of models ordered by volume (volume 1 first).
    """
    suffix = '_p' if use_phrase_models else ''
    return [Word2Vec.load('{}/GoT{}{}'.format(model_dir, volume_number, suffix))
            for volume_number in range(1, n_volumes + 1)]

models = load_got_models(phrase_graph)
# Keep the per-volume names available for interactive use.
model1, model2, model3, model4, model5 = models

In [6]:
# We use the Louvain community detection algorithm to cluster the most similar words together.
def fg(family,volume):
    """
    Plot the similarity graph of a family name for one GoT volume.

    The graph is built from the 20 most similar words of `family`, expanded
    over 2 levels, in the model of the chosen volume. Nodes are positioned and
    coloured by Louvain community and sized by betweenness centrality; edges
    are drawn with an opacity derived from their similarity weight.

    Parameters
    ----------
    family : word to start the graph from.
    volume : 1-based volume number, used to index the global `models` list.

    Returns
    -------
    (family, volume), unchanged. If no similar words are found, a message is
    printed and nothing is plotted.
    """
    model_n = models[volume-1]
    network = generate_graph(family, model_n, 20, 2)
    if network.number_of_nodes() == 0:
        # e.g. the word is not in this volume's vocabulary: there is nothing to lay out or draw.
        print("No similar words found for '{}' in volume {}.".format(family, volume))
        return family,volume

    partition = community.best_partition(network)
    layout = community_layout(network, partition)

    # Sorting by node name keeps positions, centrality and community columns aligned.
    nodes, nodes_coordinates = zip(*sorted(layout.items()))
    nodes_xs, nodes_ys = list(zip(*nodes_coordinates))
    nodes_source = ColumnDataSource(dict(x=nodes_xs, y=nodes_ys,
                                         name=nodes))

    plot = figure(plot_width=800, plot_height=800)
    r_circles = plot.circle('x', 'y', source=nodes_source, size=10,
                            color='blue', level = 'overlay')
    hover = HoverTool(tooltips=[('name','@name')],renderers = [r_circles])

    lines_source = ColumnDataSource(get_edges_specs(network, layout))
    plot.multi_line('xs', 'ys', line_width=0.5,
                    alpha='alphas', color='grey',
                    source=lines_source)

    centrality = nx.algorithms.centrality.betweenness_centrality(network)
    # first element are nodes again
    _, nodes_centrality = zip(*sorted(centrality.items()))
    # Every centrality can be zero (e.g. a graph with a single edge); avoid dividing by zero.
    max_centrality = max(nodes_centrality) or 1.0
    nodes_source.add([7 + 10 * t / max_centrality
                      for t in nodes_centrality],
                     'centrality')
    plot.add_tools(hover, TapTool(), BoxSelectTool())

    _, nodes_community = zip(*sorted(partition.items()))
    nodes_source.add(nodes_community, 'community')
    community_colors = ['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628', '#b3cde3','#ccebc5','#decbe4','#fed9a6','#ffffcc','#e5d8bd','#fddaec','#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d','#666666']
    # Colours are reused cyclically when there are more communities than colours.
    nodes_source.add([community_colors[t % len(community_colors)]
                      for t in nodes_community],
                     'community_color')

    r_circles.glyph.size = 'centrality'
    r_circles.glyph.fill_color = 'community_color'
    show(plot)
    return family,volume
interact(fg, family = ["lannister","stark","targaryen","baratheon","tyrell"],volume=widgets.IntSlider(min=1,max=5,step=1,value=1))

<function __main__.fg>

In [7]:
!pip install python-louvain

Collecting python-louvain
  Downloading python-louvain-0.10.tar.gz
Building wheels for collected packages: python-louvain
  Running setup.py bdist_wheel for python-louvain ... [?25ldone
[?25h  Stored in directory: /Users/alishahed/Library/Caches/pip/wheels/8b/18/34/89e4c136e90279264bc0b2e3ad7a7991b7b4d18f8009c3f89a
Successfully built python-louvain
Installing collected packages: python-louvain
Successfully installed python-louvain-0.10
