In [2]:
# Settings

In [3]:
from IPython.core.display import HTML
HTML("""
<style>
    h1{background-color:black; color:white; padding: 10px 10px 10px 10px}
    h2{background-color:blue; color:white;  padding: 5px 5px 5px 5px}
</style>
""")

In [4]:
%matplotlib inline

# Parameters

In [5]:
# Mode to run the script with.  Either 'cjc' or 'lspbs'
mode = 'cjc' 

# Max number of case types to be generate the summaries for
n_casetypes = 10

# Min number of cases in for each case type in order to run summarization
threshold = 100

# Number of key phrases to be extracted for summary
phrase_limit = 50

# Word limit for significant sentences to be included in top sentences generator before (excluding similar ones)
word_limit = 1000

# Parameter to exclude sentences that are too similar from the list of top sentences.  
# Acceptable values: between 0.0 and 1.0 inclusive
# E.g., if sim_score_threshold = 0.9, sentences that are >= 90% similar will be excluded
sim_score_threshold = 0.9

# Boolean parameter for printing out intermediate outputs (not very useful for now)
to_print = False

# Boolean parameter for plotting the network graph
to_plot = False


# Install dependencies

In [6]:
!pip install pytextrank networkx xlrd

[33mYou are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Load libraries

In [7]:
import pandas as pd
import numpy as np
import re
import string

import spacy
from spacy.en.word_sets import STOP_WORDS

import pytextrank

from collections import Counter

import pylab as plt

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.gensim


from pprint import pprint
import warnings

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Something to make custom pytextrank work
__file__ = pytextrank.__file__

# Load data

In [10]:
harmonization_path = '../../data/anonymising_scripts/harmonize_case_types_combined.py'
case_type_dic_filepath = '../../data/original/case_type_harmonization.csv'

#CJC
if mode.lower() == 'cjc':
    original_file_path = '../../data/original/cjc_cases_20178_sample_20180808.xlsx'
    col_advice = 'LEGAL_ISSUES'
    col_synopsis = 'BACKGROUND_INFORMATION'
    col_casetype = 'CASE_TYPE_CJC'
#LSPBS
elif mode.lower() == 'lspbs':
    original_file_path = '../../data/original/lspbs_cases2016_sample.xlsx'
    col_advice = 'ADVICE_SOUGHT'
    col_synopsis = 'CASE_SYNOPSIS'
    col_casetype = 'CASE_TYPE_LSPBS'

%run $harmonization_path $original_file_path $mode $case_type_dic_filepath

if re.search(r'xlsx', original_file_path):
    file_path = original_file_path[:-5] + '_harmonized.csv'
else:
    file_path = original_file_path[:-4] + '_harmonized.csv'

df = pd.read_csv(file_path).fillna('')


# Load spacy model

In [11]:
spacy.util.set_data_path("../../data/spacy")

In [12]:
nlp = spacy.load('en_core_web_sm/en_core_web_sm-1.2.0')

# Update stopwords

In [13]:
NEW_STOP_WORDS = STOP_WORDS.copy()
custom_stopwords = [
    'adverse','party','applicant',
    'legal','advice','general','guidance','advice','advise',
    'like','know','seeking','lawyers','lawyer','client','help'
    'money','amount','approached applicant','client','time'
]

to_keep = [
    'who','what','when','where','why','how','whether', 'will'
]
to_keep = ['will']
    
for s in custom_stopwords:
    NEW_STOP_WORDS.add(s)

for s in to_keep:
    NEW_STOP_WORDS.remove(s)


# Modify Pytextrank functions

In [14]:
#!/usr/bin/env python
# encoding: utf-8

from collections import namedtuple
from datasketch import MinHash
from graphviz import Digraph
import hashlib
import json
import math
import networkx as nx
import os
import os.path
import re
import spacy
import statistics
import string

DEBUG = False # True

ParsedGraf = namedtuple('ParsedGraf', 'id, sha1, graf')
WordNode = namedtuple('WordNode', 'word_id, raw, root, pos, keep, idx')
RankedLexeme = namedtuple('RankedLexeme', 'text, rank, ids, pos, count')
SummarySent = namedtuple('SummarySent', 'dist, idx, text')


######################################################################
## filter the novel text versus quoted text in an email message

PAT_FORWARD = re.compile("\n\-+ Forwarded message \-+\n")
PAT_REPLIED = re.compile("\nOn.*\d+.*\n?wrote\:\n+\>")
PAT_UNSUBSC = re.compile("\n\-+\nTo unsubscribe,.*\nFor additional commands,.*")


def split_grafs (lines):
    """
    segment the raw text into paragraphs
    """
    graf = []

    for line in lines:
        line = line.strip()

        if len(line) < 1:
            if len(graf) > 0:
                yield "\n".join(graf)
                graf = []
        else:
            graf.append(line)

    if len(graf) > 0:
        yield "\n".join(graf)


def filter_quotes (text, is_email=True):
    """
    filter the quoted text out of a message
    """
    global DEBUG
    global PAT_FORWARD, PAT_REPLIED, PAT_UNSUBSC

    if is_email:
        text = filter(lambda x: x in string.printable, text)

        if DEBUG:
            print("text:", text)

        # strip off quoted text in a forward
        m = PAT_FORWARD.split(text, re.M)

        if m and len(m) > 1:
            text = m[0]

        # strip off quoted text in a reply
        m = PAT_REPLIED.split(text, re.M)

        if m and len(m) > 1:
            text = m[0]

        # strip off any trailing unsubscription notice
        m = PAT_UNSUBSC.split(text, re.M)

        if m:
            text = m[0]

    # replace any remaining quoted text with blank lines
    lines = []

    for line in text.split("\n"):
        if line.startswith(">"):
            lines.append("")
        else:
            lines.append(line)

    return list(split_grafs(lines))


######################################################################
## parse and markup text paragraphs for semantic analysis

PAT_PUNCT = re.compile(r'^\W+$')
PAT_SPACE = re.compile(r'\_+$')

POS_KEEPS = ['v', 'n', 'j']
POS_LEMMA = ['v', 'n']
UNIQ_WORDS = { ".": 0 }


def is_not_word (word):
    return PAT_PUNCT.match(word) or PAT_SPACE.match(word)


def get_word_id (root):
    """
    lookup/assign a unique identify for each word root
    """
    global UNIQ_WORDS

    # in practice, this should use a microservice via some robust
    # distributed cache, e.g., Redis, Cassandra, etc.
    if root not in UNIQ_WORDS:
        UNIQ_WORDS[root] = len(UNIQ_WORDS)

    return UNIQ_WORDS[root]


def fix_microsoft (foo):
    """
    fix special case for `c#`, `f#`, etc.; thanks Microsoft
    """
    i = 0
    bar = []

    while i < len(foo):
        text, lemma, pos, tag = foo[i]

        if (text == "#") and (i > 0):
            prev_tok = bar[-1]

            prev_tok[0] += "#"
            prev_tok[1] += "#"

            bar[-1] = prev_tok
        else:
            bar.append(foo[i])

        i += 1

    return bar


def fix_hypenation (foo):
    """
    fix hyphenation in the word list for a parsed sentence
    """
    i = 0
    bar = []

    while i < len(foo):
        text, lemma, pos, tag = foo[i]

        if (tag == "HYPH") and (i > 0) and (i < len(foo) - 1):
            prev_tok = bar[-1]
            next_tok = foo[i + 1]

            prev_tok[0] += "-" + next_tok[0]
            prev_tok[1] += "-" + next_tok[1]

            bar[-1] = prev_tok
            i += 2
        else:
            bar.append(foo[i])
            i += 1

    return bar


def parse_graf (doc_id, graf_text, base_idx, spacy_nlp=None):
    """
    CORE ALGORITHM: parse and markup sentences in the given paragraph
    """
    global DEBUG
    global POS_KEEPS, POS_LEMMA, SPACY_NLP

    # set up the spaCy NLP parser
    if not spacy_nlp:
        if not SPACY_NLP:
            SPACY_NLP = spacy.load("en_core_web_sm/en_core_web_sm-1.2.0")

        spacy_nlp = SPACY_NLP

    markup = []
    new_base_idx = base_idx
    doc = spacy_nlp(graf_text, parse=True)

    for span in doc.sents:
        graf = []
        digest = hashlib.sha1()

        if DEBUG:
            print(span)

        # build a word list, on which to apply corrections
        word_list = []

        for tag_idx in range(span.start, span.end):
            token = doc[tag_idx]

            if DEBUG:
                print("IDX", tag_idx, token.text, token.tag_, token.pos_)
                print("reg", is_not_word(token.text))

            word_list.append([token.text, token.lemma_, token.pos_, token.tag_])

        # scan the parsed sentence, annotating as a list of `WordNode`
        corrected_words = fix_microsoft(fix_hypenation(word_list))

        for tok_text, tok_lemma, tok_pos, tok_tag in corrected_words:
            word = WordNode(word_id=0, raw=tok_text, root=tok_text.lower(), pos=tok_tag, keep=0, idx=new_base_idx)

            if is_not_word(tok_text) or (tok_tag == "SYM"):
                # a punctuation, or other symbol
                pos_family = '.'
                word = word._replace(pos=pos_family)
            else:
                pos_family = tok_tag.lower()[0]

            if pos_family in POS_LEMMA:
                # can lemmatize this word?
                word = word._replace(root=tok_lemma)

            if pos_family in POS_KEEPS:
                word = word._replace(word_id=get_word_id(word.root), keep=1)

            digest.update(word.root.encode('utf-8'))

            # schema: word_id, raw, root, pos, keep, idx
            if DEBUG:
                print(word)

            graf.append(list(word))
            new_base_idx += 1

        markup.append(ParsedGraf(id=doc_id, sha1=digest.hexdigest(), graf=graf))

    return markup, new_base_idx


def parse_doc (json_iter):
    """
    parse one document to prep for TextRank
    """
    global DEBUG

    for meta in json_iter:
        base_idx = 0

        for graf_text in filter_quotes(meta["text"], is_email=False):
            if DEBUG:
                print("graf_text:", graf_text)

            grafs, new_base_idx = parse_graf(meta["id"], graf_text, base_idx)
            base_idx = new_base_idx

            for graf in grafs:
                yield graf


######################################################################
## graph analytics

def get_tiles (graf, size=3):
    """
    generate word pairs for the TextRank graph
    """
    keeps = list(filter(lambda w: w.word_id > 0, graf))
    keeps_len = len(keeps)

    for i in iter(range(0, keeps_len - 1)):
        w0 = keeps[i]

        for j in iter(range(i + 1, min(keeps_len, i + 1 + size))):
            w1 = keeps[j]

            if (w1.idx - w0.idx) <= size:
                yield (w0.root, w1.root,)


def build_graph (json_iter):
    """
    construct the TextRank graph from parsed paragraphs
    """
    global DEBUG, WordNode
    graph = nx.DiGraph()

    for meta in json_iter:
        if DEBUG:
            print(meta["graf"])

        for pair in get_tiles(map(WordNode._make, meta["graf"])):
            if DEBUG:
                print(pair)

            for word_id in pair:
                if not graph.has_node(word_id):
                    graph.add_node(word_id)

            try:
                graph.adj[pair[0]][pair[1]]["weight"] += 1.0
            except KeyError:
                graph.add_edge(pair[0], pair[1], weight=1.0)

    return graph


def write_dot (graph, ranks, path="graph.dot"):
    """
    output the graph in Dot file format
    """
    dot = Digraph()

    for node in graph.nodes():
        dot.node(node, "%s %0.3f" % (node, ranks[node]))

    for edge in graph.edges():
        dot.edge(edge[0], edge[1], constraint="false")

    with open(path, 'w') as f:
        f.write(dot.source)


def render_ranks (graph, ranks, dot_file="graph.dot"):
    """
    render the TextRank graph for visual formats
    """
    if dot_file:
        write_dot(graph, ranks, path=dot_file)

    ## omitted since matplotlib isn't reliable enough
    #import matplotlib.pyplot as plt
    #nx.draw_networkx(graph)
    #plt.savefig(img_file)
    #plt.show()


def text_rank (path):
    """
    run the TextRank algorithm
    """
    graph = build_graph(json_iter(path))
    ranks = nx.pagerank(graph)

    return graph, ranks


######################################################################
## collect key phrases

SPACY_NLP = None
STOPWORDS = NEW_STOP_WORDS

STOPWORDS
def load_stopwords (stop_file):
    stopwords = set([])

    # provide a default if needed
    if not stop_file:
        stop_file = "stop.txt"

    # check whether the path is fully qualified
    if os.path.isfile(stop_file):
        stop_path = stop_file

    # check for the file in the current working directory
    else:
        cwd = os.getcwd()
        stop_path = os.path.join(cwd, stop_file)

        # check for the file in the same directory as this code module
        if not os.path.isfile(stop_path):
            loc = os.path.realpath( os.path.join(cwd, os.path.dirname(__file__)) )
            stop_path = os.path.join(loc, stop_file)

    try:
        with open(stop_path, "r") as f:
            for line in f.readlines():
                stopwords.add(line.strip().lower())
    except FileNotFoundError:
        pass

    return stopwords


def find_chunk_sub (phrase, np, i):
    for j in iter(range(0, len(np))):
        p = phrase[i + j]

        if p.text != np[j]:
            return None

    return phrase[i:i + len(np)]


def find_chunk (phrase, np):
    """
    leverage noun phrase chunking
    """
    for i in iter(range(0, len(phrase))):
        parsed_np = find_chunk_sub(phrase, np, i)

        if parsed_np:
            return parsed_np


def enumerate_chunks (phrase, spacy_nlp):
    """
    iterate through the noun phrases
    """
    if (len(phrase) > 1):
        found = False
        text = " ".join([rl.text for rl in phrase])
        doc = spacy_nlp(text.strip(), parse=True)

        for np in doc.noun_chunks:
            if np.text != text:
                found = True
                yield np.text, find_chunk(phrase, np.text.split(" "))

        if not found and all([rl.pos[0] != "v" for rl in phrase]):
            yield text, phrase


def collect_keyword (sent, ranks, stopwords):
    """
    iterator for collecting the single-word keyphrases
    """
    for w in sent:
        if (w.word_id > 0) and (w.root in ranks) and (w.pos[0] in "NV") and (w.root not in stopwords):
            rl = RankedLexeme(text=w.raw.lower(), rank=ranks[w.root]/2.0, ids=[w.word_id], pos=w.pos.lower(), count=1)

            if DEBUG:
                print(rl)

            yield rl


def find_entity (sent, ranks, ent, i):
    if i >= len(sent):
        return None, None
    else:
        for j in iter(range(0, len(ent))):
            w = sent[i + j]

            if w.raw != ent[j]:
                return find_entity(sent, ranks, ent, i + 1)

        w_ranks = []
        w_ids = []

        for w in sent[i:i + len(ent)]:
            w_ids.append(w.word_id)

            if w.root in ranks:
                w_ranks.append(ranks[w.root])
            else:
                w_ranks.append(0.0)

        return w_ranks, w_ids


def collect_entities (sent, ranks, stopwords, spacy_nlp):
    """
    iterator for collecting the named-entities
    """
    global DEBUG
    sent_text = " ".join([w.raw for w in sent])

    if DEBUG:
        print("sent:", sent_text)

    for ent in spacy_nlp(sent_text).ents:
        if DEBUG:
            print("NER:", ent.label_, ent.text)

        if (ent.label_ not in ["CARDINAL"]) and (ent.text.lower() not in stopwords):
            w_ranks, w_ids = find_entity(sent, ranks, ent.text.split(" "), 0)

            if w_ranks and w_ids:
                rl = RankedLexeme(text=ent.text.lower(), rank=w_ranks, ids=w_ids, pos="np", count=1)

                if DEBUG:
                    print(rl)

                yield rl


def collect_phrases (sent, ranks, spacy_nlp):
    """
    iterator for collecting the noun phrases
    """
    tail = 0
    last_idx = sent[0].idx - 1
    phrase = []

    while tail < len(sent):
        w = sent[tail]

        if (w.word_id > 0) and (w.root in ranks) and ((w.idx - last_idx) == 1):
            # keep collecting...
            rl = RankedLexeme(text=w.raw.lower(), rank=ranks[w.root], ids=w.word_id, pos=w.pos.lower(), count=1)
            phrase.append(rl)
        else:
            # just hit a phrase boundary
            for text, p in enumerate_chunks(phrase, spacy_nlp):
                if p:
                    id_list = [rl.ids for rl in p]
                    rank_list = [rl.rank for rl in p]
                    np_rl = RankedLexeme(text=text, rank=rank_list, ids=id_list, pos="np", count=1)

                    if DEBUG:
                        print(np_rl)

                    yield np_rl

            phrase = []

        last_idx = w.idx
        tail += 1


def calc_rms (values):
    """
    calculate a root-mean-squared metric for a list of float values
    """
    #return math.sqrt(sum([x**2.0 for x in values])) / float(len(values))
    # take the max() which works fine
    return max(values)


def normalize_key_phrases (path, ranks, stopwords=None, spacy_nlp=None, skip_ner=True):
    """
    collect keyphrases, named entities, etc., while removing stop words
    """
    global STOPWORDS, SPACY_NLP

    # set up the stop words
    if (type(stopwords) is list) or (type(stopwords) is set):
        # explicit conversion to a set, for better performance
        stopwords = set(stopwords)
    else:
        if not STOPWORDS:
            STOPWORDS = load_stopwords(stopwords)

        stopwords = STOPWORDS

    # set up the spaCy NLP parser
    if not spacy_nlp:
        if not SPACY_NLP:
            SPACY_NLP = spacy.load("en_core_web_sm/en_core_web_sm-1.2.0")

        spacy_nlp = SPACY_NLP

    # collect keyphrases
    single_lex = {}
    phrase_lex = {}

    if isinstance(path, str):
        path = json_iter(path)

    for meta in path:
        sent = [w for w in map(WordNode._make, meta["graf"])]

        for rl in collect_keyword(sent, ranks, stopwords):
            id = str(rl.ids)

            if id not in single_lex:
                single_lex[id] = rl
            else:
                prev_lex = single_lex[id]
                single_lex[id] = rl._replace(count = prev_lex.count + 1)

        if not skip_ner:
            for rl in collect_entities(sent, ranks, stopwords, spacy_nlp):
                id = str(rl.ids)

                if id not in phrase_lex:
                    phrase_lex[id] = rl
                else:
                    prev_lex = phrase_lex[id]
                    phrase_lex[id] = rl._replace(count = prev_lex.count + 1)

        for rl in collect_phrases(sent, ranks, spacy_nlp):
            id = str(rl.ids)

            if id not in phrase_lex:
                phrase_lex[id] = rl
            else:
                prev_lex = phrase_lex[id]
                phrase_lex[id] = rl._replace(count = prev_lex.count + 1)

    # normalize ranks across single keywords and longer phrases:
    #    * boost the noun phrases based on their length
    #    * penalize the noun phrases for repeated words
    rank_list = [rl.rank for rl in single_lex.values()]

    if len(rank_list) < 1:
        max_single_rank = 0
    else:
        max_single_rank = max(rank_list)

    repeated_roots = {}

    for rl in sorted(phrase_lex.values(), key=lambda rl: len(rl), reverse=True):
        rank_list = []

        for i in iter(range(0, len(rl.ids))):
            id = rl.ids[i]

            if not id in repeated_roots:
                repeated_roots[id] = 1.0
                rank_list.append(rl.rank[i])
            else:
                repeated_roots[id] += 1.0
                rank_list.append(rl.rank[i] / repeated_roots[id])

        phrase_rank = calc_rms(rank_list)
        single_lex[str(rl.ids)] = rl._replace(rank = phrase_rank)

    # scale all the ranks together, so they sum to 1.0
    sum_ranks = sum([rl.rank for rl in single_lex.values()])

    for rl in sorted(single_lex.values(), key=lambda rl: rl.rank, reverse=True):
        if sum_ranks > 0.0:
            rl = rl._replace(rank=rl.rank / sum_ranks)
        elif rl.rank == 0.0:
            rl = rl._replace(rank=0.1)

        rl = rl._replace(text=re.sub(r"\s([\.\,\-\+\:\@])\s", r"\1", rl.text))
        yield rl


######################################################################
## sentence significance

def mh_digest (data):
    """
    create a MinHash digest
    """
    num_perm = 512
    m = MinHash(num_perm)

    for d in data:
        m.update(d.encode('utf8'))

    return m


def rank_kernel (path):
    """
    return a list (matrix-ish) of the key phrases and their ranks
    """
    kernel = []

    if isinstance(path, str):
        path = json_iter(path)

    for meta in path:
        if not isinstance(meta, RankedLexeme):
            rl = RankedLexeme(**meta)
        else:
            rl = meta

        m = mh_digest(map(lambda x: str(x), rl.ids))
        kernel.append((rl, m,))

    return kernel


def top_sentences (kernel, path):
    """
    determine distance for each sentence
    """
    key_sent = {}
    i = 0

    if isinstance(path, str):
        path = json_iter(path)

    for meta in path:
        graf = meta["graf"]
        tagged_sent = [WordNode._make(x) for x in graf]
        text = " ".join([w.raw for w in tagged_sent])

        m_sent = mh_digest([str(w.word_id) for w in tagged_sent])
        dist = sum([m_sent.jaccard(m) * rl.rank for rl, m in kernel])
        key_sent[text] = (dist, i)
        i += 1

    for text, (dist, i) in sorted(key_sent.items(), key=lambda x: x[1][0], reverse=True):
        yield SummarySent(dist=dist, idx=i, text=text)


######################################################################
## document summarization

def limit_keyphrases (path, phrase_limit=20):
    """
    iterator for the most significant key phrases
    """
    rank_thresh = None

    if isinstance(path, str):
        lex = []

        for meta in json_iter(path):
            rl = RankedLexeme(**meta)
            lex.append(rl)
    else:
        lex = path

    if len(lex) > 0:
        rank_thresh = statistics.mean([rl.rank for rl in lex])
    else:
            rank_thresh = 0

    used = 0

    for rl in lex:
        if rl.pos[0] != "v":
            if (used > phrase_limit) or (rl.rank < rank_thresh):
                return

            used += 1
            yield rl.text.replace(" - ", "-")


def limit_sentences (path, word_limit=100):
    """
    iterator for the most significant sentences, up to a specified limit
    """
    word_count = 0

    if isinstance(path, str):
        path = json_iter(path)

    for meta in path:
        if not isinstance(meta, SummarySent):
            p = SummarySent(**meta)
        else:
            p = meta

        sent_text = p.text.strip().split(" ")
        sent_len = len(sent_text)

        if (word_count + sent_len) > word_limit:
            break
        else:
            word_count += sent_len
            yield sent_text, p.idx, p.dist


def make_sentence (sent_text):
    """
    construct a sentence text, with proper spacing
    """
    lex = []
    idx = 0

    for word in sent_text:
        if len(word) > 0:
            if (idx > 0) and not (word[0] in ",.:;!?-\"'"):
                lex.append(" ")

            lex.append(word)

        idx += 1

    return "".join(lex)


######################################################################
## common utilities

def json_iter (path):
    """
    iterator for JSON-per-line in a file pattern
    """
    with open(path, 'r') as f:
        for line in f.readlines():
            yield json.loads(line)


def pretty_print (obj, indent=False):
    """
    pretty print a JSON object
    """

    if indent:
        return json.dumps(obj, sort_keys=True, indent=2, separators=(',', ': '))
    else:
        return json.dumps(obj, sort_keys=True)

#  Function to extract summary

In [15]:
def textrank_generate(df, 
                      case_type, 
                      col='advice',
                      sim_score_threshold=0.9, 
                      phrase_limit=30, 
                      word_limit=1000, 
                      to_print=False, 
                      to_plot=True):
    
    """
    Takes in the cleaned and harmonized case detail Pandas DataFrame, 
    performs some text processing, 
    tokenizes words using spacy-en language model and 
    prints out key phrases and summaries. 
    
    Arguments
    @ df                  : Cleaned and harmonized case detail Pandas DataFrame
    @ casetype            : Case type for which the summary is to be generated
    @ col                 : Column to be used.  Accepts 'advice', 'synopsis' and 'both' only
    @ sim_score_threshold : Float between 0.0 and 1.0 inclusive.  Parameter to exclude sentences that are >= x in cosine similarity from the list of top sentences.  
    @ phrase_limit        : Number of key phrases to be extracted for summary
    @ word_limit          : Word limit for significant sentences to be included in top sentences generator before (excluding similar ones)
    @ to_print            : Boolean parameter.  If True, prints all intermediate outputs from PyTextRank
    @ to_plot             : Boolean parameter.  If True, plots the network graph
    
    Returns
    A tuple: list of key_phrases and significant sentences (as printed out)
    """
    
    # Extract data from column(s) to be used into a single array
    if col.lower()=='both':
        print('\nColumns used: advice and synopsis')
        advice = (df
                  .loc[df[col_casetype] == case_type, col_advice]
                  .append(df
                          .loc[df[col_casetype] == case_type, col_synopsis], 
                          ignore_index=True)
                  .values)
    elif col.lower()=='synopsis':
        print('\nColumn used: synopsis')
        advice = df.loc[df[col_casetype] == case_type, col_synopsis].values
    else:
        print('\nColumn used: advice')
        advice = df.loc[df[col_casetype] == case_type, col_advice].values
    
    # charset = set(list(''.join(advice)))
    
    # Simple text processing
    ## Standardize some short hand notation
    advice = [adv.replace('A/P',' Adverse Party ') for adv in advice] 
    ## Separate '#1', '#2' etc from 'Adverse Party' string to improve keyword outputs
    advice = [adv.replace('#',' #') for adv in advice]  
    ## Create a single long text separated by spaces after replacing double quotes as apostrophes with single quote
    advice_all = '. '.join(advice).replace('"', '\'')  # 
    ## Convert to required json string format for PyTextRank
    advice_json = '{"id":"1", "text":"' + advice_all + '"}'
    
    path_stage0 = 'advice.json'
    path_stage1 = 'ol.json'
    path_stage2 = "o2.json"
    path_stage3 = "o3.json"
    
    with open(path_stage0, 'w') as f:
        f.write(advice_json)
    
    with open(path_stage1, 'w') as f:
        for graf in parse_doc(json_iter(path_stage0)):
            f.write("%s\n" % pretty_print(graf._asdict()))
            
            if to_print:
                # to view output in this notebook
                print(pretty_print(graf))
    
    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)
    
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pretty_print(rl._asdict()))
            
            if to_print:
                # to view output in this notebook
                print(pretty_print(rl))
    
    if to_plot:
        plt.figure(1, figsize = (12, 12))
        nx.draw(graph, with_labels=True) 
        plt.show()
    
    
    kernel = rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in top_sentences(kernel, path_stage1):
            f.write(pretty_print(s._asdict()))
            f.write("\n")
            
            if to_print:
                # to view output in this notebook
                print(pretty_print(s._asdict()))
    
    phrases = ", ".join(set([p for p in limit_keyphrases(path_stage2, phrase_limit=phrase_limit)]))
    sent_iter = sorted(limit_sentences(path_stage3, word_limit=word_limit), key=lambda x: x[2], reverse=True)
    s = []
    
    sent_text_list = []
    counter = 1
    for sent_text, idx, dist in sent_iter:
        if len(sent_text_list) == 0:
            sim_score_prev = 0.
            sent_text_list.append(nlp(make_sentence(sent_text)))
            s.append('sentence_' + 
                     str(counter) + 
                     ': (dist :' + 
                     str(round(dist, 5)) + 
                     ')\n\t - ' + 
                     make_sentence(sent_text))
            counter += 1
        else:
            sim_text_curr = nlp(make_sentence(sent_text))
            max_sim_score = max([sent_text_prev.similarity(sim_text_curr) for sent_text_prev in sent_text_list])
            if max_sim_score < sim_score_threshold:
                sent_text_list.append(sim_text_curr)
                s.append('sentence_' + 
                         str(counter) + 
                         ': (dist :' + 
                        str(round(dist, 5)) + 
                         ')\n\t - ' + 
                         make_sentence(sent_text))
                counter += 1

    graf_text = "\n".join(s)
    
    print("**excerpts:**\n%s\n\n**keywords:**\n%s" % (graf_text, phrases,))
    
    return phrases, graf_text


# Extract top case types

In [16]:
top_n_casestypes = (df
                    .loc[:, [col_casetype, col_advice]]
                    .groupby(col_casetype, as_index=False)
                    .agg('count')
                    .sort_values(col_advice, ascending=False)
                    .iloc[:n_casetypes, :]
                   )

top_n_casestypes = (top_n_casestypes
                    .loc[top_n_casestypes[col_advice] >= threshold, :]
                   ).values

# Generate Summaries

## Advice only

In [17]:
out_advice = {}

for casetype, num_cases in top_n_casestypes:
    print('\n\n\nCasetype: ' + 
          casetype + 
          ' (' + 
          str(num_cases) + 
          ' cases)')
    
    out_advice[casetype] = textrank_generate(df, 
                                             casetype, 
                                             col='advice', 
                                             sim_score_threshold=sim_score_threshold, 
                                             phrase_limit=phrase_limit, 
                                             word_limit=word_limit, 
                                             to_print=to_print, 
                                             to_plot=to_plot)





Casetype: Bankruptcy / DRS (804 cases)

Column used: advice
**excerpts:**
sentence_1: (dist :0.05046)
	 - How to file a bankruptcy application?
sentence_2: (dist :0.04843)
	 - Should the applicant file for bankruptcy himself?
sentence_3: (dist :0.04451)
	 - Filling up of bankruptcy application form.
sentence_4: (dist :0.04404)
	 - Can the applicant self-declare bankruptcy instead.
sentence_5: (dist :0.04152)
	 - How to avoid bankruptcy from the bank?.
sentence_6: (dist :0.0396)
	 - Restrictions after declaring bankruptcy..
sentence_7: (dist :0.03863)
	 - What is the procedure for bankruptcy proceedings?
sentence_8: (dist :0.03858)
	 - Will his employer know of his bankruptcy?
sentence_9: (dist :0.03812)
	 - ( 5 ) When can Applicant be discharged from bankruptcy?.
sentence_10: (dist :0.03792)
	 - The applicant now wishes to apply for bankruptcy..
sentence_11: (dist :0.03776)
	 - Bankruptcy proceedings; process of DRS..
sentence_12: (dist :0.03756)
	 - Applicant wants to resist the ba

**excerpts:**
sentence_1: (dist :0.02734)
	 - What the applicant can claim..
sentence_2: (dist :0.02652)
	 - How can the applicant, the tenant, dispute this?.
sentence_3: (dist :0.02504)
	 - Can they claim for more than the deposit?.
sentence_4: (dist :0.02497)
	 - Landlord and tenant dispute.
sentence_5: (dist :0.02274)
	 - Small claim matter.
sentence_6: (dist :0.02225)
	 - The tenant claimed the defects in the Small Claims Tribunal..
sentence_7: (dist :0.02171)
	 - 1. Landlord-tenant dispute-whether landlord can file a counter-claim against the tenant's claim.
sentence_8: (dist :0.02163)
	 - Should Applicant file a claim against him?.
sentence_9: (dist :0.02088)
	 - Can the applicant recover his rental deposit..
sentence_10: (dist :0.02068)
	 - When the Applicant is allowed to evict the tenant.. Tenancy agreement.
sentence_11: (dist :0.0202)
	 - Applicant would like his deposit back.
sentence_12: (dist :0.0195)
	 - tenancy dispute, withholding deposit.
sentence_13: (dist :0.01709)
	

**excerpts:**
sentence_1: (dist :0.02725)
	 - What action should the applicant take?.
sentence_2: (dist :0.02627)
	 - Should the applicant file a magistrate's complaint?.
sentence_3: (dist :0.0232)
	 - Should the applicant lodge another police report?
sentence_4: (dist :0.02014)
	 - The applicant has gone to the magistrates court.
sentence_5: (dist :0.0197)
	 - How the applicant can commence proceedings against the other party..
sentence_6: (dist :0.01866)
	 - - Whether the Applicant can pursue any criminal or civil claims against his wife.
sentence_7: (dist :0.01843)
	 - 1 ) How to file the claims that applicant wishes to make...
sentence_8: (dist :0.01738)
	 - Can Applicant sue the Adverse Party?.
sentence_9: (dist :0.01708)
	 - Should applicant accept the compensation?..
sentence_10: (dist :0.01605)
	 - Can the applicant claim compensation for injuries etc. via ways other than suing?
sentence_11: (dist :0.01575)
	 - How can the applicant sue his neighbor.
sentence_12: (dist :0.01494

## Synposis only

In [18]:
out_synopsis = {}

for casetype, num_cases in top_n_casestypes:
    print('\n\n\nCasetype: ' + 
          casetype + 
          ' (' + 
          str(num_cases) + 
          ' cases)')
    
    out_synopsis[casetype] = textrank_generate(df, 
                                               casetype, 
                                               col='synopsis', 
                                               sim_score_threshold=sim_score_threshold, 
                                               phrase_limit=phrase_limit, 
                                               word_limit=word_limit, 
                                               to_print=to_print, 
                                               to_plot=to_plot)





Casetype: Bankruptcy / DRS (804 cases)

Column used: synopsis
**excerpts:**
sentence_1: (dist :0.03523)
	 - Bank filed bankruptcy against applicant.
sentence_2: (dist :0.03098)
	 - Applicant applied for bankruptcy.
sentence_3: (dist :0.03037)
	 - Banks will not file bankruptcy against him.
sentence_4: (dist :0.02998)
	 - Applicant facing bankruptcy.
sentence_5: (dist :0.02852)
	 - Applicant owes the bank $ 80000 in debt and wants to file for bankruptcy..
sentence_6: (dist :0.02767)
	 - Applicant came from a bankruptcy hearing.
sentence_7: (dist :0.02652)
	 - The applicant wants voluntarily file for bankruptcy.
sentence_8: (dist :0.02614)
	 - The applicant has received a bankruptcy notice from a bank.
sentence_9: (dist :0.02606)
	 - Applicant wants to self-declare bankruptcy.
sentence_10: (dist :0.02562)
	 - Applicant wishes to apply for bankruptcy..
sentence_11: (dist :0.0251)
	 - . Applicant wants to declare bankruptcy herself..
sentence_12: (dist :0.02454)
	 - Applicant's company 

**excerpts:**
sentence_1: (dist :0.0283)
	 - Applicant is the landlord of a tenant.
sentence_2: (dist :0.02631)
	 - The applicant has been to the small claims court.
sentence_3: (dist :0.026)
	 - - Applicants have been renting flat for last 5 years.- 1 month of rental
sentence_4: (dist :0.02548)
	 - Applicant's tenancy agreement is for 12 months and has been stamped.
sentence_5: (dist :0.02546)
	 - The Applicant was a tenant, and wishes to claim his deposit from his landlord.
sentence_6: (dist :0.02546)
	 - Applicant is a tenant.
sentence_7: (dist :0.02546)
	 - - Applicant is tenant.
sentence_8: (dist :0.02501)
	 - The applicant had a tenancy agreement dispute with landlord.
sentence_9: (dist :0.02464)
	 - Tenant is not paying rent to the applicant..
sentence_10: (dist :0.02431)
	 - applicant is the landlord ( there are 2 landlords ).
sentence_11: (dist :0.0242)
	 - The applicant's tenant was paid $ 1200 of deposit when they signed the agreement.
sentence_12: (dist :0.02358)
	 - Applic

**excerpts:**
sentence_1: (dist :0.02702)
	 - applicant also has a police report.
sentence_2: (dist :0.02437)
	 - Applicant owes $ 20,000.
sentence_3: (dist :0.02387)
	 - Applicants have reported this to the police and have filed with the magistrate..
sentence_4: (dist :0.02267)
	 - Applicant has his medical reports..
sentence_5: (dist :0.02178)
	 - Applicant visited his house once to deliver the police report.
sentence_6: (dist :0.02128)
	 - Applicant wants to file a magistrate's complaint.
sentence_7: (dist :0.02077)
	 - The applicant has filed a magistrate complaint.
sentence_8: (dist :0.0204)
	 - Applicant wants to sue him.
sentence_9: (dist :0.01965)
	 - Applicant had a girlfriend in 2005.
sentence_10: (dist :0.01899)
	 - Applicants found out where she lived.
sentence_11: (dist :0.01855)
	 - The applicant can also speak Vietnamese.
sentence_12: (dist :0.01807)
	 - The applicants were also fined..
sentence_13: (dist :0.01798)
	 - The applicant's property has been seized by the poli

## Both Advice and Synopsis

In [19]:
out_both = {}

for casetype, num_cases in top_n_casestypes:
    print('\n\n\nCasetype: ' + 
          casetype + 
          ' (' + 
          str(num_cases) + 
          ' cases)')
    
    out_both[casetype] = textrank_generate(df, 
                                           casetype, 
                                           col='both', 
                                           sim_score_threshold=sim_score_threshold, 
                                           phrase_limit=phrase_limit, 
                                           word_limit=word_limit, 
                                           to_print=to_print, 
                                           to_plot=to_plot)





Casetype: Bankruptcy / DRS (804 cases)

Columns used: advice and synopsis
**excerpts:**
sentence_1: (dist :0.03857)
	 - Bank filed bankruptcy against applicant.
sentence_2: (dist :0.03802)
	 - Should the applicant file for bankruptcy himself?
sentence_3: (dist :0.03442)
	 - Applicant applied for bankruptcy.
sentence_4: (dist :0.03429)
	 - Can the applicant self-declare bankruptcy instead.
sentence_5: (dist :0.03381)
	 - Banks will not file bankruptcy against him.
sentence_6: (dist :0.0334)
	 - Applicant facing bankruptcy.
sentence_7: (dist :0.03187)
	 - Applicant wants to file for bankruptcy.
sentence_8: (dist :0.03068)
	 - Applicant owes the bank $ 80000 in debt and wants to file for bankruptcy..
sentence_9: (dist :0.03014)
	 - How to avoid bankruptcy from the bank?.
sentence_10: (dist :0.03)
	 - How to file a bankruptcy application?
sentence_11: (dist :0.02989)
	 - Applicant came from a bankruptcy hearing.
sentence_12: (dist :0.02896)
	 - The applicant now wishes to apply for bank

**excerpts:**
sentence_1: (dist :0.02724)
	 - Applicant is the landlord of a tenant.
sentence_2: (dist :0.0271)
	 - The applicant has been to the small claims court.
sentence_3: (dist :0.02694)
	 - The Applicant was a tenant, and wishes to claim his deposit from his landlord.
sentence_4: (dist :0.02513)
	 - The applicant had a tenancy agreement dispute with landlord.
sentence_5: (dist :0.02509)
	 - Applicant is a tenant.
sentence_6: (dist :0.02509)
	 - - Applicant is tenant.
sentence_7: (dist :0.02475)
	 - How can the applicant, the tenant, dispute this?.
sentence_8: (dist :0.02427)
	 - The applicant's tenant was paid $ 1200 of deposit when they signed the agreement.
sentence_9: (dist :0.02408)
	 - Tenant is not paying rent to the applicant..
sentence_10: (dist :0.02384)
	 - Applicant's tenancy agreement is for 12 months and has been stamped.
sentence_11: (dist :0.02336)
	 - The applicant is facing the tenancy disputes with her tenant.
sentence_12: (dist :0.02333)
	 - - Applicants have

**excerpts:**
sentence_1: (dist :0.03971)
	 - Applicant is a PR.
sentence_2: (dist :0.0395)
	 - The applicant is a harassment case.
sentence_3: (dist :0.03846)
	 - Applicant's deposit of $ 1000 is still with them.
sentence_4: (dist :0.03792)
	 - Applicant is here regarding harassment.
sentence_5: (dist :0.03748)
	 - applicant was harassed.
sentence_6: (dist :0.0336)
	 - Applicant is being physically and verbally attacked by her neighbour.
sentence_7: (dist :0.03309)
	 - Applicant's friend was his girlfriend.
sentence_8: (dist :0.03252)
	 - Applicant is now suspended by his company.
sentence_9: (dist :0.03234)
	 - The applicant is the director of the company.
sentence_10: (dist :0.0323)
	 - The applicant's house was reconstructed.
sentence_11: (dist :0.03208)
	 - Previously, the applicant was a AIA agent.
sentence_12: (dist :0.03202)
	 - Applicant has been in pain.
sentence_13: (dist :0.03183)
	 - Applicant believes that she is mentally unstable.
sentence_14: (dist :0.0316)
	 - Applican

**excerpts:**
sentence_1: (dist :0.03121)
	 - Total money loaned was $ 4000.
sentence_2: (dist :0.0312)
	 - - AP loaned money ( $ 1,300 ) from the applicant.
sentence_3: (dist :0.03118)
	 - The loan was repaid.
sentence_4: (dist :0.02755)
	 - The applicant loaned money to a friend ( ex- colleague ) but the friend is not at his home.
sentence_5: (dist :0.0258)
	 - - Refusal by applicant's friend to repay money owed to applicant.
sentence_6: (dist :0.02556)
	 - This is a personal loan.
sentence_7: (dist :0.02497)
	 - Applicant owes 3 bank loans from credit cards.
sentence_8: (dist :0.02479)
	 - Applicant loaned his ex- girlfriend some money, around $ 170,000 in May 2014.
sentence_9: (dist :0.02466)
	 - Applicant loaned money to another party-$ 11,500 ( cash ) 8 months ago.
sentence_10: (dist :0.02441)
	 - The money is for investment.
sentence_11: (dist :0.02408)
	 - Whether Applicant may be able to claim the sum of money and through which means..
sentence_12: (dist :0.02394)
	 - The appl