In [16]:
from nlp_utils import *

import networkx as nx

from pyvis.network import Network

from operator import itemgetter

import matplotlib.pyplot as plt

In [6]:
def lowercase_text(input_text):
    """Return ``input_text`` with every cased character lowered (``str.lower``)."""
    return input_text.lower()

def normalize_whitespace(input_text):
    """Collapse every run of whitespace in ``input_text`` into one space.

    Tabs, newlines and repeated spaces all become a single ``" "``. Leading
    and trailing whitespace is collapsed to one space, not stripped.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence,
    # which recent Python versions warn about at compile time.
    return re.sub(r"\s+", " ", input_text)


def replace_html_tags(input_text):
    """Delete every ``<...>`` span (shortest match) from ``input_text``.

    Each tag is replaced by the empty string, so the text on either side of
    a tag ends up directly adjacent. ``.`` does not match newlines here, so
    a tag that spans several lines is left untouched.
    """
    tag_pattern = '<.*?>'
    without_tags = re.sub(tag_pattern, '', input_text)
    return without_tags

def remove_emails(input_text):
    """Strip e-mail addresses from ``input_text``.

    Anything shaped like ``local@domain.tld`` is replaced by the empty
    string; surrounding whitespace is left as is. Matching ignores case, so
    the function also works on text that has not been lowercased first.
    """
    email_pattern = r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)'
    # IGNORECASE: the character classes only list lowercase letters, so
    # without it an address containing capitals would be skipped.
    return re.sub(email_pattern, "", input_text, flags=re.IGNORECASE)

def remove_urls(input_text):
    """Strip ``http``/``https``/``ftp``/``ssh`` URLs from ``input_text``.

    A URL is matched as scheme, ``://``, a dotted host name and an optional
    path/query tail; each match is replaced by the empty string.
    """
    url_pattern = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    return re.sub(url_pattern, '', input_text)

def normalize_unicode(input_text):
    """Reduce ``input_text`` to plain ASCII.

    The text is NFKD-normalized (splitting accented characters into base
    character plus combining marks), then every character that cannot be
    encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def replace_abbreviations(x):
    """Return ``x`` after passing it through ``contractions.fix``.

    An earlier hand-rolled lookup over an ``abbreviations`` mapping was
    dropped in favour of the ``contractions`` package.
    """
    expanded = contractions.fix(x)
    return expanded

def get_just_words(doc):
    """Remove every character from ``doc`` that is neither a word character nor a space.

    Punctuation, brackets, tabs and newlines are deleted outright (not
    replaced by a space); letters, digits, underscores and literal spaces
    are kept.
    """
    non_word_run = r'[^\w ]+'
    return re.sub(non_word_run, "", doc)

def remove_stopwords(doc):
    """Drop stopwords from ``doc`` and re-attach free-standing punctuation.

    ``doc`` is split on whitespace, tokens found in ``english_stopwords``
    are discarded and the rest are re-joined with single spaces. A lone
    ``.``, ``,``, ``!`` or ``?`` surrounded by whitespace is then glued to
    the word before it.
    """
    kept_tokens = []
    for token in doc.split():
        if token not in english_stopwords:
            kept_tokens.append(token)
    result = ' '.join(kept_tokens)

    # Applied one after another, in this order; each pass sees the output
    # of the previous one.
    punctuation_fixes = (
        (r"(\s[.]\s)", '. '),
        (r"(\s[,]\s)", ', '),
        (r"(\s[!]\s)", '! '),
        (r"(\s[?]\s)", '? '),
    )
    for spaced_pattern, attached in punctuation_fixes:
        result = re.sub(spaced_pattern, attached, result)

    return result

def lemmatize(doc):
    """Tokenize ``doc`` with ``word_tokenize`` and lemmatize each token.

    Every token goes through ``lemmatizer.lemmatize``; the results are
    joined back together with single spaces.
    """
    lemmas = []
    for token in word_tokenize(doc):
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

In [225]:
def remove_footnotes(doc):
    """Delete bracketed numeric footnote markers such as ``[7]`` or ``[123]``.

    Only markers holding one to three digits are removed; anything else in
    square brackets (letters, longer numbers) is left in place.
    """
    footnote_marker = r"\[[\d]{1,3}\]"
    return re.sub(footnote_marker, "", doc)

In [226]:
def clean_text(input_text):
    """
    Run the full normalization pipeline over ``input_text`` and return the result.

    Stages, in order: lowercase, collapse whitespace, strip HTML tags,
    e-mail addresses and URLs, drop numeric footnote markers, fold to ASCII,
    expand abbreviations, strip punctuation, remove stopwords, lemmatize.

    Infranodus proper uses a separate stage to remove stopwords after normalizing the text
    """
    out_text = lowercase_text(input_text)
    out_text = normalize_whitespace(out_text)
    out_text = replace_html_tags(out_text)
    out_text = remove_emails(out_text)
    out_text = remove_urls(out_text)
    # Footnote markers must go while their brackets still exist: once
    # get_just_words() has stripped punctuation, "[10][11]" is just "1011"
    # glued onto the neighbouring word and can no longer be recognised.
    out_text = remove_footnotes(out_text)
    out_text = normalize_unicode(out_text)
    out_text = replace_abbreviations(out_text)
    out_text = get_just_words(out_text)
    out_text = remove_stopwords(out_text)
    out_text = lemmatize(out_text)

    return out_text

In [227]:
def build_word_network(input_text):
    """Build a weighted word co-occurrence graph from ``input_text``.

    The text is split on whitespace and scanned with a sliding window of
    four words. In every full window the first word is linked to the
    following three with weights 3, 2 and 1 (nearest neighbour heaviest).
    Windows shorter than four words (text under four words, and the tail
    of the text) add no edges.

    Each node also gets a ``size`` attribute equal to ``2 * degree``.

    Parameters
    ----------
    input_text : str
        Whitespace-separated tokens, typically the output of ``clean_text``.

    Returns
    -------
    networkx.Graph
        Undirected graph with ``weight`` on edges and ``size`` on nodes.
    """
    # Split once, and split the argument itself: the graph must not depend
    # on whatever a module-level `cleaned_text` variable happens to hold.
    words = input_text.split()

    G = nx.Graph()
    # zip stops at the shortest slice, so only complete 4-word windows are seen.
    for first, second, third, fourth in zip(words, words[1:], words[2:], words[3:]):
        # Re-adding an existing edge updates its attributes, so a repeated
        # pair keeps the weight from its latest window rather than a sum.
        G.add_edge(first, second, weight=3)
        G.add_edge(first, third, weight=2)
        G.add_edge(first, fourth, weight=1)

    # Scale node size by naive degree: size = scale * degree.
    scale = 2
    sizes = {node: scale * degree for node, degree in G.degree}
    nx.set_node_attributes(G, sizes, 'size')

    return G

def top_bet_cen(G, n):
    """Return the ``n`` nodes of ``G`` with the highest betweenness centrality.

    Centrality is computed normalized and without endpoints. The result is
    a ``{node: score}`` dict ordered from highest to lowest score.
    """
    scores = nx.betweenness_centrality(G, normalized=True, endpoints=False)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n])

def top_close_cen(G, n):
    """Return the ``n`` nodes of ``G`` with the highest closeness centrality.

    The result is a ``{node: score}`` dict ordered from highest to lowest
    score.
    """
    scores = nx.closeness_centrality(G)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n])

def top_PR_cen(G, alpha=0.8, n=5):
    """Return the ``n`` nodes of ``G`` with the highest PageRank.

    Parameters
    ----------
    G : networkx graph
        Graph to rank.
    alpha : float, default 0.8
        Damping parameter forwarded to ``nx.pagerank``.
    n : int, default 5
        Number of top-ranked nodes to keep.

    Returns
    -------
    dict
        ``{node: score}`` ordered from highest to lowest score.
    """
    # Forward the caller's alpha; a hard-coded 0.8 here would silently
    # ignore the argument.
    pr = nx.pagerank(G, alpha=alpha)
    return dict(sorted(pr.items(), key=itemgetter(1), reverse=True)[:n])

In [228]:
def pyvis_from_nx(G):
    """Wrap the networkx graph ``G`` in a notebook-mode pyvis ``Network``.

    The canvas is created 1000px high and 750px wide, then populated from
    ``G`` via ``Network.from_nx``. The caller renders it with ``.show(...)``.
    """
    network = Network(height='1000px', width='750px', notebook=True)
    network.from_nx(G)
    return network

In [229]:
# Quick check of footnote stripping on the current sample text. This goes
# through the shared helper so the regex is not duplicated here.
# NOTE: `testtext` is assigned in cells further down; run one of those first.
remove_footnotes(testtext)

"On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which began in 2014. The invasion has likely resulted in tens of thousands of deaths on both sides and caused Europe's largest refugee crisis since World War II, with an estimated 8 million people being displaced within the country by late May as well as 7.8 million Ukrainians fleeing the country as of 8 November 2022. Within five weeks of the invasion, Russia experienced its greatest emigration since the 1917 October Revolution. The invasion has also caused global food shortages."

In [2]:
# Sample passage describing the SrcRR routing protocol. Later cells rebind
# `testtext` to other passages.
testtext = """The routing protocol is called SrcRR. There are two broadcasts used with the protocol. The first is periodic broadcasts used to determine a metric called ETX. These public broadcasts measure the probability that a packet between two nodes in radio contact reaches its destination. The second broadcast type is used to build up routing tables. A node 0 will broadcast that it wants to find a route to D. Then each node that receives the broadcast will add its id to the route and forward the packet. When node D receives a packet, it will reply back along the route that was found for that packet. Then node 0 can use this information to determine the best route using the ETX metrics and the route information returned from its query."""

In [235]:
# Sample passage on the 2022 Russian invasion of Ukraine; it contains
# bracketed numeric footnote markers such as [10].
testtext = """On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which began in 2014. The invasion has likely resulted in tens of thousands of deaths on both sides and caused Europe's largest refugee crisis since World War II,[10][11] with an estimated 8 million people being displaced within the country by late May as well as 7.8 million Ukrainians fleeing the country as of 8 November 2022.[12][13][14][15] Within five weeks of the invasion, Russia experienced its greatest emigration since the 1917 October Revolution.[16] The invasion has also caused global food shortages.[17][18]"""

# Pipeline: clean the passage -> word co-occurrence graph -> pyvis network,
# then show it under the given HTML file name.
cleaned_text = clean_text(testtext)
G = build_word_network(cleaned_text)
nt = pyvis_from_nx(G)
nt.show('RUS UKR Invasion 2022.html')

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [236]:
# Sample passage on the Apollo 17 mission. Rebinds `testtext`, `cleaned_text`,
# `G` and `nt` from the previous cell.
testtext = """Apollo 17 (December 7–19, 1972) was the final mission of NASA's Apollo program, the most recent time humans have set foot on the Moon or traveled beyond low Earth orbit. Commander Gene Cernan and Lunar Module Pilot Harrison Schmitt walked on the Moon, while Command Module Pilot Ronald Evans orbited above. Schmitt was the only professional geologist to land on the Moon; he was selected in place of Joe Engle, as NASA had been under pressure to send a scientist to the Moon. The mission's heavy emphasis on science meant the inclusion of a number of new experiments, including a biological experiment containing five mice that was carried in the command module."""

# Same pipeline as the other sample cells: clean -> graph -> pyvis -> show.
cleaned_text = clean_text(testtext)

G = build_word_network(cleaned_text)

nt = pyvis_from_nx(G)

nt.show('Apollo 17.html')

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [237]:
# Sample passage on J. R. R. Tolkien (two paragraphs). It contains IPA
# characters and a lettered note marker "[a]". The `G` built here is the
# last one assigned, so the analysis cells below operate on this graph.
testtext = """John Ronald Reuel Tolkien CBE FRSL (/ˈruːl ˈtɒlkiːn/, ROOL TOL-keen;[a] 3 January 1892 – 2 September 1973) was an English writer and philologist. He was the author of the high fantasy works The Hobbit and The Lord of the Rings.

From 1925 to 1945, Tolkien was the Rawlinson and Bosworth Professor of Anglo-Saxon and a Fellow of Pembroke College, both at the University of Oxford. He then moved within the same university to become the Merton Professor of English Language and Literature and Fellow of Merton College, and held these positions from 1945 until his retirement in 1959. Tolkien was a close friend of C. S. Lewis, a co-member of the informal literary discussion group The Inklings. He was appointed a Commander of the Order of the British Empire by Queen Elizabeth II on 28 March 1972."""

# Same pipeline as the other sample cells: clean -> graph -> pyvis -> show.
cleaned_text = clean_text(testtext)

G = build_word_network(cleaned_text)

nt = pyvis_from_nx(G)
nt.show('tolkien.html')

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [241]:
# Detect communities in the most recently built graph `G` by label propagation.
# NOTE(review): the recorded run of this cell raised
# "AttributeError: 'dict_values' object has no attribute 'values'". The cause
# is not visible in this notebook; check the installed networkx version.
communities = nx.algorithms.community.label_propagation_communities(G)

AttributeError: 'dict_values' object has no attribute 'values'

In [217]:
# Betweenness centrality for every node of the current graph `G`
# (normalized, endpoints excluded); the bare expression displays the dict.
nx.betweenness_centrality(G, normalized=True, endpoints=False)

{'john': 0.0,
 'ronald': 0.000271370420624152,
 'reuel': 0.0011985526910900043,
 'tolkien': 0.5367360526528957,
 'cbe': 0.0132066985330313,
 'frsl': 0.029793040493490797,
 'rul': 0.06722432473715587,
 'tlkin': 0.0124537674431486,
 'rool': 0.018720169502325912,
 'tolkeena': 0.025041440925459568,
 '3': 0.008643724448593383,
 'january': 0.008898057484669875,
 '1892': 0.00860341997934667,
 '2': 0.023735908356776338,
 'september': 0.015565227776896706,
 '1973': 0.011245329285562903,
 'english': 0.16865607711354275,
 'writer': 0.010374543040757423,
 'philologist': 0.012644565532054964,
 'author': 0.015242264576128853,
 'high': 0.007207713053484198,
 'fantasy': 0.007310670724087828,
 'work': 0.018442051744803446,
 'hobbit': 0.018339171631792793,
 'lord': 0.018446009391538735,
 'ring': 0.05419372114618651,
 '1925': 0.026678295783636476,
 '1945': 0.07131854308205161,
 'rawlinson': 0.010638918220329354,
 'bosworth': 0.013890231152768245,
 'professor': 0.22947967514054474,
 'anglosaxon': 0.001717