In [1]:
import json
from heapq import nsmallest, nlargest
from typing import Iterator, List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
import nltk
from neo4j import GraphDatabase

from wordExplorer.utils import getWordData, loadWord2Vec, jsonSimi
from wordExplorer.thesa import bighugeCrawl

In [2]:
def add_words(tx, first: str, sec: List[str], similarity: List[float]):
    """Write one word and its scored synonyms through a transaction.

    For every (synonym, score) pair a Cypher statement is run that merges a
    `Word` node for `first`, a `Word` node for the synonym, and a `SYN`
    relationship from the former to the latter carrying the score as `sim`.

    Args:
        tx: Object exposing `run(query, **parameters)`; the notebook passes this
            function to `session.execute_write`.
        first: The head word.
        sec: Synonyms of `first`.
        similarity: Scores, paired positionally with `sec`. Pairing uses `zip`,
            so surplus items in the longer list are ignored.
    """
    # NOTE(review): `sim` is part of the MERGE pattern, so re-inserting a pair
    # with a different score would add a second SYN relationship rather than
    # update the existing one -- confirm this is intended.
    query = (
        "MERGE (a:Word {word: $first}) "
        "MERGE (b:Word {word: $second})"
        "MERGE (a)-[:SYN {sim: $sim}]->(b)"
    )
    scored_synonyms = zip(sec, similarity)
    for synonym, score in scored_synonyms:
        tx.run(query, first=first, second=synonym, sim=score)

In [3]:
def getCalculator(mName: str = None, uri: str = None, auth: tuple = None):
    """Create a `calcGraph` closure bound to one similarity model and one Neo4j driver.

    Args:
        mName: Optional model name forwarded to `loadWord2Vec`; when None,
            `loadWord2Vec()` is called without arguments.
        uri: Bolt URI of the Neo4j server. When None, the `NEO4J_URI`
            environment variable is used, falling back to the address this
            notebook used originally.
        auth: `(user, password)` tuple. When None, `NEO4J_USER` and
            `NEO4J_PASSWORD` are read from the environment, falling back to the
            notebook's original development credentials.

    Returns:
        The `calcGraph` function defined below.
    """
    # Function-scope import keeps this cell runnable on its own.
    import os

    if mName is not None:
        model = loadWord2Vec(mName)
    else:
        model = loadWord2Vec()

    # Connection settings are overridable so real credentials need not live in
    # the notebook; the literals are only last-resort defaults.
    if uri is None:
        uri = os.environ.get("NEO4J_URI", "bolt://172.31.128.1:7687/")
    if auth is None:
        auth = (os.environ.get("NEO4J_USER", "neo4j"),
                os.environ.get("NEO4J_PASSWORD", "1234"))

    # NOTE(review): the driver lives as long as the closure and is never closed here.
    driver = GraphDatabase.driver(uri, auth=auth)

    def calcGraph(crawlType, word: str, loc: str = './data/', apiKwargs: dict = None, word2vecKwargs: dict = None) -> nx.Graph:
        """Load or crawl the synonym data for `word`, store it in Neo4j, and return it as a graph.

        Args:
            crawlType: Passed through unchanged to `getWordData`.
            word: Word whose synonym data is looked up.
            loc: Base directory (with trailing separator) that holds the
                `synonyms/` and `weights/` JSON caches.
            apiKwargs: Extra keyword arguments for `getWordData`.
            word2vecKwargs: Extra keyword arguments for the similarity model.

        Returns:
            An undirected graph with one edge per (key, synonym) pair that has
            a weight, the weight stored in the `weight` edge attribute.
        """
        if apiKwargs is None:
            apiKwargs = {}

        if word2vecKwargs is None:
            word2vecKwargs = {}

        # Raw synonym structure: prefer the cached JSON, otherwise crawl.
        # NOTE(review): nothing is written to the cache here; presumably
        # getWordData persists its result under the directory it is given -- confirm.
        try:
            with open(loc + f'synonyms/{word}.json', "r", encoding="utf-8") as f:
                rawdata = json.load(f)
        except FileNotFoundError:
            rawdata = getWordData(crawlType, word, loc + 'synonyms/', **apiKwargs)

        # Edge weights: prefer the cached JSON, otherwise compute with the model.
        # Freshly computed weights are not written back to disk.
        try:
            with open(loc + f'weights/{word}.json', "r", encoding="utf-8") as f:
                weights = jsonSimi(json.load(f))
        except FileNotFoundError:
            weights, failed = model(rawdata, **word2vecKwargs)
            print(f'Total failed: {failed}')

        # Keep only the (key, synonym) pairs for which a weight is available.
        dbData = {}
        for key, syns in rawdata.items():
            scored = [(syn, weights[(key, syn)]) for syn in syns if (key, syn) in weights]
            dbData[key] = [[syn for syn, _ in scored], [weight for _, weight in scored]]

        with driver.session(database="neo4j") as session:
            for key, (synList, synWeight) in dbData.items():
                # add_words runs no statement for an empty list, so skip the
                # transaction entirely in that case.
                if synList:
                    session.execute_write(add_words, key, synList, synWeight)

        # Build the graph promised by the return annotation from the same
        # filtered pairs that were written to the database.
        graph = nx.Graph()
        for key, (synList, synWeight) in dbData.items():
            for syn, weight in zip(synList, synWeight):
                graph.add_edge(key, syn, weight=weight)

        return graph

    return calcGraph

def posTag(char: str) -> List[Tuple[str, str]]:
    """Tokenize `char` and tag every token with its part of speech.

    The text is split with `nltk.word_tokenize` and tagged with
    `nltk.pos_tag`; tokens recognised as punctuation get the tag 'PUN' in
    place of the tagger's tag.

    Args:
        char: Text to tag.

    Returns:
        A list of `(token, tag)` tuples in token order.
    """
    punc = ".,!?;“”"

    tagged = nltk.pos_tag(nltk.word_tokenize(char))

    # `in` on a string is a substring test, so any token that is a contiguous
    # substring of `punc` (not only the single characters) is tagged 'PUN'.
    return [(token, 'PUN' if token in punc else pos) for token, pos in tagged]

def lemmatize(char: str) -> Iterator[str]:
    """Lazily lemmatize every token of `char` with the WordNet lemmatizer.

    Tokens and tags come from `posTag`. The first letter of each tag selects
    the WordNet part of speech ('J' -> 'a', 'V' -> 'v', 'N' -> 'n',
    'R' -> 'r'); any other tag, including an empty one, falls back to 'n'.

    Args:
        char: Text to lemmatize.

    Returns:
        An iterator over the lemmas, in token order. Tagging happens when this
        function is called; lemmatization happens as the iterator is consumed.
    """
    lmn = nltk.stem.WordNetLemmatizer()

    wordnetConv = {
        'J': 'a',
        'V': 'v',
        'N': 'n',
        'R': 'r'
    }

    tagged = posTag(char)

    # tag[:1] instead of tag[0]: an empty tag maps to the default, not IndexError.
    return (lmn.lemmatize(token, wordnetConv.get(tag[:1], 'n')) for token, tag in tagged)

In [4]:
# Build the calculator once, with no model name given; the returned closure is
# reused by the cells below.
calc = getCalculator()

In [5]:
# Words to feed through `calc`, one synonym network per entry.
# Entries that are commented out are skipped by the insertion loop
# (presumably already processed in an earlier run -- TODO confirm).
obama = [
    # 'entangled',
    # 'request',
    # 'surge',
    # 'migrants',
    # 'flood',
    # 'children',
    # 'beefing',
    # 'presence',
    # 'tougher',
    # 'enforcement',
    # 'strong',
    # 'misgivings',
    # 'charged',
    # 'shadow',
    # 'politics',
    # 'emphasize',
    # 'trip',
    # 'relax',
    # 'deportations',
    # 'round',
    'discussion',
    'sweeping',
    'actions',
    'path',
    'bill',
    'flexibility',
    'treat',
    'combating',
    'trafficking',
    'aggressive',
    'plan',
    'cut',
    'process'
]

In [6]:
# Run every listed word through the calculator, announcing each one first.
# Only the first lemma produced for an entry is passed on.
for word in obama:
    print(f'Attempting insertion for: {word}')
    calc(
        bighugeCrawl,
        list(lemmatize(word))[0],
        './data/crawlers/hugelabs/',
        apiKwargs={'order': 3, 'waitTime': 0.5},
    )

Attempting insertion for: discussion
Total failed: 833
Attempting insertion for: sweeping
Got error code: 404 when trying word defecation
Total failed: 7373
Attempting insertion for: actions
Total failed: 4522
Attempting insertion for: path
Total failed: 1898
Attempting insertion for: bill
Total failed: 5497
Attempting insertion for: flexibility
Total failed: 15
Attempting insertion for: treat
Total failed: 6017
Attempting insertion for: combating
Total failed: 1156
Attempting insertion for: trafficking
Total failed: 658
Attempting insertion for: aggressive
Total failed: 44
Attempting insertion for: plan
Got error code: 404 when trying word cause to be perceived
Total failed: 1870
Attempting insertion for: cut
Got error code: 404 when trying word cause to be perceived
Got error code: 404 when trying word cock walk
Got error code: 404 when trying word defecate
Total failed: 20409
Attempting insertion for: process
Got error code: 500 when trying word 0
Got error code: 404 when trying wor

In [None]:
# Bind the result of `calc` for the word 'surge' (crawl order 2) to `g`.
# The following cells treat `g` as a networkx graph.
# NOTE(review): a later cell calls nx.shortest_path_length(g, "flood"), which
# looks like it expects the composed surge + flood graph from the disabled
# lines below -- confirm before running.
# g = nx.from_dict_of_dicts(net)
g = calc(bighugeCrawl, 'surge', './data/crawlers/hugelabs/', apiKwargs={'order': 2})
# h = calc(bighugeCrawl, 'flood', './data/crawlers/hugelabs/', apiKwargs={'order': 3})

# g = nx.compose(g, h)
# g = nx.intersection(g, h)

In [None]:
# Show the string form of `g` as a quick sanity check.
print(g)

In [None]:
# Edge line widths: weight ** 4 / 12 per edge; an edge without a "weight"
# attribute counts as 1.0.
edges = [w ** 4 / 12 for _, _, w in g.edges(data="weight", default=1.0)]

# Colour lookup keyed by integer: 1..12 map onto the palette below, while
# 0 and -2 get dedicated colours.
colDict = dict(enumerate([
    '#ff0000',
    '#ff4500',
    '#ff8900',
    '#ffce00',
    '#ebff00',
    '#a6ff00',
    '#62ff00',
    '#1dff00',
    '#00ff28',
    '#00ff6d',
    '#00ffb1',
    '#00fff6',
], start=1))
colDict[0] = "#00FF51"
colDict[-2] = "#00ffff"

# Hop distance of every reachable node from "flood"; 'surge' is forced to the
# special key -2.
depth = nx.shortest_path_length(g, "flood")
depth['surge'] = -2

# One colour per node in node order; nodes missing from `depth`, or whose
# depth has no palette entry, fall back to "#00ffff".
nodeCols = [colDict.get(depth.get(node, -1), "#00ffff") for node in g.nodes()]

In [None]:
# Compute node positions for drawing with the Kamada-Kawai layout; the
# circular layout below is kept as a disabled alternative.
pos = nx.kamada_kawai_layout(g)
# pos = nx.circular_layout(g)

In [None]:
# Draw edges, then nodes, on a 9x9 figure; labels are left off.
plt.figure(num=1, figsize=(9, 9))
nx.draw_networkx_edges(g, pos=pos, width=edges)
nx.draw_networkx_nodes(g, pos=pos, node_size=8, node_color=nodeCols)

In [None]:
# Number of nodes to report.
n = 10

# Centrality measure and the selector used to rank it. Disabled alternatives:
# a, extrema = nx.centrality.eigenvector_centrality(g, max_iter=10**4, weight='weight'), nlargest
# a, extrema = nx.centrality.harmonic_centrality(g, distance='weight'), nlargest
# NOTE(review): 'weight' is used as a distance here; if it holds a similarity
# score, a larger value would mean "farther apart" -- confirm this is intended.
a = nx.centrality.closeness_centrality(g, distance='weight')
extrema = nlargest

# Report the n top-ranked nodes with their scores.
ext = extrema(n, a, key=a.get)
for key in ext:
    print(f'Extrema: {key}, {a[key]:.3f}')