### Basic Article Enrichment and Relationship Structure

In [1]:
import re
import networkx as nx
import matplotlib.colors as clr
from utils.graph import Graph_Viz_Engine
from utils.io import read_in


In [2]:
# Location of the JSON file to load (presumably outbound requests keyed by domain, per its name).
article_relations = './data/outbound-requests-by-domain-2.json'
# Load it through the project's read_in helper; later cells call .keys() on the result
# and index each entry by string keys such as 'content'.
relations = read_in(article_relations)

In [3]:
def extract_domain(url: str) -> str:
    """Return the leading host labels of *url*, concatenated without dots.

    The first run of dot-terminated labels in ``url`` is located. A final
    label that is not followed by a dot (typically the TLD, e.g. ``com``) is
    therefore left out. Labels that are exactly ``www`` are dropped and the
    remaining labels are joined with nothing between them, so
    ``https://www.nytimes.com/a`` yields ``nytimes``.

    Args:
        url: A URL or bare host name.

    Returns:
        The concatenated labels, e.g. ``"nytimes"``.

    Raises:
        ValueError: If ``url`` contains no dot-terminated label at all.
    """
    # A label is alphanumeric runs separated by hyphens. Writing it without
    # nested quantifiers keeps matching linear and accepts labels with
    # several hyphens such as "my-cool-site".
    pattern = r"(?:[A-Za-z0-9]+(?:-+[A-Za-z0-9]+)*\.)+"
    match = re.search(pattern, url)
    if match is None:
        raise ValueError(f"no domain-like labels found in {url!r}")

    # Drop only whole "www" labels; a plain substring replace would also
    # mangle names that merely contain "www" (e.g. "awwwards").
    labels = match.group().split('.')
    return ''.join(label for label in labels if label != 'www')

In [4]:
# Re-key the loaded relations by the short name extract_domain derives from
# each original key (later cells look up 'nytimes'). If two original keys
# reduce to the same short name, the one iterated last wins.
n_relations = {extract_domain(key): value for key, value in relations.items()}

In [5]:
# Text stored under the 'content' key of the 'nytimes' entry; it is wrapped in a TextBlob later.
t_blob = n_relations['nytimes']['content']

In [7]:
from textblob import TextBlob
from textblob import Word
from utils.stopwords import stopwords
from string import punctuation


In [8]:
# Wrap the text in a TextBlob; later cells read its sentences, words, word counts and raw text.
nyt_blob = TextBlob(t_blob)


In [10]:
# Sentiment polarity of every sentence in the article, in document order.
polarities = [sentence.sentiment.polarity for sentence in nyt_blob.sentences]

# Mean polarity across sentences. A blob with no sentences is reported as 0.0
# rather than raising ZeroDivisionError.
avg_polarity_of_sentences = sum(polarities) / len(polarities) if polarities else 0.0

In [11]:
# Build the stop list once instead of once per candidate word.
# NOTE(review): assumes stopwords() returns an equivalent collection on every call — confirm.
stop_words = stopwords(additional=list(punctuation))

# Distinct words ordered from most to least frequent, keeping only those that
# are not in the stop list and are at least three characters long.
filtered_words = [
    word
    for word in sorted(nyt_blob.word_counts, key=nyt_blob.word_counts.get, reverse=True)
    if word not in stop_words and len(word) >= 3
]

In [12]:
# Pair every retained word with the count reported by the blob's word list
# for that word, preserving the order of filtered_words.
filtered_words_scored = list(
    zip(filtered_words, map(nyt_blob.words.count, filtered_words))
)

In [13]:
# Set copy of the retained words so each membership test is a hash lookup
# instead of a scan over the whole list.
retained_words = set(filtered_words)

# Whitespace-separated tokens of the raw text that are retained words, kept in
# their original order (duplicates included).
reduced_raw = [token for token in nyt_blob.raw.split() if token in retained_words]

In [14]:
# Join the retained tokens with single spaces and wrap the result in a new TextBlob.
reduced_blob =  TextBlob(' '.join(reduced_raw))

In [32]:
# Turn each 10-word n-gram of the reduced text into a pseudo-sentence: its
# words joined by single spaces, followed by a space and a '.'.
reduced_sentences = [
    ' '.join([*gram, '.'])
    for gram in reduced_blob.ngrams(10)
]


In [16]:
# Build the stop list once instead of once per word of every sentence.
# NOTE(review): assumes stopwords() returns an equivalent collection on every call — confirm.
stop_words = stopwords(additional=list(punctuation))

# One string per sentence: its lower-cased words joined by spaces, keeping only
# those not in the stop list and at least three characters long.
cleaned_sentences = []
for sentence in nyt_blob.sentences:
    # A separate name avoids rebinding the loop variable to the filtered list.
    kept_words = [
        word
        for word in sentence.words.lower()
        if word not in stop_words and len(word) >= 3
    ]
    cleaned_sentences.append(' '.join(kept_words))

In [17]:
from utils.nlp import Skip_Gram, Text_Tools

In [35]:
# Number of epochs handed to the model's train() call.
epochs = 1000

# Concatenate the pseudo-sentences into one string, then preprocess it with
# the stop list extended by every punctuation character.
corpus_text = ' '.join(reduced_sentences)
training_stop_list = stopwords(additional=list(punctuation))
training_data = Text_Tools.preprocessing(corpus_text, training_stop_list)

# Create the skip-gram model and pass it the preprocessed data.
Model = Skip_Gram()
Text_Tools.prepare_data_for_training(training_data, Model)

In [36]:
# Train the model for `epochs` epochs; the captured output below shows a loss value per epoch.
Model.train(epochs)

 | epoch 1 | loss: 55780.1270781515 
 | epoch 2 | loss: 55397.60993668581 
 | epoch 3 | loss: 55024.66282691354 
 | epoch 4 | loss: 54660.21730148519 
 | epoch 5 | loss: 54303.28014157387 
 | epoch 6 | loss: 53952.93236695419 
 | epoch 7 | loss: 53608.32862305345 
 | epoch 8 | loss: 53268.69697205308 
 | epoch 9 | loss: 52933.33913046103 
 | epoch 10 | loss: 52601.631213588036 
 | epoch 11 | loss: 52273.02506699646 
 | epoch 12 | loss: 51947.05028276214 
 | epoch 13 | loss: 51623.31700664344 
 | epoch 14 | loss: 51301.51962543812 
 | epoch 15 | loss: 50981.44135354532 
 | epoch 16 | loss: 50662.959567152364 
 | epoch 17 | loss: 50346.05139735418 
 | epoch 18 | loss: 50030.79852150308 
 | epoch 19 | loss: 49717.38927466019 
 | epoch 20 | loss: 49406.11531384816 
 | epoch 21 | loss: 49097.35962469079 
 | epoch 22 | loss: 48791.57352768618 
 | epoch 23 | loss: 48489.2432267447 
 | epoch 24 | loss: 48190.85081059146 
 | epoch 25 | loss: 47896.838023326374 
 | epoch 26 | loss: 47607.5806499

In [37]:
# Query the trained model with the word "bomb"; the output below is a list of 7 words,
# matching the second argument (presumably the number of predictions requested).
Model.predict("bomb", 7)

['liberal',
 'clash',
 'planted',
 'relationship',
 'describing',
 'fighting',
 'scene']

In [38]:
# Query the trained model with the word "daughter"; the output below is a list of 3 words,
# matching the second argument (presumably the number of predictions requested).
Model.predict("daughter", 3)

['political', 'philosopher', 'killing']

In [40]:
# Query the trained model with the word "war"; the output below is a list of 7 words,
# matching the second argument (presumably the number of predictions requested).
Model.predict("war", 7)

['attack', 'society', 'war', 'proponents', 'win', 'six-month', 'inevitable']