In [1]:
from bs4 import BeautifulSoup
import requests
from word_parsing import tokenize, tag_tokens
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

def displayHeadlines(source, headlines):

    source = []
    count = 0
    for head in headlines:
        if count < 5:
            source.append(head.getText())
            count += 1
        else:
            break

    return source

def journalScrape(top_headlines):

    r1 = requests.get('https://www.thejournal.ie/irish')
    coverpage = r1.content

    soup1 = BeautifulSoup(coverpage, 'html5lib')
    headlines = soup1.find_all('div', class_='text span-5 last')

    complete_headlines = []

    for tag in headlines:
        atag = tag.find_all('h4', class_=None)
        for a in atag:
            text = a.find_all('a')
            headline = displayHeadlines("Journal", text)
            complete_headlines += headline

    top_headlines["Journal"] = complete_headlines[0:5]
    return top_headlines

def RTEScrape(top_headlines):

    r1 = requests.get('https://www.rte.ie/news')
    coverpage = r1.content

    soup1 = BeautifulSoup(coverpage, 'html5lib')
    headlines = soup1.find_all('span', class_='underline')

    headlines = displayHeadlines("RTE", headlines)
    top_headlines["RTE"] = headlines
    return top_headlines

def irishIndependantScrape(top_headlines):
    r1 = requests.get('https://www.independent.ie')
    coverpage = r1.content

    soup1 = BeautifulSoup(coverpage, 'html5lib')
    headlines = soup1.find_all('h3', class_='hx')

    headlines = displayHeadlines("Independant", headlines)
    top_headlines["Independant"] = headlines
    return top_headlines

def irishTimesScrape(top_headlines):
    r1 = requests.get('https://www.irishtimes.com/')
    coverpage = r1.content

    soup1 = BeautifulSoup(coverpage, 'html5lib')
    headlines = soup1.find_all('span', class_='tr-headline')

    headlines = displayHeadlines("Irish Times", headlines)
    top_headlines["Irish Times"] = headlines
    return top_headlines


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def irishSites():
    top_headlines = dict()
    top_headlines = irishTimesScrape(top_headlines)
    top_headlines = irishIndependantScrape(top_headlines)
    top_headlines = RTEScrape(top_headlines)
    top_headlines = journalScrape(top_headlines)
    
    return top_headlines

In [10]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
import tensorflow_hub as hub
import numpy as np

tf.logging.set_verbosity(tf.logging.ERROR)

def tokenize_stop_lemmatize(data):
    """ Check if data structure is a dictionary.
    Tokensize the words in each of the titles.
    Extract the Nouns and Verbs and remove stop words
    """

    stop_words = set(stopwords.words('english'))
    punctuations="?:!.,;"

    if data == dict():
        for source, articles in data.items():
            token_articles = []
            for article in articles:
                    token_article = word_tokenize(article)
                    filtered_article = [w for w in token_article if (not w in stop_words and not w in punctuations)]
                    filtered_article = [w.lower() for w in filtered_article]
                    token_articles.append(filtered_article)

            data[source] = token_articles

        return data

    else:
        token_articles = []

        for article in data:
            token_article = word_tokenize(article)
            filtered_article = [w.lower() for w in token_article if (not w in stop_words and not w in punctuations)]
            token_articles.append(filtered_article)

        return token_articles

def getStringArticles(dictionary):

    article_list = []

    for articles in dictionary.values():
        for article in articles:
            article = (article.replace('\n', ''))
            article_list.append(article)

    return article_list


In [8]:
top_headlines = irishSites()
string_articles = getStringArticles(top_headlines)
token = tokenize_stop_lemmatize(string_articles)
print(token)

[['budget', '2020', 'the', 'main', 'points'], ['brexit', 'johnson', 'allies', 'blame', 'merkel', 'admit', 'deal', 'hopes', 'effectively', 'dead'], ['budget', '2020', 'donohoe', 'allocates', '€1.2bn', 'tackle', 'no-deal', 'brexit'], ['michelin', 'guide', '2020', 'full', 'list', 'irish', 'restaurants', 'judges', 'said'], ['boy', 'cut', 'eyebrow', 'fall', 'dublin', 'crèche', 'awarded', '€32,500'], ['breaking', 'minister', 'announces', '€1.2', 'billion', 'package', 'respond', 'no-deal', 'brexit'], ['live', 'the', 'brexit', 'budget', 'donohoe', 'presents', 'budget', '2020', 'dáil'], ['breaking', 'budget', '2020', 'boost', 'parents', 'early', 'learning', 'childcare', 'gets', '€54m'], ['exclusive', "'eight", '10', 'people', 'staying', 'one', 'bedroom', "'", '-', 'secret', 'footage', 'reveals', 'asylum', 'seekers', 'crammed', 'hotel'], ["'it", "'s", 'vanished', 'face', 'earth', "'", '-', 'son', "'s", 'appeal', 'help', 'finding', 'missing', 'mum'], ['watch', 'paschal', 'donohoe', "'s", 'budget'

1. We are now going to try and evaluate the topics that are coveredin the headlines. (Topic Modeling)
2. We can then look at which artilces have the topics. (Comparison)
3. Compile a list of the articles that have the topics. (Aggregation)
4. Run analysis on the articles to invesitgate which is the most unbiased. (Further Research Needed)

https://datascience.stackexchange.com/questions/23969/sentence-similarity-prediction

In [15]:
import gensim
import os

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

PATH_TO_WORD2VEC = os.path.expanduser("~/data/word2vec/GoogleNews-vectors-negative300.bin")
PATH_TO_GLOVE = os.path.expanduser("~/data/glove/glove.840B.300d.txt")

word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Stephen/data/word2vec/GoogleNews-vectors-negative300.bin'

In [5]:
# def tag_tokens(dictionary):
#     """ Compare the returned words.
#     If they match with any of them, increment that article by 1 score.
#     Order them by the scoring.
#     """
#     try:
#         for source, articles in dictionary.items():
#             tagged_tokens = []
#             for i in articles:
#                 i = [v.lower() for v in i]
#                 i = pos_tag(i)
#                 tagged_tokens.append(i)

#             dictionary[source] = tagged_tokens

#         return dictionary

#     except ValueError:
#         return "Error in data structure. \
#         Please see the input data type and ensure that the dictionary is Source : Articles[]. \
#         Then ensure that the values are tokenized"

# def getNouns(dictionary):
#     """ Remove all tokens that are not Nouns and return the dictionary
#     """

#     try:
#         for source, articles in dictionary.items():
#             clean_articles = []
#             for tokens in articles:
#                 nouns = []
#                 for token in tokens:
#                     if token[1] == "NN" or token[1] == "NNS" or token[1] == "VBG":
#                         nouns.append(token)

#                 clean_articles.append(nouns)

#             dictionary[source] = clean_articles

#         return dictionary

#     except:
#         return "Error, the nouns could not be identified. Check the articles"

# def sentenceSimilarities(data):
    
#     embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

#     tf.logging.set_verbosity(tf.logging.ERROR)

#     with tf.Session() as session:
#         session.run([tf.global_variables_initialiser(), tf.tables_initializer()])
#         embeddings = session.run(embed(data))

#     return np.array(embeddings).tolist()

In [6]:
# from collections import defaultdict, Counter

# def getOccurances(dictionary):

#     dict_word_count = defaultdict(int)

#     for source, articles in dictionary.items():
#         for article in articles:
#             for word in article:
#                 dict_word_count[word[0]] += 1

#     return dict_word_count

# def getStringArticles(dictionary):

#     article_list = []

#     for articles in dictionary.values():
#         for article in articles:
#             article = (article.replace('\n', ''))
#             article_list.append(article)
            
#     return article_list

# def compareArticles(dictionary):
#     pass
#     # list of articles that are tokenize
#     #
#     # for each item in the list compare