In [12]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import csv

In [13]:
# Set the limit for number of articles to download
LIMIT = 50

data = {}
data['newspapers'] = {}

# load newspaper
with open('company.json') as data_file:
    companies = json.load(data_file)

articles_array = []    
    
count = 1
# Iterate through each news company
for company, value in companies.items():
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            #"link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check if publish date is provided, if no the article is skipped.
            # This is done to keep consistency in the data and to keep the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download for some reason fails (ex. 404) the script will continue downloading
                    # the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                article['authors'] = content.authors
                article['top_image'] =  content.top_image
                article['movies'] = content.movies
                newsPaper['articles'].append(article)
                articles_array.append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if there is no found publish date the article will be skipped.

            article = {}
            article['title'] = content.title
            article['authors'] = content.authors
            article['text'] = content.text
            article['top_image'] =  content.top_image
            article['movies'] = content.movies
            article['link'] = content.url
            article['published'] = content.publish_date
            newsPaper['articles'].append(article)
            articles_array.append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            #noneTypeCount = 0
    count = 1
    data['newspapers'][company] = newsPaper
    
# Finally it saves the articles as a JSON-file.
try:
    f = csv.writer(open('Scraped_data_news_output.csv', 'w', encoding='utf-8'))
    f.writerow(['Title', 'Authors','Text','Image','Videos','Link','Published_Date'])
    #print(article)
    for artist_name in articles_array:
        title = artist_name['title']
        authors=artist_name['authors']
        text=artist_name['text']
        image=artist_name['top_image']
        video=artist_name['movies']
        link=artist_name['link']
        publish_date=artist_name['published']
        # Add each artist’s name and associated link to a row
        f.writerow([title, authors, text, image, video, link, publish_date])

except Exception as e: print(e)

Downloading articles from  HT
1 articles downloaded from HT , url:  https://www.hindustantimes.com/football/kolkata-in-race-to-host-india-qatar-world-cup-qualifier-in-march/story-VFQL7RbhSjpfv5a6X6MVLI.html
2 articles downloaded from HT , url:  https://www.hindustantimes.com/football/manchester-united-will-reinforce-squad-in-january-ole-gunnar-solskjaer/story-tH7dG6Wbx1kYf3C7ZYkeQI.html
3 articles downloaded from HT , url:  https://www.hindustantimes.com/football/arsenal-s-mesut-ozil-recalls-terrifying-car-jacking-ordeal/story-zzmRfs4lRq1BXxlLAin5kI.html
4 articles downloaded from HT , url:  https://www.hindustantimes.com/football/lionel-messi-wins-sixth-golden-shoe-award/story-pK7imiUxPxvKIeuETEtpsN.html
5 articles downloaded from HT , url:  https://www.hindustantimes.com/football/liverpool-head-to-united-with-widest-gulf-in-years/story-nN3vM5Rtfbxh0g2IcDtqxL.html
6 articles downloaded from HT , url:  https://www.hindustantimes.com/football/manchester-city-s-sergio-aguero-unhurt-after

In [16]:
import pandas as pd

In [17]:
df_articles = pd.read_csv('Scraped_data_news_output.csv')

In [18]:
df_articles

Unnamed: 0,Title,Authors,Text,Image,Videos,Link,Published_Date
0,Kolkata in race to host India-Qatar World Cup ...,['Indo Asian News Service'],"football\r\n\r\nUpdated: Oct 17, 2019 19:49 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/kolkat...,2019-10-17T02:19:56
1,Manchester United will reinforce squad in Janu...,[],"football\r\n\r\nUpdated: Oct 17, 2019 19:49 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/manche...,2019-10-17T02:19:51
2,Arsenal’s Mesut Ozil recalls terrifying car-ja...,['Agence France-Presse'],"football\r\n\r\nUpdated: Oct 17, 2019 17:33 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/arsena...,2019-10-17T12:03:32
3,Lionel Messi wins sixth Golden Shoe award,['Indo Asian News Service'],"football\r\n\r\nUpdated: Oct 17, 2019 13:19 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/lionel...,2019-10-17T07:49:40
4,Liverpool head to United with widest gulf in y...,[],"football\r\n\r\nUpdated: Oct 17, 2019 12:26 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/liverp...,2019-10-17T06:56:36
5,Manchester City’s Sergio Aguero unhurt after c...,[],"football\r\n\r\nUpdated: Oct 17, 2019 12:26 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/manche...,2019-10-17T06:56:34
6,Bulgaria police hold six over racist abuse at ...,[],"football\r\n\r\nUpdated: Oct 16, 2019 21:43 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/bulgar...,2019-10-16T04:13:16
7,Barcelona’s Lionel Messi receives record sixth...,[],"football\r\n\r\nUpdated: Oct 16, 2019 20:59 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/barcel...,2019-10-16T03:29:20
8,Real Madrid vs Barcelona: La Liga wants El Cla...,[],"football\r\n\r\nUpdated: Oct 16, 2019 18:10 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/real-m...,2019-10-16T12:40:18
9,"‘Bangladesh draw unfortunate, team will learn ...",['Indo Asian News Service'],"football\r\n\r\nUpdated: Oct 16, 2019 16:14 IS...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/bangla...,2019-10-16T10:44:41


In [19]:
df = df_articles.filter(["Title", "Text"])
df

Unnamed: 0,Title,Text
0,Kolkata in race to host India-Qatar World Cup ...,"football\r\n\r\nUpdated: Oct 17, 2019 19:49 IS..."
1,Manchester United will reinforce squad in Janu...,"football\r\n\r\nUpdated: Oct 17, 2019 19:49 IS..."
2,Arsenal’s Mesut Ozil recalls terrifying car-ja...,"football\r\n\r\nUpdated: Oct 17, 2019 17:33 IS..."
3,Lionel Messi wins sixth Golden Shoe award,"football\r\n\r\nUpdated: Oct 17, 2019 13:19 IS..."
4,Liverpool head to United with widest gulf in y...,"football\r\n\r\nUpdated: Oct 17, 2019 12:26 IS..."
5,Manchester City’s Sergio Aguero unhurt after c...,"football\r\n\r\nUpdated: Oct 17, 2019 12:26 IS..."
6,Bulgaria police hold six over racist abuse at ...,"football\r\n\r\nUpdated: Oct 16, 2019 21:43 IS..."
7,Barcelona’s Lionel Messi receives record sixth...,"football\r\n\r\nUpdated: Oct 16, 2019 20:59 IS..."
8,Real Madrid vs Barcelona: La Liga wants El Cla...,"football\r\n\r\nUpdated: Oct 16, 2019 18:10 IS..."
9,"‘Bangladesh draw unfortunate, team will learn ...","football\r\n\r\nUpdated: Oct 16, 2019 16:14 IS..."


In [20]:
for i in range(0,df.last_valid_index()+1):
    x = df.iloc[i]['Text']
    a = df.iloc[i]['Text'][43:(len(x)-41)]
    df.iloc[i]['Text'] = a

In [21]:
df

Unnamed: 0,Title,Text
0,Kolkata in race to host India-Qatar World Cup ...,\r\n\r\nThe iconic Salt Lake Stadium could onc...
1,Manchester United will reinforce squad in Janu...,\r\n\r\nManchester United will try to sign one...
2,Arsenal’s Mesut Ozil recalls terrifying car-ja...,\r\n\r\nMezut Ozil has opened up about being a...
3,Lionel Messi wins sixth Golden Shoe award,"\r\n\r\nLionel Messi, star striker of FC Barce..."
4,Liverpool head to United with widest gulf in y...,\r\n\r\nPremier League leaders Liverpool head ...
5,Manchester City’s Sergio Aguero unhurt after c...,\r\n\r\nManchester City striker Sergio Aguero ...
6,Bulgaria police hold six over racist abuse at ...,\r\n\r\nSix Bulgarians have been detained over...
7,Barcelona’s Lionel Messi receives record sixth...,\r\n\r\nBarcelona forward Lionel Messi picked ...
8,Real Madrid vs Barcelona: La Liga wants El Cla...,\r\n\r\nThe Spanish football league said Wedne...
9,"‘Bangladesh draw unfortunate, team will learn ...",\r\n\r\nFormer India captain Bhaichung Bhutia ...


In [26]:
from pycorenlp import StanfordCoreNLP
import json, string
nlp=StanfordCoreNLP('http://localhost:9000')

In [29]:
text_input = 'this is a test.'
print('text_input: {0}'.format(text_input))
text_output = nlp.annotate(text_input, properties={
                    'annotators': 'tokenize,ssplit',
                    'outputFormat': 'json'
                })
print('text_output: {0}'.format(text_output))



text_input: this is a test.
text_output: {'sentences': [{'index': 0, 'tokens': [{'index': 1, 'word': 'this', 'originalText': 'this', 'characterOffsetBegin': 0, 'characterOffsetEnd': 4, 'before': '', 'after': ' '}, {'index': 2, 'word': 'is', 'originalText': 'is', 'characterOffsetBegin': 5, 'characterOffsetEnd': 7, 'before': ' ', 'after': ' '}, {'index': 3, 'word': 'a', 'originalText': 'a', 'characterOffsetBegin': 8, 'characterOffsetEnd': 9, 'before': ' ', 'after': ' '}, {'index': 4, 'word': 'test', 'originalText': 'test', 'characterOffsetBegin': 10, 'characterOffsetEnd': 14, 'before': ' ', 'after': ''}, {'index': 5, 'word': '.', 'originalText': '.', 'characterOffsetBegin': 14, 'characterOffsetEnd': 15, 'before': '', 'after': ''}]}]}


In [37]:

def lemmatize_corenlp(conn_nlp, sentence):
    props = {
        'annotators': 'pos,lemma,tokenize,ssplit',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }

    # tokenize into words
    #sents = conn_nlp.word_tokenize(sentence)

    # remove punctuations from tokenised list
    sents_no_punct = [s for s in sentence if s not in string.punctuation]

    # form sentence
    sentence2 = " ".join(sents_no_punct)

    # annotate to get lemma
    parsed_str = conn_nlp.annotate(sentence2, properties=props)
    parsed_dict = json.loads(parsed_str)    # contains the text which is annotated
    
    # extract the lemma for each word
    lemma_list = [v 
                  for d in parsed_dict['sentences'][0]['tokens'] 
                      for k,v in d.items() 
                          if k == 'lemma'
                 ]

    # form sentence and return it
    return " ".join(lemma_list)


In [38]:
# make the connection and call `lemmatize_corenlp`
#nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

In [39]:
# output for lemmatizing only
for i in range(0, df.last_valid_index()+1):
    df.iloc[i]['Title'] = lemmatize_corenlp(conn_nlp=nlp, sentence=df.iloc[i]['Title'])
    df.iloc[i]['Text'] = lemmatize_corenlp(conn_nlp=nlp, sentence=df.iloc[i]['Text'])

TypeError: the JSON object must be str, bytes or bytearray, not dict