In [15]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import csv

# Maximum number of articles to download per news company
# (the counter below is reset after every company).
LIMIT = 50

# Per-company results: data['newspapers'][company] -> {"rss"/"link": ..., "articles": [...]}
data = {}
data['newspapers'] = {}

# Load the companies to scrape. Each entry is read below as a dict that holds
# either an 'rss' feed URL or a 'link' to the site's front page.
with open('company.json') as data_file:
    companies = json.load(data_file)

# Flat list of every downloaded article across all companies (used for the CSV export).
articles_array = []

count = 1
# Iterate through each news company
for company, value in companies.items():
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "articles": []
        }
        for entry in d.entries:
            # Skip entries without a *parsed* publish date. Checking only for the raw
            # 'published' string is not enough: feedparser can leave 'published_parsed'
            # unset/None when it cannot parse the date, which would crash below.
            date = getattr(entry, 'published_parsed', None)
            if date is None:
                continue
            if count > LIMIT:
                break
            article = {}
            article['link'] = entry.link
            # Build the timestamp straight from the struct_time fields. Going through
            # mktime()/fromtimestamp() would reinterpret the UTC struct as local time
            # and shift the result by an hour whenever DST is in effect.
            article['published'] = datetime(*date[:6]).isoformat()
            try:
                content = Article(entry.link)
                content.download()
                content.parse()
            except Exception as e:
                # If the download for some reason fails (ex. 404) the script will continue downloading
                # the next article.
                print(e)
                print("continuing...")
                continue
            article['title'] = content.title
            article['text'] = content.text
            article['authors'] = content.authors
            article['top_image'] = content.top_image
            article['movies'] = content.movies
            newsPaper['articles'].append(article)
            articles_array.append(article)
            print(count, "articles downloaded from", company, ", url: ", entry.link)
            count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue

            # Articles without a detected publish date are kept; their
            # 'published' field is simply left empty.
            published = content.publish_date
            article = {}
            article['title'] = content.title
            article['authors'] = content.authors
            article['text'] = content.text
            article['top_image'] = content.top_image
            article['movies'] = content.movies
            article['link'] = content.url
            # Store the date as an ISO-8601 string, matching the RSS branch above.
            article['published'] = published.isoformat() if hasattr(published, 'isoformat') else published
            newsPaper['articles'].append(article)
            articles_array.append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
    # Restart the per-company counter and record this company's articles.
    count = 1
    data['newspapers'][company] = newsPaper
    
# Finally, save the scraped articles to a CSV file (one row per article).
try:
    # The `with` block guarantees the file is flushed and closed before any later
    # cell reads it back; newline='' is required by the csv module so that no
    # blank rows are inserted on platforms that translate line endings.
    with open('Scraped_data_news_output.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Title', 'Authors', 'Text', 'Image', 'Videos', 'Link', 'Published_Date'])
        for scraped in articles_array:
            # Column order must match the header row written above.
            writer.writerow([
                scraped['title'],
                scraped['authors'],
                scraped['text'],
                scraped['top_image'],
                scraped['movies'],
                scraped['link'],
                scraped['published'],
            ])
except (OSError, csv.Error) as e:
    # Best effort: report the failure but keep the notebook running.
    print("Could not write Scraped_data_news_output.csv:", e)

Downloading articles from  HT
1 articles downloaded from HT , url:  https://www.hindustantimes.com/football/denmark-edge-switzerland-in-euro-qualifiers-after-ireland-draw/story-DF9Pl1zxcGTxgGQrEU5N7L.html
2 articles downloaded from HT , url:  https://www.hindustantimes.com/football/italy-clinch-place-at-euro-2020-as-spain-made-to-wait/story-Pz74p46GQdxeXHagkq9bdO.html
3 articles downloaded from HT , url:  https://www.hindustantimes.com/football/exclusive-bayern-legend-klaus-augenthaler-talks-about-money-in-football-thomas-mueller-situation-and-much-more/story-382Zmd0wRJAYt2mQXdfpHO.html
4 articles downloaded from HT , url:  https://www.hindustantimes.com/football/sunil-chhetri-is-the-most-dangerous-player-bangladesh-coach-wary-of-indian-captain-s-threat/story-XaNJQq8VmYOIlYafzjUQDK.html
5 articles downloaded from HT , url:  https://www.hindustantimes.com/football/stimac-names-23-member-india-squad-for-world-cup-qualifiers-against-bangladesh/story-3vJzxtrc9639uajmDqaLFM.html
6 articles 

In [16]:
import pandas as pd

In [17]:
# Load the scraped articles back from the CSV file written by the scraping cell.
df_articles = pd.read_csv('Scraped_data_news_output.csv')

In [18]:
# Display the loaded articles for a visual sanity check.
df_articles

Unnamed: 0,Title,Authors,Text,Image,Videos,Link,Published_Date
0,Denmark edge Switzerland in Euro qualifiers af...,[],"football\n\nUpdated: Oct 13, 2019 08:20 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/denmar...,2019-10-13T02:50:45
1,Italy clinch place at Euro 2020 as Spain made ...,[],"football\n\nUpdated: Oct 13, 2019 08:00 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/italy-...,2019-10-13T02:30:00
2,EXCLUSIVE: Bayern legend Klaus Augenthaler tal...,['Bihan Sengupta'],"football\n\nUpdated: Oct 12, 2019 23:09 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/exclus...,2019-10-12T05:39:17
3,"‘Sunil Chhetri is the most dangerous player,’ ...",['Press Trust Of India'],"football\n\nUpdated: Oct 12, 2019 17:56 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/sunil-...,2019-10-12T12:26:09
4,Stimac names 23-member India squad for World C...,['Press Trust Of India'],"football\n\nUpdated: Oct 12, 2019 17:05 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/stimac...,2019-10-12T11:35:14
5,‘It is our right to watch a football match’,['Hoda Hashemi'],"football\n\nUpdated: Oct 12, 2019 09:24 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/it-is-...,2019-10-12T03:54:08
6,Euro 2020: Cristiano Ronaldo edges closer to c...,[],"football\n\nUpdated: Oct 12, 2019 08:16 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/euro-2...,2019-10-12T02:46:05
7,"‘Private jets left, right and centre’: Roberto...",['Ht Correspondent'],"football\n\nUpdated: Oct 11, 2019 16:25 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/privat...,2019-10-11T10:55:52
8,‘I really missed being on the pitch’: Jeje Lal...,['Indo Asian News Service'],"football\n\nUpdated: Oct 11, 2019 16:17 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/i-real...,2019-10-11T10:47:08
9,Iran women freely attend football match for fi...,[],"football\n\nUpdated: Oct 11, 2019 08:19 IST\n\...",https://www.hindustantimes.com/rf/image_size_9...,[],https://www.hindustantimes.com/football/iran-w...,2019-10-11T02:49:50


In [19]:
# Keep only the two columns used by the text-processing steps: title and body.
df = df_articles.filter(["Title", "Text"])
df

Unnamed: 0,Title,Text
0,Denmark edge Switzerland in Euro qualifiers af...,"football\n\nUpdated: Oct 13, 2019 08:20 IST\n\..."
1,Italy clinch place at Euro 2020 as Spain made ...,"football\n\nUpdated: Oct 13, 2019 08:00 IST\n\..."
2,EXCLUSIVE: Bayern legend Klaus Augenthaler tal...,"football\n\nUpdated: Oct 12, 2019 23:09 IST\n\..."
3,"‘Sunil Chhetri is the most dangerous player,’ ...","football\n\nUpdated: Oct 12, 2019 17:56 IST\n\..."
4,Stimac names 23-member India squad for World C...,"football\n\nUpdated: Oct 12, 2019 17:05 IST\n\..."
5,‘It is our right to watch a football match’,"football\n\nUpdated: Oct 12, 2019 09:24 IST\n\..."
6,Euro 2020: Cristiano Ronaldo edges closer to c...,"football\n\nUpdated: Oct 12, 2019 08:16 IST\n\..."
7,"‘Private jets left, right and centre’: Roberto...","football\n\nUpdated: Oct 11, 2019 16:25 IST\n\..."
8,‘I really missed being on the pitch’: Jeje Lal...,"football\n\nUpdated: Oct 11, 2019 16:17 IST\n\..."
9,Iran women freely attend football match for fi...,"football\n\nUpdated: Oct 11, 2019 08:19 IST\n\..."


In [20]:
# Strip the boilerplate that surrounds every scraped article body.
# The leading 43 characters are the "football\n\nUpdated: <date> IST\n\n" banner
# visible in the table above; the trailing 41 characters are presumably a fixed
# footer (TODO confirm). Both widths are specific to this feed's layout.
HEADER_CHARS = 43
FOOTER_CHARS = 41

# Vectorised slice assigned to the whole column. The previous row-by-row
# `df.iloc[i]['Text'] = ...` was chained assignment, which may write to a
# temporary copy and leave `df` unchanged; `.str` also passes missing text through
# as NaN instead of raising.
df['Text'] = df['Text'].str[HEADER_CHARS:-FOOTER_CHARS]

In [21]:
df

Unnamed: 0,Title,Text
0,Denmark edge Switzerland in Euro qualifiers af...,Kasper Schmeichel produced a series of excelle...
1,Italy clinch place at Euro 2020 as Spain made ...,Italy became the second team to qualify for Eu...
2,EXCLUSIVE: Bayern legend Klaus Augenthaler tal...,Klaus Augenthaler represents a footballing cla...
3,"‘Sunil Chhetri is the most dangerous player,’ ...","No stranger to Indian football, Bangladesh ass..."
4,Stimac names 23-member India squad for World C...,Head coach Igor Stimac on Saturday named a 23-...
5,‘It is our right to watch a football match’,"In 90 minutes, we Iranian women changed 40 yea..."
6,Euro 2020: Cristiano Ronaldo edges closer to c...,Cristiano Ronaldo moved to within six goals of...
7,"‘Private jets left, right and centre’: Roberto...",Former Real Madrid star Roberto Carlos reveale...
8,‘I really missed being on the pitch’: Jeje Lal...,At a time when the Indian football team is cri...
9,Iran women freely attend football match for fi...,"Waving flags and snapping selfies, thousands o..."


In [22]:
from stanfordcorenlp import StanfordCoreNLP
import json, string

def lemmatize_corenlp(conn_nlp, sentence):
    """Lemmatize a piece of text with a CoreNLP connection.

    The text is tokenized, punctuation tokens are dropped, and the remaining
    words are annotated by CoreNLP; the lemmas of all resulting sentences are
    joined with single spaces.

    Parameters
    ----------
    conn_nlp : object
        Connection exposing ``word_tokenize(text)`` and
        ``annotate(text, properties=...)``; ``annotate`` must return a JSON string.
    sentence : str
        Text to lemmatize. It may contain several sentences.

    Returns
    -------
    str
        Space-separated lemmas, or an empty string when nothing is left to
        annotate or the annotator returns no sentences.
    """
    props = {
        'annotators': 'pos,lemma',
        'pipelineLanguage': 'en',
        'outputFormat': 'json'
    }

    # tokenize into words
    tokens = conn_nlp.word_tokenize(sentence)

    # remove punctuation tokens from the tokenised list
    words = [tok for tok in tokens if tok not in string.punctuation]
    if not words:
        # Nothing to annotate; avoid a pointless round trip to the server.
        return ""

    # annotate to get lemma
    parsed_str = conn_nlp.annotate(" ".join(words), properties=props)
    parsed_dict = json.loads(parsed_str)    # contains the text which is annotated

    # Collect lemmas from EVERY sentence the annotator produced. Reading only
    # sentences[0] silently discarded everything after the first sentence and
    # raised IndexError when no sentence was returned.
    lemma_list = [
        token['lemma']
        for annotated in parsed_dict.get('sentences', [])
        for token in annotated['tokens']
        if 'lemma' in token
    ]

    # form sentence and return it
    return " ".join(lemma_list)


In [23]:
# Open the connection to the CoreNLP server at http://localhost, port 9000.
# The resulting `nlp` object is what gets passed to `lemmatize_corenlp` as `conn_nlp`.
# NOTE(review): the server presumably has to be started separately before this
# cell runs, and the timeout is presumably in milliseconds — confirm.
nlp = StanfordCoreNLP('http://localhost', port=9000, timeout=30000)

In [24]:
# Lemmatize the title and body of every article.
# Whole-column assignment replaces the row-by-row `df.iloc[i][col] = ...`, which was
# chained assignment and may write to a temporary copy, leaving `df` unchanged.
df['Title'] = df['Title'].map(lambda title: lemmatize_corenlp(conn_nlp=nlp, sentence=title))
df['Text'] = df['Text'].map(lambda body: lemmatize_corenlp(conn_nlp=nlp, sentence=body))