## Briefly


### __ Problem Statement __
- Obtain news from google news articles
- Sammarize the articles within 60 words
- Obtain keywords from the articles










##### Importing all the necessary libraries required to run the following code 

In [1]:
from gnewsclient import gnewsclient   # for fetching google news
from newspaper import Article         # to obtain text from news articles
from transformers import pipeline     # to summarize text
import spacy                          # for named entity recognition
import spacy.displacy as displacy     # display keywords

##### Load sshleifer/distilbart-cnn-12-6 model

In [2]:
def load_model():                           
    model = pipeline('summarization')
    return model
data = gnewsclient.NewsClient(max_results=0)
nlp = spacy.load("en_core_web_lg") 

##### Obtain urls and it's content

In [3]:
def getNews(topic,location):                
    count=0
    contents=[]
    titles=[]
    authors=[]
    urls=[]
    data = gnewsclient.NewsClient(language='english',location=location,topic=topic,max_results=10) 
    news = data.get_news()  
    for item in news:
        url=item['link']
        article = Article(url)
        try:
            article.download()
            article.parse()
            temp=item['title'][::-1]
            index=temp.find("-")
            temp=temp[:index-1][::-1]
            urls.append(url)
            contents.append(article.text)
            titles.append(item['title'][:-index-1])    
            authors.append(temp)
            count+=1
            if(count==5):
                break
        except:
            continue 
    return contents,titles,authors,urls         

##### Summarizes the content- minimum word limit 30 and maximum 60

In [4]:
def getNewsSummary(contents,summarizer):   
    summaries=[]     
    for content in contents:
        minimum=len(content.split())
        summaries.append(summarizer(content,max_length=60,min_length=min(30,minimum),do_sample=False,truncation=True)[0]['summary_text'])   
    return summaries

##### Named Entity Recognition

In [5]:
# Obtain 4 keywords from content (person,organisation or geopolitical entity) 
def generateKeyword(contents):            
    keywords=[]
    words=[]      
    labels=["PERSON","ORG","GPE"]
    for content in contents:
        doc=nlp(content)
        keys=[]
        limit=0
        for ent in doc.ents:
            key=ent.text.upper()
            label=ent.label_
            if(key not in words and key not in keywords and label in labels): 
                keys.append(key)
                limit+=1
                for element in key.split():
                    words.append(element)
            if(limit==4):
                keywords.append(keys)
                break                           
    return keywords
  

##### Displaying keywords 

In [6]:
def printKeywords(keywords):
    for keyword in keywords:
        print(keyword)

##### Displaying the Summary with keywords in it highlighted

In [7]:
def printSummary(summaries,titles):
    colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    options = {"ents": ["PERSON","ORG","GPE","NORP","PERCENT"],"colors": colors} 
    for summary,title in zip(summaries,titles):
        doc=nlp(summary) 
        print('\033[1m' + title.upper() + '\033[0m\n')
        displacy.render(doc, style="ent", options=options,jupyter=True)
        print("\n\n")

In [8]:
summarizer=load_model() 

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


In [9]:
contents,titles,authors,urls=getNews("Sports","India")

In [10]:
summaries=getNewsSummary(contents,summarizer)

In [11]:
keywords=generateKeyword(contents)

In [12]:
printKeywords(keywords)

['DWAYNE BRAVO', 'SRI LANKA', 'ICC', 'THE WEST INDIES']
['VIRAT KOHLI', 'INDIA', 'SCOTLAND', 'SUPER 12']
['AUSTRALIA', 'AFGHANISTAN', 'CRICKET AUSTRALIA', 'CRICBUZZ STAFF •']
['GARY STEAD', 'TRENT BOULT', 'COLIN DE GRANDHOMME', 'BLACKCAPS']
["VIRAT KOHLI'S", 'TEAM INDIA', 'DHONI', 'UAE']


In [13]:
printSummary(summaries,titles)

[1mT20 WORLD CUP 2021: WEST INDIES AND CHENNAI SUPER KINGS ALL-ROUNDER DWAYNE BRAVO TO RETIRE AFTER SHOWPIECE... [0m






[1mHAPPY BIRTHDAY VIRAT KOHLI: INDIAN CRICKET TEAM CAPTAIN TURNS 33 [0m






[1mONE-OFF TEST VS AFGHANISTAN POSTPONED, CONFIRMS CRICKET AUSTRALIA | CRICBUZZ.COM - CRICBUZZ [0m






[1mNEW ZEALAND INCLUDE FIVE SPINNERS FOR INDIA TOUR, TRENT BOULT OPTS OUT CITING BUBBLE FATIGUE [0m






[1m‘THERE ARE MANY CANDIDATES BUT HE’S THE BEST': SEHWAG PICKS NEXT INDIA CAPTAIN AFTER KOHLI STEPS DOWN AT END OF T20 WC [0m






