In [1]:
# Inshorts API : https://github.com/pari08tosh/Inshorts-API
# Collect Data and save to file
import requests
import re
import math
import operator

from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
stop = stopwords.words('english')

def _extractField(extractor):
    """Run ``extractor`` and return its value, or None when the expected markup is absent.

    A missing tag makes ``find`` return None, which surfaces as AttributeError
    (``None.text``) or TypeError (``None['style']``, ``str + None``); a missing
    attribute or an unexpected ``style`` format surfaces as KeyError/IndexError.
    All of those mean "field not available" rather than a programming error.
    """
    try:
        return extractor()
    except (AttributeError, TypeError, KeyError, IndexError):
        return None


def getNews(category):
    """Scrape the Inshorts "read" page of ``category`` and return its news cards.

    Parameters
    ----------
    category : str
        Path segment appended to ``https://www.inshorts.com/en/read/``.

    Returns
    -------
    dict
        ``{'success': bool, 'category': category, 'data': [...]}``. Each item
        of ``data`` is a dict with the keys ``title``, ``imageUrl``, ``url``,
        ``content``, ``author``, ``date``, ``time`` and ``readMoreUrl``; a
        field is None when it cannot be extracted from the card. When the
        request fails or the page has no news cards, ``success`` is False and
        an ``errorMessage`` key is added.
    """
    newsDictionary = {
        'success': True,
        'category': category,
        'data': []
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        htmlBody = requests.get('https://www.inshorts.com/en/read/' + category,
                                timeout=30)
    except requests.exceptions.RequestException as e:
        newsDictionary['success'] = False
        # Python 3 exceptions have no ``.message`` attribute; str(e) is portable.
        newsDictionary['errorMessage'] = str(e)
        return newsDictionary

    soup = BeautifulSoup(htmlBody.text, 'lxml')
    newsCards = soup.find_all(class_='news-card')
    if not newsCards:
        newsDictionary['success'] = False
        newsDictionary['errorMessage'] = 'Invalid Category'
        return newsDictionary

    for card in newsCards:
        newsObject = {
            'title': _extractField(
                lambda: card.find(class_='news-card-title').find('a').text),
            # The image URL is the quoted part of the inline ``style`` attribute.
            'imageUrl': _extractField(
                lambda: card.find(class_='news-card-image')['style'].split("'")[1]),
            'url': _extractField(
                lambda: 'https://www.inshorts.com'
                + card.find(class_='news-card-title').find('a').get('href')),
            'content': _extractField(
                lambda: card.find(class_='news-card-content').find('div').text),
            'author': _extractField(lambda: card.find(class_='author').text),
            # ``class_`` (not ``clas``) is the keyword that filters on the CSS class.
            'date': _extractField(lambda: card.find(class_='date').text),
            'time': _extractField(lambda: card.find(class_='time').text),
            'readMoreUrl': _extractField(
                lambda: card.find(class_='read-more').find('a').get('href')),
        }

        newsDictionary['data'].append(newsObject)
    return newsDictionary

### Categories on Inshorts 
# '' (blank) returns the top news from all categories; national (Indian national news), business, sports,
# world, politics, technology, startup, entertainment, miscellaneous, hatke (unconventional), science, automobile

# Load the articles collected by earlier runs; start with an empty frame the
# first time, when the CSV has not been written yet.
try:
    result_df = pd.read_csv("inshorts_data.csv")
except FileNotFoundError:
    result_df = pd.DataFrame(columns=['title', 'content'])

category_list = ["",  # blank path segment -> top news from all categories
                "national",
                "business",
                "sports",
                "world",
                "politics",
                "technology",
                "startup",
                "entertainment",
                "miscellaneous",
                "hatke",
                "science",
                "automobile"]

# A set gives constant-time duplicate checks on the article title.
seen_titles = set(result_df['title'].tolist())
new_rows = []
for cat in category_list:
    result_news = getNews(cat)
    if not result_news['success']:
        # Report the failure instead of silently contributing nothing.
        print("Skipping category %r: %s" % (cat, result_news.get('errorMessage')))
        continue
    for article in result_news['data']:
        title = article['title']
        # A card without a title cannot be de-duplicated, so it is not stored.
        if title is None or title in seen_titles:
            continue
        new_rows.append({'title': title, 'content': article['content']})
        seen_titles.add(title)

# Append all new articles in one step rather than growing the frame row by row.
if new_rows:
    result_df = pd.concat([result_df, pd.DataFrame(new_rows)], ignore_index=True)

# Kept so that any cell relying on these module-level names still finds them.
k = result_df.shape[0]
title_list = result_df['title'].tolist()

print(result_df.shape)
result_df.to_csv("inshorts_data.csv", index=False)

(781, 2)


In [2]:
## Data Cleaning
## https://medium.com/@acrosson/summarize-documents-using-tf-idf-bdee8f60b71
## https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8

def clean_document(document):
    """Normalise raw article text before sentence tokenising.

    Every run of characters other than ASCII letters, space, period and
    hyphen is replaced by a single space (digits are therefore removed too).
    Hyphens and ``...`` are then deleted, the period is dropped from ``Mr.``
    and ``Mrs.``, acronyms are merged via ``merge_acronyms`` and runs of
    whitespace are collapsed. Returns the cleaned string.
    """
    # Keep only letters, spaces, periods and hyphens.
    text = re.sub('[^A-Za-z .-]+', ' ', document)

    # Applied in this exact order: hyphens, ellipses, then honorific periods.
    for old, new in (('-', ''), ('...', ''), ('Mr.', 'Mr'), ('Mrs.', 'Mrs')):
        text = text.replace(old, new)

    # Merge acronyms (M.I.T. -> MIT) so their periods are not taken for
    # sentence boundaries.
    text = merge_acronyms(text)

    # Collapse any run of whitespace into a single space.
    return ' '.join(text.split())

def merge_acronyms(s):
    """Merge dotted acronyms in ``s``, for example ``M.I.T.`` -> ``MIT``.

    An acronym is a run of ``<capital letter>.`` pairs that starts at the
    beginning of the string or right after whitespace or a period. Only the
    matched spans are rewritten, so a period that follows an ordinary word
    (``USA.``) is left alone even if the same letter occurs as an initial
    elsewhere in the text. Returns the rewritten string.
    """
    # ``^`` lets an acronym at the very start of the text match as a whole;
    # the look-behind covers every later position.
    pattern = re.compile(r'(?:(?:^|(?<=[.\s]))[A-Z]\.)+')
    # Substituting per match (instead of str.replace on the whole string)
    # keeps the edit confined to the spans the pattern actually matched.
    return pattern.sub(lambda match: match.group(0).replace('.', ''), s)

def remove_stop_words(document):
    """Return ``document`` with every stop word dropped.

    The text is split on whitespace and each token is compared, exactly as
    written, against the module-level ``stop`` list; the surviving tokens are
    joined back with single spaces.

    NOTE(review): the comparison is case-sensitive, so presumably capitalised
    forms such as "The" survive if ``stop`` holds only lowercase entries --
    confirm whether that is intended.
    """
    kept = []
    for token in document.split():
        if token in stop:
            continue
        kept.append(token)
    return ' '.join(kept)

In [3]:
## Use TF-IDF to score the words in each sentence, then rank the sentences by importance.
def sent_word_frequency(sentence):
    """Return the relative frequency of each lower-cased token in ``sentence``.

    The sentence is tokenised with ``word_tokenize``; every occurrence of a
    token adds ``1 / <number of tokens>`` to that token's entry. An input
    that yields no tokens produces an empty dict.
    """
    tokens = word_tokenize(sentence)
    total = len(tokens)
    frequencies = {}
    for token in tokens:
        lowered = token.lower()
        # Accumulate one share per occurrence.
        frequencies[lowered] = frequencies.get(lowered, 0) + 1/total
    return frequencies

def create_tf_dict(document):
    """Build the per-sentence term-frequency table for ``document``.

    ``document`` is a sequence of sentences. The result is a list with one
    dict per sentence: ``sent_id`` is the 1-based position of the sentence
    and ``sent_dict`` is the mapping returned by ``sent_word_frequency``.
    """
    return [
        {"sent_id": position, "sent_dict": sent_word_frequency(sentence)}
        for position, sentence in enumerate(document, start=1)
    ]

def create_idf_dict(document, term_freq_list):
    """Compute an inverse-document-frequency entry for every term occurrence.

    Parameters
    ----------
    document : sequence
        The sentences; only its length is used, as the total sentence count.
    term_freq_list : list of dict
        Entries with a ``sent_dict`` mapping of term -> frequency, one per
        sentence.

    Returns
    -------
    list of dict
        One ``{"IDF_score", "key", "sent_id"}`` dict per (sentence, term)
        pair, in sentence order. ``IDF_score`` is
        ``log(len(document) / number of sentences containing the term)`` and
        ``sent_id`` is the 0-based index of the entry in ``term_freq_list``.
        A term found in several sentences therefore appears several times
        with the same score.
    """
    total_sentences = len(document)

    # Count, in a single pass, how many sentences contain each term instead
    # of rescanning every sentence for every term occurrence.
    sentence_count = {}
    for entry in term_freq_list:
        for term in entry['sent_dict']:
            sentence_count[term] = sentence_count.get(term, 0) + 1

    idf_list = []
    for index, entry in enumerate(term_freq_list):
        for term in entry['sent_dict']:
            idf_list.append({
                "IDF_score": math.log(total_sentences / sentence_count[term]),
                "key": term,
                "sent_id": index,
            })
    return idf_list

def rank_sentences(freq_list, idf_list, top_n , com_document):
    """Score sentences by TF-IDF, print the best ``top_n`` and return the ranking.

    Parameters
    ----------
    freq_list : list of dict
        One entry per sentence with a ``sent_dict`` mapping of term -> term
        frequency.
    idf_list : list of dict
        Entries with ``key`` (term) and ``IDF_score``. The same term may be
        listed more than once; it still contributes its IDF only once per
        sentence (the last listed score is used).
    top_n : int
        Number of top-ranked sentences to print.
    com_document : sequence of str
        The sentence texts, aligned by position with ``freq_list``.

    Returns
    -------
    list of tuple
        ``(sentence, score)`` pairs sorted by descending score. Sentences
        with identical text share a single entry.
    """
    # Collapse the IDF list to one score per term. Scanning the full list
    # for each term would add the IDF once per listed occurrence, inflating
    # the weight of terms that are listed several times.
    idf_by_term = {entry['key']: entry['IDF_score'] for entry in idf_list}

    sent_tfidf = dict()
    for i, freq in enumerate(freq_list):
        score = 0
        for term, tf in freq['sent_dict'].items():
            # A term without an IDF entry contributes nothing.
            score = score + tf * idf_by_term.get(term, 0)
        sent_tfidf[com_document[i]] = score

    sorted_sent_tfidf = sorted(sent_tfidf.items(), key=lambda item: -item[1])

    print("\n".join(sentence for sentence, _ in sorted_sent_tfidf[:top_n]))
    print("-------------")
    return(sorted_sent_tfidf)

# Concatenate the content of the first seven stored articles into one text.
document = " ".join(result_df.loc[row, 'content'] for row in range(7))
print(document)
cleaned_document = clean_document(document)

# Sentences are ranked on their stop-word-free form but reported verbatim.
sent_text = sent_tokenize(cleaned_document)
filtered_text = [remove_stop_words(sentence) for sentence in sent_text]

# The term-frequency table feeds both the IDF step and the ranking.
term_freqs = create_tf_dict(filtered_text)
summary_1 = rank_sentences(term_freqs,
               create_idf_dict(filtered_text, term_freqs),
               2,
               sent_text)

A Pakistani prisoner,  identified as Shakir Ullah, was allegedly killed by other inmates in Rajasthan's Jaipur Central Jail following a quarrel, Jaipur Jail IG Rupinder Singh said on Wednesday. Senior police officials along with forensic experts have reached the jail, reports said. The deceased prisoner, who was lodged in the jail since 2011, was serving life sentence. Indian Super League side Bengaluru FC has offered to play an exhibition match against Real Kashmir in Srinagar to support the I-League debutants after I-League defending champions Minerva FC pulled out of their match, citing security concerns. Real Kashmir thanked Bengaluru FC and invited them in March for a friendly fixture, assuring them "the most electric football atmosphere." A youth in Rajasthan's Bikaner has tattooed the names of 71 martyred soldiers, including the ones martyred in the recent Pulwama attack, on his body to pay homage to the soldiers. The youth said he hopes this will increase the respect for these 

In [None]:
## TF-IDF method, but using a library






In [74]:
# How to evaluate your summary? ROUGE (recall-oriented) and BLEU (precision-oriented)
# https://github.com/pltrdy/rouge
!pip install rouge
from rouge import Rouge

def find_rouge(hypothesis, reference):
    """Return the scores ``Rouge().get_scores`` reports for ``hypothesis`` against ``reference``."""
    return Rouge().get_scores(hypothesis, reference)

/bin/sh: 1: pip: not found


ModuleNotFoundError: No module named 'rouge'