In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re

# spacy for lemmatization
import spacy
import nltk
from nltk.util import ngrams   
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import collections
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from heapq import nlargest

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [42]:
newsarticles = pd.read_csv('news_articles.csv')
newsarticles.shape

(180, 6)

In [43]:
newsarticles.head()

Unnamed: 0.1,Unnamed: 0,Title,date,Article,Category,PageLinks
0,0,"For some, the pandemic is an opportunity to st...","September 6, 2020","By six every morning, Kinchu Lhamo Bhutia is o...",National,https://kathmandupost.com/national/2020/09/06/...
1,1,Smugglers in Dadeldhura haphazardly fell trees...,"September 6, 2020",Hundreds of trees have been felled down on the...,National,https://kathmandupost.com/sudurpaschim-provinc...
2,2,Around 40 percent government staff haven’t rec...,"September 6, 2020",The Rautahat District Treasury Comptroller’s O...,National,https://kathmandupost.com/national/2020/09/06/...
3,3,Relief distribution programme in Birgunj fails...,"September 6, 2020","On Thursday, many residents of Ranighat in Bir...",National,https://kathmandupost.com/province-no-2/2020/0...
4,4,Job schemes come into question as virus-induce...,"September 6, 2020","Last week, when a group of Nepalis were crossi...",National,https://kathmandupost.com/national/2020/09/06/...


In [44]:
newsarticles.drop('Unnamed: 0', axis = 1, inplace=True) # drop column Unnamed: 0

In [59]:
# identifying news categories
News_Category = list(newsarticles.Category.unique())
News_Category

['National',
 'Politics',
 'Valley',
 'Opinion',
 'Money',
 'Sports',
 'Health',
 'Food',
 'Science & Technology']

So there are 9 news categories

In [45]:
data = newsarticles.copy()

In [46]:
# preprocessing to remove unwnated noises from text

def preprocess(text):
    data = text.lower()
    
    # remove new line characters if any
    data = re.sub('\s+', ' ', data)
    
    # remove distracting single quotes
    data = re.sub("\'", "", data)
    
    # removving the square brackets, texts inside parenthesis
    data = re.sub('\[[^]]*\]', '', data)
    data = re.sub(r'\([^)]*\)', '', data)
    
    # removing special characters
    data = re.sub(r'[^a-zA-z0-9\s]','', data)
    
    return data
    

In [47]:
%%time
data['Article'] = data['Article'].apply(preprocess)

Wall time: 286 ms


In [48]:
# Lets see if the processing above have done its work or not
data['Article'][0]

'by six every morning kinchu lhamo bhutia is out of her house to buy chicken and buffalo meat buying the meat herself is one of her quality control measures for her business kinchus kitchen which sells frozen momos and buff and chicken pickles for a long time kinchus kitchen was just an idea in bhutias head bhutia a native of sikkim moved to kathmandu after getting married she had always wanted to start a business focusing on serving the kind of sikkimese food that she grew up eating but between helping her husbands pashmina business in kathmandu and raising her children she had very little time to do anything else as the nation went into a lockdown in march the familys pashmina shop in thamel had to be shut and bhutia found herself free time on april 16 nearly a month into the lockdown bhutia launched kinchus kitchen not long after launching her business it took off and since then kinchus kitchen has sold thousands of plates of momos and hundreds of bottles of meat pickles the pandemi

Ok. our preprocessing function have done its work

In [49]:
# tokenizing, removing stopwords and short words
stop_words = stopwords.words('english')
def more_preprocess(text):
    tokens = [w for w in text.split() if not w in stop_words]
    long_words = []
    for i in tokens:
        if len(i) >= 3:
            long_words.append(i)
    return (" ".join(long_words))

In [50]:
%%time
data['Article'] = data['Article'].apply(more_preprocess)

Wall time: 789 ms


In [51]:
# checking to see if stop words have been removed or not. 
data['Article'][0]

'six every morning kinchu lhamo bhutia house buy chicken buffalo meat buying meat one quality control measures business kinchus kitchen sells frozen momos buff chicken pickles long time kinchus kitchen idea bhutias head bhutia native sikkim moved kathmandu getting married always wanted start business focusing serving kind sikkimese food grew eating helping husbands pashmina business kathmandu raising children little time anything else nation went lockdown march familys pashmina shop thamel shut bhutia found free time april nearly month lockdown bhutia launched kinchus kitchen long launching business took since kinchus kitchen sold thousands plates momos hundreds bottles meat pickles pandemic subsequent lockdowns crippled businesses across country job losses pay cuts become common across industries however economically turbulent times like bhutia started online businesses taken friends tried food always told start food business never really thought would actually start midst pandemic sa

so there is no stopwords here.

In [52]:
# function for lemmatization
def lemmatization(text):
    #tokens = [text.split()]
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
    newString = nlp(text)
    text_out = []
    text_out.append([word.lemma_ for word in newString if word.pos_ in allowed_postags])
    return text_out

In [53]:
%%time
import spacy

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data['Article'] = data['Article'].apply(lemmatization)

Wall time: 35.3 s


In [54]:
print(data['Article'][0])

[['morning', 'kinchu', 'buy', 'meat', 'buying', 'meat', 'quality', 'control', 'measure', 'business', 'sell', 'frozen', 'chicken', 'pickle', 'long', 'time', 'native', 'sikkim', 'move', 'get', 'marry', 'always', 'want', 'start', 'business', 'focus', 'serve', 'kind', 'sikkimese', 'food', 'grow', 'eat', 'help', 'husband', 'business', 'kathmandu', 'raise', 'child', 'little', 'time', 'else', 'nation', 'go', 'find', 'free', 'time', 'nearly', 'month', 'launch', 'long', 'launch', 'business', 'take', 'sell', 'thousand', 'plate', 'momos', 'hundred', 'bottle', 'meat', 'pickle', 'pandemic', 'subsequent', 'lockdown', 'cripple', 'business', 'country', 'job', 'loss', 'pay', 'cut', 'become', 'common', 'industry', 'however', 'economically', 'turbulent', 'time', 'start', 'online', 'business', 'take', 'friend', 'try', 'food', 'always', 'tell', 'start', 'food', 'business', 'never', 'really', 'think', 'would', 'actually', 'start', 'midst', 'say', 'month', 'start', 'start', 'online', 'bake', 'enthusiast', 'b

In [35]:
def func(docs):
    x = docs[0]
    return x

data['Article'] = data['Article'].apply(func) # flatten list

In [37]:
print(data['Article'][0])

['morning', 'kinchu', 'buy', 'meat', 'buying', 'meat', 'quality', 'control', 'measure', 'business', 'sell', 'frozen', 'chicken', 'pickle', 'long', 'time', 'native', 'sikkim', 'move', 'get', 'marry', 'always', 'want', 'start', 'business', 'focus', 'serve', 'kind', 'sikkimese', 'food', 'grow', 'eat', 'help', 'husband', 'business', 'kathmandu', 'raise', 'child', 'little', 'time', 'else', 'nation', 'go', 'find', 'free', 'time', 'nearly', 'month', 'launch', 'long', 'launch', 'business', 'take', 'sell', 'thousand', 'plate', 'momos', 'hundred', 'bottle', 'meat', 'pickle', 'pandemic', 'subsequent', 'lockdown', 'cripple', 'business', 'country', 'job', 'loss', 'pay', 'cut', 'become', 'common', 'industry', 'however', 'economically', 'turbulent', 'time', 'start', 'online', 'business', 'take', 'friend', 'try', 'food', 'always', 'tell', 'start', 'food', 'business', 'never', 'really', 'think', 'would', 'actually', 'start', 'midst', 'say', 'month', 'start', 'start', 'online', 'bake', 'enthusiast', 'ba

In [80]:
def tfidf(data):
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words = 'english', sublinear_tf=True)
    matrix = tfidf_vectorizer.fit_transform(data)
    return matrix, tfidf_vectorizer

def sentence_score(sentence: str, word_scores: dict):
    words = sentence.split()
    if len(words) < 50:
        score = sum([word_scores.get(w.lower(),0) for w in words])
    else:
        score=0
    return score


# This function only summarizes one research paper at a time.
def summarize(article_title):
    articletext = newsarticles[newsarticles.Title == article_title].Article
    list_corpus = list(articletext)
    tfidf_matrix, tfidf_vectorizer = tfidf(list_corpus)  # tfidf_matrix shape is 1*1
    
    # Creating a dataframe with the words scores
    # below dataframe shape will be (1,1)
    word_scores_df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf_vectorizer.get_feature_names())   
    word_scores = dict(zip(list(word_scores_df.columns), list(word_scores_df.iloc[0])))  

    # Split into sentences and replacing irrelevant characters
    sentences_list = [nlp(s) for s in list(articletext.str.replace('\n', '.').replace('\r', '.'))]
    sentences_list = str(sentences_list[0]).split('.')  # sentence tokenizing

    # Assign scores and join the top sentences into the final summary
    sentences_scores = {}
    for s in sentences_list:
        sentences_scores[s] = sentence_score(s, word_scores)

    top_sentences = nlargest(5, sentences_scores, key=sentences_scores.get)
    top_sentences = [s for s in top_sentences ]
    summary = '. '.join(top_sentences)
        
    return summary

In [81]:
def related_articles(search_word):
    df_table = pd.DataFrame(columns = ["Date","Title","ArticleSummary"])
    summaries = []
    
    articles_df = newsarticles[newsarticles['Category'] == search_word]
    df_table.Date = list(articles_df.date)
    df_table.Title = list(articles_df.Title)
    
    # create summaries
    for title in list(df_table.Title):
        summaries.append(summarize(title))
    df_table.ArticleSummary = summaries
    return df_table

pd.options.display.max_colwidth = 1500

In [82]:
df1 = related_articles("National")  # summary for "National" newscategory
df1

Unnamed: 0,Date,Title,ArticleSummary
0,"September 6, 2020","For some, the pandemic is an opportunity to start new businesses","“Friends who had tried my food have always told me to start my own food business but I never really thought I would actually start it in the midst of a pandemic,” said Bhutia. “During the lockdown, our family business had to be shut, and I had ample free time to bake, and I resumed baking and shared my baked products with family, friends, and neighbours,” said Manandhar. Two months after Bhutia started her business, Sushmita Manandhar started her online bakery shop (bakeaway_Nepal), on June 16. Manandhar gets an average of five to six orders a day for cakes, muffins and cupcakes, and in a week she gets an average of six birthday cake orders. But when the orders started increasing, I realised having home delivery services makes things much more convenient for customers,” said Manandhar"
1,"September 6, 2020",Smugglers in Dadeldhura haphazardly fell trees on pretext of constructing road,"The incident came to light on August 22 when forest officials confiscated a large quantity of green sal logs on the road section that connects Godam bazaar in Aalital to Dola Village. “Trees of Rajani Community Forest and the national forest adjacent to the community forest were felled haphazardly on the pretext of constructing the road,” said Ram Bahadur Kunwar, a local of Godam bazaar. Locals said they cannot openly talk about the smugglers and timber smuggling in the forest because of an incident in which an individual, who provided information to the media about timber smuggling, was burnt alive by smugglers a decade ago. “The spokesperson at the Division Forest Office in Dadeldhura will provide additional information about the tree felling in Rajani Community Forest and its vicinity. Hundreds of trees have been felled down on the pretext of constructing a ring road in the Chure forest area of Aalital Rural Municipality in Dadeldhura"
2,"September 6, 2020",Around 40 percent government staff haven’t received their salaries yet,"For staffers at a government office to receive salaries, the office chief and chief accountant need to sign a payment order requesting the district treasury office to release funds. “For example, the district treasury office in Singh Durbar has released the salaries of around 90 of the 100 offices under its ambit and the one in Teku has released the salaries of around 80 of the 87 offices it deals with,” said Bhurtel. “The district treasury offices have released funds for the salaries of only 60 percent of government employees so far,” said Deputy Financial Comptroller General Bhesh Prasad Bhurtel. Usually, the district treasury offices start releasing funds for salaries weeks before the end of the month based on financial details submitted by the concerned government offices. The Rautahat District Treasury Comptroller’s Office has not released the salaries of provincial staffers based in the district for Shrawan (mid-July-mid-August)"
3,"September 6, 2020",Relief distribution programme in Birgunj fails to cover vulnerable families,"Meanwhile, Chief District Officer of Parsa Asman Tamang said the district administration had started distributing reliefs to impoverished families through the District Police Office and Birgunj Metropolis since the past few days. On Thursday, many residents of Ranighat in Birgunj Metropolis reached the District Police Office in Parsa to receive relief after hearing about the police office’s distribution programme through a local FM station. Khatun had also reached the police office on Thursday to receive relief but was sent away since her name was not on the list of needy people prepared by the police office. The office had collected data of the needy families in the metropolis but on Thursday those whose names were not listed had also reached the police office. A resident of Ranighat in Birgunj Metropolis-11, Devi says her family has been robbed of the opportunity to work since the prohibitory order was imposed in the metropolis on July 25 to control the spread of Covid-19"
4,"September 6, 2020",Job schemes come into question as virus-induced unemployment hits rural youths hard,"After Covid-19 rattled the world’s economy, leading to massive job losses for Nepali workers at home and abroad, the Nepal government has pledged to create more than 700,000 jobs in the current fiscal year to provide employment to those affected by the pandemic. The Ministry of Labour, Employment and Social Security says that the programme generated employment opportunities for its target of 60,ooo people across the country in the fiscal year 2019-20. Besides, the Labour Ministry has also received a soft loan of approximately Rs14 billion from the World Bank to implement the Youth Employment Transformation Initiative Project, a five-year project, under the employment programme. ” Two years into its implementation, the Prime Minister Employment Programme, which promises a minimum 100 days of employment for unemployed registered citizens, have not shown promising results. Purna Chandra Bhattarai, a former labour secretary, said the programme, despite promising a minimum 100 days of employment, has not been able to provide jobs for 50 days"
5,"September 5, 2020",Police suspect illegal gambling dens are thriving during Covid-19 restrictions,"The number of arrests on illegal gambling charges has gone down as a result, say police who suspect that illegal gambling dens have thrived during these past six months of Covid-19 lockdown and restrictions. According to the data provided by Metropolitan Police Office, Ranipokhari a total of 22 people were rounded up from illegal gambling joints in Kathmandu Valley in the last six months. According to the data maintained by the Metropolitan Police Office, Rani Pokhari, more than Rs 19 million was recovered from 1,161 suspected gamblers from different parts of the Valley in the fiscal year 2018/19. “We have not conducted that many raids in private houses in the past six months, as our personnel are busy due enforcing Covid-19 restrictions,” said Senior Superintendent of Police, Sushil Kumar Yadav, spokesperson for the Metropolitan Police Office, Ranipokhari. Every year, Nepal Police crack down on hundreds of illegal gambling dens which are operated mostly in private houses across the country and confiscate millions of rupees"
6,"September 5, 2020",Overstay fines become a hurdle for Nepali workers waiting to return home from the UAE,"There are many Nepali workers in the UAE, including Khem and his three friends, whose visas expired after the March 1 cutoff date and now face overstay fines and thereby unable to return home unless they pay the fines. According to Som Prasad Lamichhane, director with the Pravasi Nepali Coordination Committee, an organisation working for the rights of Nepali migrant workers, Nepal government could have taken a diplomatic initiative to waive off the overstay fines of migrant workers. The UAE government’s ongoing amnesty programme for foreigners, including Nepali migrants, have given a three-month grace period to the foreigners whose visa expired before March 1, meaning they have until November to return to their countries without facing any punishments or fines for overstaying their visas. “The UAE government had approached the Nepal government about sending back Nepali workers on free flights, but the Nepal government did not even give the landing permission,” Sherpa said. According to Pasang Sherpa, vice-chairman of Non-Resident Nepalese Association (NRNA), UAE chapter, overstay fines are being charged only to those workers boarding their flights via Dubai airport as different emirates have different rules"
7,"September 5, 2020",Repatriation quandary persists even with funds to support migrant workers’ airfare,"The guidelines state that the concerned recruiting agencies and foreign missions must first ensure the migrant workers have not received air tickets or other financial aid from their employers or host nations before they get support from the welfare fund to return home. The Supreme Court on June 15 ordered the government to repatriate the Nepali migrant workers stranded in various labour destinations by using the Foreign Employment Welfare Fund. He argues the guidelines simply instruct the Nepali missions to contact the employers, recruiting agencies, or the host governments and check if the workers are getting any financial support for airfare so that they could return home. Shom Prasad Luitel, lawyer who had filed the petition at the Supreme Court for repatriation of Nepali migrant workers, says the government and its embassies in labour hosting countries are not committed to the task of repatriating the stranded citizens. Din Bandhu Subedi, spokesperson for the Foreign Employment Board, which manages the welfare fund, said the board had released Rs 4 million to the Nepal Embassy in Malaysia to send back 86 stranded workers"
8,"September 5, 2020",How politicians' chopper rides to disaster-hit areas fly in the face of spirit of federalism,"The Disaster Risk Reduction and Management Act 2017 also envisions Disaster Risk Reduction and Management Provincial Council, Disaster Risk Reduction and Management Provincial Executive Committee in each of the seven provinces and local disaster management committees in the 753 local governments. After the 2015 constitution ensured Nepal as a federal democratic republic, elections held in 2017 installed three tiers of government across the country–a federal government in Kathmandu, seven provincial governments and 753 local governments. “I do not see any point in ministers and leaders from Kathmandu making rounds of every other disaster-hit areas, as provincial and local governments are already in place to respond to such events” said Khimlal Devkota, an expert on federal affairs who also writes extensively on fiscal federalism. The federal government is already facing criticism for holding on to the concept of chief district officers and making them work as the Home Ministry’s liaison in the districts even in the current federal set-up. While there is a worldwide practice of prime ministers or presidents visiting the places hit by a disaster or crisis of a large scale, such events are largely meant to instill the feeling into the citizens that the state is there to support them"
9,"September 4, 2020",Race begins for new finance minister as Oli bids farewell to Khatiwada,"Yubaraj Khatiwada resigned as finance and information and communication technology minister on Friday, a day after the ruling Nepal Communist Party decided not to nominate him, again, as a member of the National Assembly, a prerequisite for him to continue in the Oli Cabinet. “It had become obvious that the prime minister would bid farewell to the finance and communication minister, which he did today,” said Subash Nembang, a Standing Committee member who is a close confidante of Oli. “Its political meaning is that Oli did not want to show that he is in a minority as Bishnu Poudel won’t go against Gautam,” said a Standing Committee member close to Madhav Nepal. The Dahal faction, backed by senior leaders Madhav Kumar Nepal and Jhala Nath Khanal, had piled pressure on Oli to resign both as party chair and prime minister. Leaders from the Dahal-Nepal faction said the Oli himself proposed Gautam for the Upper House as soon as the Secretariat meeting began on Thursday, sensing that annoying Gautam could push him into the minority"


In [83]:
df1['ArticleSummary'][0]

'  “Friends who had tried my food have always told me to start my own food business but I never really thought I would actually start it in the midst of a pandemic,” said Bhutia.  “During the lockdown, our family business had to be shut, and I had ample free time to bake, and I resumed baking and shared my baked products with family, friends, and neighbours,” said Manandhar.   Two months after Bhutia started her business, Sushmita Manandhar started her online bakery shop (bakeaway_Nepal), on June 16.   Manandhar gets an average of five to six orders a day for cakes, muffins and cupcakes, and in a week she gets an average of six birthday cake orders.  But when the orders started increasing, I realised having home delivery services makes things much more convenient for customers,” said Manandhar'