In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import unicodedata
import re
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import contractions 

# Load the spaCy 'en_core_web_sm' pipeline at import time; lemmatize_text() runs
# every document through this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Inshorts listing pages to scrape, one per news category.
seed_urls = [
    f"https://inshorts.com/en/read/{category}"
    for category in ("technology", "sports", "world")
]

In [3]:
def build_dataset(seed_urls):
    """Scrape headline/article pairs from each seed URL into a DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Listing pages to fetch. The last non-empty path segment of each URL
        is used as the ``news_category`` label for its articles.

    Returns
    -------
    pandas.DataFrame
        One row per scraped article with the columns ``news_headline``,
        ``news_article`` and ``news_category`` (in that order). When nothing
        was scraped the frame is empty but still carries these columns.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched (connection error, timeout, or an HTTP
        error status).
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so a trailing slash does not produce an empty category label.
        news_category = url.rstrip('/').split('/')[-1]

        # Bound the wait, and fail loudly on 4xx/5xx instead of silently
        # parsing an error page that contains no news cards.
        data = requests.get(url, timeout=30)
        data.raise_for_status()
        soup = BeautifulSoup(data.content, 'html.parser')

        cards = zip(
            soup('div', class_=["news-card-title news-right-box"]),
            soup('div', class_=["news-card-content news-right-box"]),
        )
        for headline, article in cards:
            headline_tag = headline.find('span', attrs={'itemprop': 'headline'})
            article_tag = article.find('div', attrs={'itemprop': 'articleBody'})
            if headline_tag is None or article_tag is None:
                # Card lacks the expected markup; skip it rather than crash
                # with AttributeError on None.
                continue
            news_data.append({
                # get_text() instead of .string: .string is None whenever the
                # tag has more than one child (e.g. nested markup).
                'news_headline': headline_tag.get_text(),
                'news_article': article_tag.get_text(),
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and keeps the schema even when
    # no rows were collected (plain column selection would raise KeyError).
    return pd.DataFrame(news_data, columns=columns)

In [4]:
# Scrape every seed page into one DataFrame (build_dataset performs live HTTP requests).
news_df = build_dataset(seed_urls)

In [5]:
# Preview the first 10 scraped rows.
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,Beeple among 3 most valuable living artists af...,"US-based graphic designer Mike Winkelmann, the...",technology
1,Jhunjhunwala-backed Nazara Tech to open ₹583-c...,"Mumbai-based Nazara Technologies, backed by bi...",technology
2,Netflix tests way to prevent password sharing;...,Netflix is trialling a way to prevent password...,technology
3,Apple sues ex-employee for allegedly stealing ...,Apple is suing a former employee on accusation...,technology
4,LED TV prices to rise from April as open-cell ...,Prices of LED televisions may rise further fro...,technology
5,China denies plan for near $1 bn fine on Aliba...,Chinese market regulator has refuted a Wall St...,technology
6,Ant Group CEO Simon Hu resigns due to 'persona...,China's Ant Group CEO Simon Hu has resigned fr...,technology
7,"Bitcoin tops $57,000 as US passes $1.9 tn COVI...","Bitcoin prices crossed $57,000-mark for the fi...",technology
8,Govt likely to block carriers from using Huawe...,The government is likely to block Indian mobil...,technology
9,Microsoft willing to break the way open web wo...,"Google has claimed Microsoft is willing ""to br...",technology


In [6]:
# Preview the last 10 scraped rows.
news_df.tail(10)

Unnamed: 0,news_headline,news_article,news_category
65,UK tells its citizens to flee Myanmar as 'leve...,The UK has urged its citizens to flee Myanmar ...,world
66,"Nepal bans taking pictures, videos of stranger...",The Nepal government has banned expedition tea...,world
67,Army is threatening my daughter Maryam: Ex-Pak...,Former Pakistan PM Nawaz Sharif said that the ...,world
68,65 media workers killed worldwide in 2020 whil...,Sixty-five journalists and media workers were ...,world
69,S Korea to suspend defence exchanges with Myan...,South Korea's Foreign Ministry has said that i...,world
70,UK economy shrinks 2.9% in January over COVID ...,"The UK economy shrank 2.9% in January, as per ...",world
71,EU declared an 'LGBTIQ Freedom Zone' in respon...,European Parliament passed a resolution on Thu...,world
72,"US state passes bill to end yoga ban, use of '...",US' Alabama has approved a bill that'd allow p...,world
73,US issues notification to delay mandatory mini...,US President Joe Biden's administration on Fri...,world
74,US offers temporary refuge to Myanmar national...,US President Joe Biden's administration on Fri...,world


In [7]:
# Number of scraped articles per category.
news_df['news_category'].value_counts()

sports        25
world         25
technology    25
Name: news_category, dtype: int64

In [8]:
def strip_html_tags(text):
    """Return the text content of ``text`` with all HTML markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` and the
    concatenated text of the resulting tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [9]:
def remove_accented_chars(text):
    """Fold accented characters in ``text`` to plain ASCII.

    The string is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks. Encoding to ASCII with ``'ignore'`` then
    drops those marks, along with every other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [10]:
def remove_special_chars(text, remove_digits = False):
    """Replace every special character in ``text`` with a single space.

    Letters and whitespace are always kept. Digits are kept unless
    ``remove_digits`` is true, in which case they are blanked out as well.
    Replacements are one-for-one, so runs of spaces are not collapsed here.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, ' ', text)

In [11]:
def simple_stemmer(text):
    """Return ``text`` with each whitespace-separated word Porter-stemmed.

    Words are split on whitespace, stemmed individually, and re-joined with
    single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [12]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    The text is processed with the module-level ``nlp`` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        # Tokens whose lemma is the literal '-PRON-' placeholder keep their
        # original surface text instead.
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

In [13]:
# Shared tokenizer and stopword list consumed by remove_stopwords().
tokenizer = ToktokTokenizer()
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed locally — confirm.
stopword_list = nltk.corpus.stopwords.words('english')
# Drop the negations from the stopword list so 'no' / 'not' survive filtering.
stopword_list.remove('no')
stopword_list.remove('not')

In [14]:
def remove_stopwords(text, is_lower_case = False):
    """Return ``text`` with all tokens found in ``stopword_list`` removed.

    The text is split with the module-level ``tokenizer``; surviving tokens
    are joined with single spaces. When ``is_lower_case`` is true the tokens
    are compared as-is; otherwise each token is lower-cased for the comparison
    only, and its original casing is kept in the output.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [15]:
def normalize_corpus(corpus, html_stripping = True, contraction_expansion = True, 
                     accented_char_removal = True, text_lower_case = True,
                     text_lemmatization = True, special_char_removal = True,
                     stopword_removal = True, remove_digits = True):
    """Apply the text-cleaning pipeline to every document in ``corpus``.

    Steps run in this fixed order, each optional one gated by its flag:

    1. ``html_stripping``        - strip HTML tags (``strip_html_tags``).
    2. ``accented_char_removal`` - fold accents to ASCII (``remove_accented_chars``).
    3. ``contraction_expansion`` - expand contractions (``contractions.fix``).
    4. ``text_lower_case``       - lower-case the text.
    5. (always) replace runs of line breaks with one space.
    6. ``text_lemmatization``    - lemmatize (``lemmatize_text``).
    7. ``special_char_removal``  - blank out special characters, and digits
       too when ``remove_digits`` is true (``remove_special_chars``).
    8. (always) collapse runs of spaces into one space.
    9. ``stopword_removal``      - drop stopwords (``remove_stopwords``).

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.

    Returns
    -------
    list of str
        The normalized documents, in input order.
    """
    # Loop-invariant patterns, compiled once instead of once per document.
    # Only CR/LF belong in this class: the previous pattern also listed '|',
    # which is a literal inside [...] and so replaced pipe characters too.
    line_break_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped on purpose: an unescaped '(-)' inside a class is
    # the range '(' to ')', so '-' was never matched. The final output is
    # unchanged, because remove_special_chars blanks these characters anyway.
    punctuation_pattern = re.compile(r'([{.()\-!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    for doc in corpus:
        if html_stripping:
            doc = strip_html_tags(doc)
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        if contraction_expansion:
            doc = contractions.fix(doc)
        if text_lower_case:
            doc = doc.lower()
        doc = line_break_pattern.sub(' ', doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if special_char_removal:
            # Pad punctuation with a leading space so it is separated from the
            # preceding word before being blanked out.
            doc = punctuation_pattern.sub(" \\1", doc)
            doc = remove_special_chars(doc, remove_digits = remove_digits)
        doc = multi_space_pattern.sub(' ', doc)
        if stopword_removal:
            # Tokens are already lower-case only if lower-casing was applied.
            doc = remove_stopwords(doc, is_lower_case = text_lower_case)

        normalized_corpus.append(doc)
    return normalized_corpus

In [16]:
# Build one document per article: "<headline>. <article body>".
news_df['full_text'] = (
    news_df["news_headline"].map(str) + '. ' + news_df["news_article"]
)

# Run the default normalization pipeline and keep a plain-list copy of the result.
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = news_df['clean_text'].tolist()

# Spot-check one row: raw text next to its normalized form.
news_df[['full_text', 'clean_text']].iloc[8].to_dict()

{'full_text': 'Govt likely to block carriers from using Huawei equipment: Report . The government is likely to block Indian mobile carriers from using Huawei\'s telecom equipment, Reuters reported, citing government officials. This comes after the government said it would publish a list of "trusted sources" for operators to buy telecom equipment from. The government could also create a "no procurement" blacklist which could include China\'s Huawei, the report said.',
 'clean_text': 'govt likely block carrier use huawei equipment report government likely block indian mobile carrier use huawei telecom equipment reuter report cite government official come government say would publish list trust source operator buy telecom equipment government could also create no procurement blacklist could include china huawei report say'}

In [17]:
# news_df.to_csv('new.csv', index = False, encoding = 'utf-8')