In [8]:
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup  # for parsing HTML, XML files
#%matplotlib inline

In [9]:
# Category pages to scrape; build_dataset uses the last path segment of each
# URL as the category label.
seed_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world',
]

def build_dataset(seed_urls, timeout=10):
    """Scrape news cards from each seed URL into a single DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Pages to fetch. The last non-empty path segment of each URL is used
        as the ``news_category`` value for every card found on that page.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so an unresponsive server cannot block the
        notebook indefinitely. Defaults to 10 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article``, ``news_category`` (in
        that order), one row per headline/article pair. The frame is empty
        but still has these columns when nothing was scraped.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or times out.
    AttributeError
        If a matched card does not contain the expected ``itemprop`` element.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so a trailing slash does not produce an empty category.
        news_category = url.rstrip('/').split('/')[-1]
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        title_boxes = soup.find_all('div',
                                    class_=["news-card-title news-right-box"])
        content_boxes = soup.find_all('div',
                                      class_=["news-card-content news-right-box"])

        # Titles and bodies are collected separately and paired by position.
        for title_box, content_box in zip(title_boxes, content_boxes):
            # <span itemprop="headline">some headline</span>
            headline = title_box.find('span', attrs={"itemprop": "headline"})
            article = content_box.find('div', attrs={"itemprop": "articleBody"})
            # get_text() returns a plain str and still works when the element
            # contains nested tags, where .string would return None.
            news_data.append({'news_headline': headline.get_text(),
                              'news_article': article.get_text(),
                              'news_category': news_category})

    # Passing columns= fixes the column order and keeps the schema intact
    # even when news_data is empty (selecting columns afterwards would raise
    # KeyError on an empty frame).
    return pd.DataFrame(news_data, columns=columns)

In [10]:
# Scrape every seed page (this performs live HTTP requests) and preview the
# first rows of the resulting frame.
news_df = build_dataset(seed_urls)
news_df.head()

Unnamed: 0,news_headline,news_article,news_category
0,"OPPO launches film under 'Be The Light, To Spr...",OPPO has launched a short film 'Phuljhari' und...,technology
1,Twitter issues statement after J&K shown as pa...,After a location tag in a live broadcast showe...,technology
2,OnePlus announces special offers on newly laun...,OnePlus has announced special offers on the ne...,technology
3,Sussanne says her Insta account was hacked aft...,"Actor Hrithik Roshan's ex-wife, entrepreneur S...",technology
4,Twitter shows J&K as China's territory; securi...,National security analyst Nitin A Gokhale took...,technology


In [11]:
# Display the scraped headline column.
news_df['news_headline']

0     OPPO launches film under 'Be The Light, To Spr...
1     Twitter issues statement after J&K shown as pa...
2     OnePlus announces special offers on newly laun...
3     Sussanne says her Insta account was hacked aft...
4     Twitter shows J&K as China's territory; securi...
5     We don't collect users' data on sex lives: Air...
6     Xiaomi issues statement after Arunachal disapp...
7     Pak lifts ban on TikTok after banning it over ...
8     Hiring your friends is a typical first-time CE...
9     Twitter removes Trump's COVID-19 advisor's pos...
10    Facebook's Messenger API will soon allow autom...
11    China revises laws to strengthen protection of...
12    UK's NHS tests drone to deliver COVID-19 kits ...
13    15 EU countries call for strategy to tackle fa...
14    Deutsche Telekom successfully tests 4G connect...
15    Facebook ex-exec offers $5,000 to jailbreak Fa...
16    Founder-led tech firms doubled their share pri...
17    South Korea's SK Hynix to buy Intel's NAND

In [4]:
# Number of scraped rows per category.
news_df.news_category.value_counts()

sports        25
world         24
technology    21
Name: news_category, dtype: int64

In [12]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Input text.
    remove_digits : bool, optional
        When True, digits are deleted as well. Defaults to False.

    Returns
    -------
    str
        ``text`` with the disallowed characters removed. Whitespace is kept
        as-is, so runs of spaces are not collapsed.
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters [ \ ] ^ _ ` and would let those special characters through.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Quick check: with remove_digits=True both the punctuation and "123" are dropped.
remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)  

'Well this was fun What do you think '

In [13]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Run every document in ``corpus`` through a configurable cleaning pipeline.

    Steps are applied in this order; each is gated by its flag unless noted:

    1. ``strip_html_tags``              (``html_stripping``)
    2. ``remove_accented_chars``        (``accented_char_removal``)
    3. ``expand_contractions``          (``contraction_expansion``)
    4. ``str.lower``                    (``text_lower_case``)
    5. replace runs of CR/LF with one space (always)
    6. ``lemmatize_text``               (``text_lemmatization``)
    7. pad ``{ . ( ) ! }`` with spaces, then ``remove_special_characters``
       (``special_char_removal``; ``remove_digits`` is forwarded to it)
    8. collapse runs of spaces to one space (always)
    9. ``remove_stopwords``             (``stopword_removal``)

    The helper functions in steps 1-3, 6 and 9 are defined outside this
    block; their exact behaviour is not visible here.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.

    Returns
    -------
    list of str
        One normalized document per input document, in input order.
    """
    # Loop-invariant patterns, compiled once.
    # The newline pattern is a plain character class: the earlier form
    # r'[\r|\n|\r\n]+' also contained a literal '|', which turned pipe
    # characters in the text into spaces.
    newline_pattern = re.compile(r'[\r\n]+')
    # NOTE(review): inside a character class "(-)" is the range '(' .. ')',
    # so '-' itself is NOT padded with spaces here. Left unchanged because
    # changing it alters tokenisation ("well-known" -> "wellknown" today);
    # confirm whether the hyphen was meant to be included.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace newlines / carriage returns with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around selected punctuation to isolate it, so that
            # deleting it afterwards does not glue neighbouring words together
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = multi_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [12]:
# Join headline and article body with '. ' into a single text column.
# Only the headline is cast with map(str); the article column is used as-is.
news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]
# Show the combined text of the first row as plain text.
print(news_df['full_text'][0])

OPPO launches film under 'Be The Light, To Spread The Light' campaign. OPPO has launched a short film 'Phuljhari' under its 'Be the light, to spread the light' campaign. Phuljhari is a story of a man who brings happiness on the face of two underprivileged kids by contributing to their desire to celebrate Diwali. OPPO aims to encourage people this Diwali to spread happiness with the F17 Pro Diwali Edition.
