In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import unicodedata
import re
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import contractions 

# Load the small English spaCy pipeline once at import time; lemmatize_text()
# relies on this shared module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Section landing pages to scrape. build_dataset() takes the last path segment
# of each URL ("world", "technology", "science") as the category label.
seed_urls = [
    "https://www.nytimes.com/section/world",
    "https://www.nytimes.com/section/technology",
    "https://www.nytimes.com/section/science",
]

# Single-section URL for quick experiments (same value as the first seed).
seed_test = seed_urls[0]

In [3]:
def build_dataset(seed_urls, timeout=30):
    """Scrape headline/summary pairs from each section page into a DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Section page URLs. The last non-empty path segment of each URL is
        used as the ``news_category`` label.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article``, ``news_category``, one
        row per article card that contains both a headline and a summary.
        When nothing is scraped the frame is empty but still has all three
        columns.

    Raises
    ------
    requests.HTTPError
        If a section page answers with a 4xx/5xx status.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so ".../section/world/" still yields "world", not "".
        news_category = url.rstrip('/').split('/')[-1]
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of silently parsing an error page into zero rows.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for card in soup.find_all('div', class_=["css-1cp3ece"]):
            headline = card.find('h2', attrs={'class': 'css-1j9dxys e1xfvim30'})
            summary = card.find('p', attrs={'class': 'css-1echdzn e1xfvim31'})
            # Cards missing either element are skipped rather than crashing
            # the whole scrape with an AttributeError on None.
            if headline is None or summary is None:
                continue
            news_data.append({
                # get_text() returns a plain str even when the element holds
                # nested markup, where `.string` would be None.
                'news_headline': headline.get_text(),
                'news_article': summary.get_text(),
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and keeps the schema intact
    # even when no rows were collected.
    return pd.DataFrame(news_data, columns=columns)

In [4]:
# Scrape every seed section into one DataFrame (this performs live HTTP requests).
news_df = build_dataset(seed_urls)

In [5]:
# Preview the first 10 scraped rows.
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,Sorry About Your Sleep,A group of U.S. senators has joined the ranks ...,world
1,Biden Takes First Tentative Steps to Address G...,Under pressure to play catch-up on “vaccine di...,world
2,The pace of U.S. vaccinations has been acceler...,The rate of vaccinations has ramped up about 4...,world
3,Too Much on the Bottom and Not Enough in the M...,An online post of a Nanaimo bar photo swiftly ...,world
4,W.H.O. Grants Johnson & Johnson Vaccine Emerge...,The World Health Organization’s approval on Fr...,world
5,A Green Wave? Mexico’s Marijuana Market May Be...,Lawmakers in Mexico are on the verge of legali...,world
6,The W.H.O. grants emergency authorization to t...,The vaccine is now eligible for distribution t...,world
7,Why the Rape Claim Against Australia’s Attorne...,Some see unsettling parallels with another pow...,world
8,Michigan widens vaccine access to include all ...,Gov. Gretchen Whitmer announced the changes a ...,world
9,"As more of the U.S. returns to indoor dining, ...",About 17 states have made restaurant workers e...,world


In [6]:
# Preview the last 10 scraped rows.
news_df.tail(10)

Unnamed: 0,news_headline,news_article,news_category
20,Coyote That Attacked Five in Bay Area Is Final...,"The animal, which had bitten five people, incl...",science
21,Countries Tried to Curb Trade in Plastic Waste...,Data shows that American exporters continue to...,science
22,Why Older People Managed to Stay Happier Throu...,New surveys over the last year show that the a...,science
23,"A Year of Risk, Fear and Loss for Families in ...","For many nurses and doctors, medicine was an i...",science
24,European Countries Suspend Use of AstraZeneca ...,Millions of people have received the vaccine w...,science
25,More Childhood Lead Poisoning Is a Side Effect...,Lead screenings for children plummeted last sp...,science
26,Senate Confirms Biden’s Pick to Lead E.P.A.,Michael S. Regan has said he intends to act ag...,science
27,"Sickle Cell Treatment Not Linked to Cancer, Re...",Trials of experimental gene therapy for sickle...,science
28,Las mujeres informan de peores efectos secunda...,Los hombres y las mujeres suelen responder de ...,science
29,China and Russia Agree to Explore the Moon Tog...,"The two countries, moving increasingly closer,...",science


In [7]:
# Count how many rows were scraped for each category.
news_df.news_category.value_counts()

technology    10
world         10
science       10
Name: news_category, dtype: int64

In [8]:
def strip_html_tags(text):
    """Parse `text` with the built-in HTML parser and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

In [9]:
def remove_accented_chars(text):
    """Fold `text` to plain ASCII.

    The text is NFKD-decomposed so accented letters split into a base letter
    plus combining marks; anything that cannot be encoded as ASCII is then
    dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [10]:
def remove_special_chars(text, remove_digits = False):
    """Replace every character that is not a letter or whitespace with a space.

    Digits are kept unless `remove_digits` is true, in which case they are
    replaced with spaces as well.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.compile(disallowed).sub(' ', text)

In [11]:
def simple_stemmer(text):
    """Stem each whitespace-separated word of `text` with NLTK's Porter stemmer.

    Returns the stems joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [12]:
def lemmatize_text(text):
    """Run `text` through the module-level spaCy pipeline and join the token lemmas.

    Tokens whose lemma is the literal '-PRON-' keep their original text instead.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [13]:
# Shared tokenizer and stopword list used by remove_stopwords().
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# Take 'no' and 'not' out of the stopword list so remove_stopwords() keeps them.
stopword_list.remove('no')
stopword_list.remove('not')

In [14]:
def remove_stopwords(text, is_lower_case = False):
    """Tokenize `text` and drop every token found in the module-level stopword list.

    When `is_lower_case` is true, tokens are compared as-is; otherwise each
    token is lower-cased for the comparison only, so kept tokens retain their
    original casing. The surviving tokens are joined with single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup = token if is_lower_case else token.lower()
        if lookup not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

In [15]:
def normalize_corpus(corpus, html_stripping = True, contraction_expansion = True, 
                     accented_char_removal = True, text_lower_case = True,
                     text_lemmatization = True, special_char_removal = True,
                     stopword_removal = True, remove_digits = True):
    """Apply the text-cleaning pipeline to every document in `corpus`.

    Steps run in this order, each controlled by its flag: HTML stripping,
    ASCII folding of accented characters, contraction expansion,
    lower-casing, line-break flattening (always), lemmatization,
    special-character removal, space collapsing (always), stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    html_stripping, contraction_expansion, accented_char_removal,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool
        Enable or disable the corresponding step.
    remove_digits : bool
        Passed to remove_special_chars(); only relevant when
        `special_char_removal` is true.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    # Compile once instead of once per document.
    # Only CR/LF are line breaks; the previous class also contained '|' and so
    # silently replaced literal pipe characters.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so it is matched literally; unescaped between '('
    # and ')' it formed a character range that never matched '-'.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    multi_space_pattern = re.compile(' +')
    # Typographic quotes have no ASCII decomposition, so ASCII folding would
    # delete them outright ("don’t" -> "dont") before contraction expansion
    # ever sees the apostrophe. Map them to their ASCII equivalents first.
    quote_table = str.maketrans({
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"',
    })

    normalized_corpus = []
    for doc in corpus:
        if html_stripping:
            doc = strip_html_tags(doc)
        if accented_char_removal:
            doc = remove_accented_chars(doc.translate(quote_table))
        if contraction_expansion:
            doc = contractions.fix(doc)
        if text_lower_case:
            doc = doc.lower()
        doc = newline_pattern.sub(' ', doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if special_char_removal:
            # Put a space in front of punctuation so it cannot glue two words
            # together, then replace the special characters themselves.
            doc = special_char_pattern.sub(" \\1", doc)
            doc = remove_special_chars(doc, remove_digits = remove_digits)
        doc = multi_space_pattern.sub(' ', doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case = text_lower_case)

        normalized_corpus.append(doc)
    return normalized_corpus

In [18]:
# Build one document per article: "<headline>. <summary>".
news_df['full_text'] = (
    news_df["news_headline"].map(str)
    + '. '
    + news_df["news_article"]
)

# Normalize every document and keep the cleaned corpus as a plain list as well.
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = news_df['clean_text'].tolist()

# Show the raw and cleaned text of the first article side by side.
news_df[['full_text', 'clean_text']].iloc[0].to_dict()

{'full_text': 'Sorry About Your Sleep. A group of U.S. senators has joined the ranks of those who want to abolish daylight saving time, which has roots in cost-cutting strategies of the late 19th century.',
 'clean_text': 'sorry sleep group senator join rank want abolish daylight saving time root cost cut strategy late th century'}

In [17]:
# news_df.to_csv('new.csv', index = False, encoding = 'utf-8')