In [1]:
import datetime
from datetime import timezone
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
from datetime import datetime

In [8]:
from datetime import timedelta

In [11]:
#### trying If-Modified-Since
# date format: If-Modified-Since: <day-name>, <day> <month> <year> <hour>:<minute>:<second> GMT 
# docs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/If-Modified-Since#syntax

In [3]:
url = "https://text.npr.org/"

In [6]:
# Current time as a timezone-aware UTC datetime.
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive value
# that is easy to mistake for local time; now(timezone.utc) is the replacement.
current_datetime = datetime.now(timezone.utc)

In [7]:
current_datetime

datetime.datetime(2025, 9, 29, 11, 59, 5, 963701)

In [69]:
last_updated_datetime = current_datetime - timedelta(minutes=30)

In [70]:
last_updated_datetime

datetime.datetime(2025, 9, 29, 11, 29, 5, 963701)

In [71]:
# the right format
last_updated_datetime.strftime("%a, %d %b %Y %H:%M:%S GMT")

'Mon, 29 Sep 2025 11:29:05 GMT'

In [72]:
last_modified = last_updated_datetime.strftime("%a, %d %b %Y %H:%M:%S GMT")

In [73]:
headers = {
    "If-Modified-Since": last_modified
}

In [74]:
# Conditional GET of the front page. Without an explicit timeout, requests
# waits forever on a stalled connection and the kernel appears to hang.
response = requests.get(url, headers=headers, timeout=10)

In [75]:
response.status_code

200

In [66]:
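# Got 200 instead of 304, so If-Modified-Since does not seem reliable for this site
# (caveat: the front page may simply have changed within the last 30 minutes).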

### parsing the first page

In [76]:
soup = BeautifulSoup(response.text, "html.parser")

In [77]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<title>NPR : National Public Radio</title>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width" name="viewport"/>
<link href="" id="favicon" rel="shortcut icon" type="image/png"/>
<style>
        body {
    display: block;
    padding: 0px 20px;
    max-width: 550px;
    margin: 0 auto;
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
}

.full-version-link {
    margin-left: 15px;
}

.slug-line {
    font-size: 1.1rem;
    margin-bottom: 15px;
}

.hr-line {
    position: relative;
    height: 4px;
}

.hr-line:after

In [86]:
# Collect the absolute URL of every article linked from the front page.
# urljoin resolves root-relative hrefs against `url` and leaves hrefs that are
# already absolute untouched; plain string concatenation would mangle those.
topic_container = soup.find('div', {'class': 'topic-container'})
all_urls = [
    urljoin(url, article_link['href'])
    for article_link in topic_container.find_all('a', {'class': 'topic-title'})
]

### parsing individual articles

In [87]:
all_urls[0]

'https://text.npr.org/g-s1-90959'

In [88]:
response_article = requests.get(all_urls[0]) # not using If-Modified-Since here

In [89]:
response_article.ok

True

In [90]:
soup_article = BeautifulSoup(response_article.text, "html.parser")

In [92]:
# heading
title = soup_article.find('h1',{'class':'story-title'})

In [95]:
title.text

'Top congressional leaders head to the White House ahead of shutdown deadline'

In [98]:
# header
header = soup_article.find('div',{'class':'story-head'})

In [101]:
# title
header.h1.text

'Top congressional leaders head to the White House ahead of shutdown deadline'

In [108]:
# author
header.find_all('p')[0].text.replace('By ','')

'Barbara Sprunt'

In [112]:
# date
raw_date = header.find_all('p')[1].text

In [110]:
import re

In [111]:
date_pattern = r"\w* \d{1,2}, \d{4}"

In [113]:
re_date = re.findall(date_pattern, raw_date)

In [115]:
re_date[0]

'September 29, 2025'

In [116]:
# converting

In [122]:
clean_date = datetime.strptime(re_date[0], "%B %d, %Y").strftime("%Y-%m-%d")

In [121]:
# required format
# Recomputed from re_date: clean_date already holds the formatted string, and a
# str has no .strftime, so calling it on clean_date fails on a fresh top-to-bottom run.
datetime.strptime(re_date[0], "%B %d, %Y").strftime("%Y-%m-%d")

'2025-09-29'

In [123]:
clean_date

'2025-09-29'

In [124]:
def _parse_single_article(article_url:str) -> list:
	'''
	Getting data for a single article: title, author, date
	article_url: str; 
	Returns a list of str ['title','author','date']
	'''

	article_data = []

	date_pattern = r'\w* \d{1,2}, \d{4}'

	response_article = requests.get(article_url) # not using If-Modified-Since here

	if response_article.ok == True:
		soup_article = BeautifulSoup(response_article.text, "html.parser")
		
		# getting header
		header = soup_article.find('div',{'class':'story-head'})
		
		# title
		title = header.h1.text
		article_data.append(title)

		# author
		author = header.find_all('p')[0].text.replace('By ','')
		article_data.append(author)

		# date
		raw_date = header.find_all('p')[1].text
		re_date = re.findall(date_pattern, raw_date)[0]
		# required date format
		date = datetime.strptime(re_date, "%B %d, %Y").strftime("%Y-%m-%d")
		article_data.append(date)

	return article_data

In [127]:
_parse_single_article(all_urls[5])

["Hundreds of Israeli soldiers were badly wounded in Gaza. Here's what saved them",
 'Daniel Estrin',
 '2025-09-29']

In [None]:
# master function

def get_arti