In [2]:
import json
import random
import re
from collections import Counter
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse

#### Getting data

In [3]:
# Download the sample page and parse it with Python's built-in HTML parser.
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"

response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, features='html.parser')

# First <p> tag in the document (`soup.p` is shorthand for the same lookup).
first_paragraph = soup.find('p')

# Its text content, and that text split on whitespace.
first_paragraph_text = first_paragraph.text
first_paragraph_words = first_paragraph.text.split()

# Subscript access raises KeyError when the tag has no 'id' attribute;
# .get('id') below returns None instead.
first_paragraph_id = first_paragraph['id']

# Every <p> tag, then only those carrying a non-empty id.
all_paragraphs = soup.find_all('p')
paragraphs_with_ids = [tag for tag in all_paragraphs if tag.get('id')]

# Two ways of selecting <p> tags whose class list contains 'important':
# an attribute filter, and an explicit membership test on each tag.
important_paragraphs = soup.find_all('p', {'class' : 'important'})
important_paragraphs2 = [tag for tag in all_paragraphs
                         if 'important' in tag.get('class', [])]

In [8]:
# Congressmen sites
# Collect every link on the House directory page, keep only those that look
# like a representative's own *.house.gov home page, then scan a sample of
# those sites for links whose text mentions "press releases".
url = "https://www.house.gov/representatives"
text = requests.get(url).text
soup = BeautifulSoup(text, 'html.parser')

# Anchors without an href are skipped; a['href'] would raise KeyError on them.
all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

# Must start with http:// or https:// and end with .house.gov or .house.gov/
regex = r"^https?://.*\.house\.gov/?$"
good_urls = [link for link in all_urls if re.match(regex, link)]

# Maps each scanned site URL to the set of press-release hrefs found on it.
press_releases: dict[str, set[str]] = {}

# Only the first 12 matching sites are scanned
# (presumably to limit the number of requests -- adjust the slice as needed).
for house_url in good_urls[:12]:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html.parser')
    # Same has_attr guard as all_urls: an anchor labelled "press releases"
    # but lacking an href must not abort the whole scan with a KeyError.
    pr_links = {a['href']
                for a in soup('a')
                if a.has_attr('href') and 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://carl.house.gov: {'/media/press-releases'}
https://barrymoore.house.gov: {'/media/press-releases'}
https://mikerogers.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://aderholt.house.gov/: {'/media-center/press-releases'}
https://strong.house.gov: set()
https://palmer.house.gov/: set()
https://sewell.house.gov/: {'/press-releases'}
https://peltola.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://radewagen.house.gov: set()
https://schweikert.house.gov/: set()
https://crane.house.gov: set()
https://rubengallego.house.gov/: set()


In [10]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Returns True if a <p> inside the text mentions {keyword}

    The check is a case-insensitive substring match against the text of
    each <p> element; text outside <p> tags is ignored.

    text: HTML markup to search.
    keyword: word or phrase to look for.
    """
    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # picks whichever parser happens to be installed, so results can differ
    # between environments.
    soup = BeautifulSoup(text, 'html.parser')
    needle = keyword.lower()
    return any(needle in p.get_text().lower() for p in soup('p'))

# Print each site that has at least one press-release page whose paragraphs
# mention 'late'.
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        # urljoin resolves the href against the site URL, so a root-relative
        # link ("/media/...") combined with a trailing slash on house_url
        # does not produce "//" in the path.
        url = urljoin(house_url, pr_link)
        text = requests.get(url).text

        # The check lives inside the inner loop: every fetched page is
        # tested, sites with no links are never tested against stale text,
        # and `break` only leaves this site's links so the outer loop
        # continues with the next site.
        if paragraph_mentions(text, 'late'):
            print(f"{house_url}")
            break # done with this house_url

https://peltola.house.gov


#### GitHub API for data collection

In [11]:
# Fetch the repository list for one GitHub account from the REST API.
github_user = 'CyrilleSmid'
endpoint = f"https://api.github.com/users/{github_user}/repos"

response_body = requests.get(endpoint).text
repos = json.loads(response_body)

# Parse each repo's creation timestamp, then tally by month and by weekday.
dates = [parse(entry["created_at"]) for entry in repos]
month_counts = Counter(created.month for created in dates)
weekday_count = Counter(created.weekday() for created in dates)

# Order by "pushed_at" descending, keep the first five, and show the
# "language" field of each.
repos_newest_first = sorted(repos,
                            key=lambda entry: entry["pushed_at"],
                            reverse=True)
last_5_repositories = repos_newest_first[:5]
last_5_languages = [entry["language"] for entry in last_5_repositories]
print(last_5_languages)

['C#', 'Jupyter Notebook', 'R', 'C++', 'HTML']
