In [5]:
import os
import requests
from bs4 import BeautifulSoup
import re
import sys
import time

# Function to get and parse the Medium article HTML
def fetch_article():
    """Prompt for a Medium article URL, download the page, and parse it.

    The URL is read from standard input and must start with
    ``http://medium.com/`` or ``https://medium.com/``. If the server answers
    with HTTP 429, the request is retried once after a 60-second wait.

    Returns:
        A ``(soup, url)`` tuple: the ``BeautifulSoup`` tree built from the
        response body with ``html.parser``, and the (whitespace-stripped)
        URL that was entered.

    Exits the process with status 1 (via ``sys.exit``) when the URL does not
    pass validation or when the request fails.
    """
    # Strip so a pasted URL with stray spaces/newlines still validates and
    # does not carry whitespace into later uses of the URL.
    url = input("Enter URL of a Medium article: ").strip()

    # The dot is escaped so only the literal host "medium.com" is accepted
    # (an unescaped "." would also match e.g. "mediumXcom").
    if not re.match(r'https?://medium\.com/.+', url):
        print('Please enter a valid Medium article URL.')
        sys.exit(1)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    # Without a timeout, requests can block indefinitely on a stalled server.
    timeout_seconds = 30

    try:
        response = requests.get(url, headers=headers, timeout=timeout_seconds)
        if response.status_code == 429:  # Too Many Requests
            print("Hit rate limit, waiting for 60 seconds...")
            time.sleep(60)
            # Single retry after the wait; a second 429 is reported below.
            response = requests.get(url, headers=headers, timeout=timeout_seconds)

        response.raise_for_status()  # Turn any 4xx/5xx status into an exception

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the article: {e}")
        sys.exit(1)

    return BeautifulSoup(response.text, 'html.parser'), url

# Function to extract text from paragraphs in the article
def extract_text(soup, url):
    """Build the article's plain text from every ``<p>`` element in *soup*.

    The result starts with a ``URL: <url>`` header line followed by a blank
    line; each paragraph's text is then appended, each followed by a blank
    line.
    """
    header = f"URL: {url}\n\n"
    body = "".join(
        element.get_text() + "\n\n" for element in soup.find_all('p')
    )
    return header + body

# Function to save the extracted text to a file
def save_article(text, url):
    """Write *text* to ``./scraped_articles/<slug>.txt`` and print the path.

    The slug is the last path segment of *url*, taken after removing any
    ``#fragment``, ``?query`` and trailing slashes. If nothing is left, the
    name ``article`` is used so the file is never just ``.txt``.

    Args:
        text: The content to write (encoded as UTF-8).
        url: The article URL the file name is derived from.
    """
    folder = './scraped_articles'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)

    # Drop fragment and query first so they never end up in the file name,
    # then trailing slashes so ".../slug/" still yields "slug".
    path_part = url.partition('#')[0].partition('?')[0].rstrip('/')
    slug = path_part.rsplit('/', 1)[-1] or 'article'

    filename = os.path.join(folder, f"{slug}.txt")

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)

    print(f"Article saved as {filename}")

# Script entry point: fetch one article, extract its text, and save it.
if __name__ == '__main__':
    # Prompts for the URL; exits the process on an invalid URL or failed request.
    soup, url = fetch_article()
    # Paragraph text prefixed with a "URL: ..." header.
    article_text = extract_text(soup, url)
    # Writes the text under ./scraped_articles and prints the resulting path.
    save_article(article_text, url)


Enter URL of a Medium article: https://medium.com/@ignacio.de.gregorio.noblejas/google-has-finally-dethroned-chatgpt-87a8f8c10d92
Hit rate limit, waiting for 60 seconds...
Article saved as ./scraped_articles/google-has-finally-dethroned-chatgpt-87a8f8c10d92.txt
