In [None]:
# Install required packages if not already installed
!pip install newspaper3k aiohttp tqdm -qqq

In [None]:
# Wired Article Scraper Notebook

import asyncio
import aiohttp
from newspaper import Article
import pandas as pd
from tqdm.notebook import tqdm
import nest_asyncio

In [None]:
# Patch asyncio so that asyncio.run() can be called from a notebook cell even
# though Jupyter already has an event loop running in this kernel.
nest_asyncio.apply()

In [None]:
# Function to extract text from a single URL
async def extract_article_text(session, url):
    """Download one article page and parse it with newspaper's ``Article``.

    Args:
        session: Object whose ``get(url, headers=...)`` returns an async context
            manager yielding the HTTP response (``process_urls`` passes an
            ``aiohttp.ClientSession``).
        url: Address of the article to fetch.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``authors`` and
        ``publish_date`` taken from the parsed article, or ``None`` when the
        download or parsing fails. Failures are printed, never raised, so one
        bad URL does not abort a whole batch.
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        async with session.get(url, headers=headers) as response:
            # Reject 4xx/5xx answers up front; otherwise the HTML of an error
            # page would be parsed and returned as if it were the article.
            response.raise_for_status()
            html = await response.text()

        # Hand the already-downloaded HTML to newspaper instead of letting it
        # fetch the page again. parse() is a plain synchronous call.
        article = Article(url)
        article.set_html(html)
        article.parse()

        return {
            'url': url,
            'title': article.title,
            'text': article.text,
            'authors': article.authors,
            'publish_date': article.publish_date
        }
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip it.
        print(f"Error processing {url}: {str(e)}")
        return None


In [None]:
# Main function to process all URLs
async def process_urls(urls, delay=1.0):
    """Scrape every URL, spacing the request start times ``delay`` seconds apart.

    Args:
        urls: Sequence of article URLs to fetch.
        delay: Seconds between the start of one request and the start of the
            next. Defaults to 1.0; pass 0 to start all requests immediately.

    Returns:
        A list of the dicts returned by ``extract_article_text`` for the URLs
        that succeeded. Failed URLs (``None`` results) are dropped. Items are
        collected as they finish, so the order may differ from ``urls``.
    """
    async def fetch_after(session, url, wait):
        # Sleeping *before* the request is what actually throttles traffic.
        # Sleeping while consuming results would not, because by then every
        # task has already been scheduled and has sent its request.
        if wait > 0:
            await asyncio.sleep(wait)
        return await extract_article_text(session, url)

    async with aiohttp.ClientSession() as session:
        # The i-th URL starts i * delay seconds after the first one.
        tasks = [
            asyncio.ensure_future(fetch_after(session, url, index * delay))
            for index, url in enumerate(urls)
        ]

        results = []
        for future in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing articles"):
            result = await future
            if result:
                results.append(result)

        return results

In [None]:
# URLs of the articles to scrape. Replace or extend this list with your own
# source of URLs (for example, lines read from a file).
urls = [
    "https://www.wired.com/story/applebot-extended-apple-ai-scraping/",
    "https://www.wired.com/story/undress-app-ai-harm-google-apple-login/",
    # Add more URLs here
]

In [None]:
# Scrape every URL in `urls`. asyncio.run() is called from a notebook cell
# here, which relies on nest_asyncio.apply() having been executed earlier.
results = asyncio.run(process_urls(urls))

In [None]:
# Build a DataFrame with one row per successfully scraped article; the columns
# come from the keys of the dicts returned by extract_article_text.
df = pd.DataFrame(results)

In [None]:
# Preview the first few rows to sanity-check the scraped data
df.head()

In [None]:
# Persist the scraped articles to 'wired_articles.parquet' (a path relative to
# the current working directory) without writing the DataFrame index.
df.to_parquet('wired_articles.parquet', index=False)