# 📚 Web Scraping Toolkit

This notebook contains tools for scraping academic and general content from:
1. **Wikipedia** - General knowledge articles
2. **arXiv** - Academic papers with PDF content extraction

## Features

### Wikipedia Scraper
- Search by keywords
- Extract article content
- Returns structured data (title, content, URL)

### arXiv Scraper  
- Search academic papers by keywords
- **Extract PDF content** without saving files (`extract_content=True`)
- Optional PDF download (`save_pdf=True`)
- Date filtering
- Get article metadata (title, abstract, authors, etc.)
- **Default**: Extract content but don't save PDFs

## Article Data Structure

Each arXiv article returns:
- `title` - Paper title
- `abstract` - Paper abstract
- `authors` - List of authors
- `pdf_url` - Link to PDF
- `published` - Publication date
- `arxiv_id` - arXiv identifier
- **`content`** - Full text extracted from PDF (if `extract_content=True`)
- `local_path` - Path to saved PDF (if `save_pdf=True`)

---

# Scraping Wikipedia Articles

In [6]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import time

In [2]:
def search_wikipedia(query, limit=10):
    """Search Wikipedia and return matching articles.

    Queries the ``opensearch`` action of the English Wikipedia API.

    Args:
        query: Search keywords.
        limit: Maximum number of results to request (default: 10).

    Returns:
        List of ``(title, url)`` tuples. An empty list is returned (and the
        error is printed) when the request fails or the response cannot be
        parsed.
    """
    search_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "opensearch",
        "search": query,
        "limit": limit,
        "format": "json",
    }

    headers = {
        'User-Agent': 'WikipediaScraperBot/1.0 (Educational purposes)'
    }

    try:
        response = requests.get(search_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Returns [query, [titles], [descriptions], [urls]]
        titles = data[1]
        urls = data[3]

        return list(zip(titles, urls))
    except requests.exceptions.RequestException as e:
        print(f"Error searching Wikipedia: {e}")
        return []
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError: body was not valid JSON (older requests versions raise a
        # plain ValueError); TypeError: JSON had an unexpected shape.
        print(f"Error parsing Wikipedia response: {e}")
        return []

def scrape_wikipedia_article(url):
    """Fetch a Wikipedia page and return its title and paragraph text.

    Args:
        url: Address of the article to download.

    Returns:
        ``(title, text)`` where ``text`` is the non-empty ``<p>`` paragraphs
        joined by blank lines. ``(title, "")`` if the content container is
        missing, and ``(None, None)`` if the heading is missing or any error
        occurs (the error is printed).
    """
    request_headers = {
        'User-Agent': 'WikipediaScraperBot/1.0 (Educational purposes)'
    }
    # Elements carrying these classes are stripped before text is collected.
    noise_tags = ['table', 'div', 'sup', 'span']
    noise_classes = ['infobox', 'navbox', 'reference', 'mw-editsection']

    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        document = BeautifulSoup(page.text, 'html.parser')

        # Article heading
        heading = document.find('h1', {'id': 'firstHeading'})
        if not heading:
            return None, None
        page_title = heading.text

        # Main article container
        body = document.find('div', {'id': 'mw-content-text'})
        if not body:
            return page_title, ""

        for node in body.find_all(noise_tags, {'class': noise_classes}):
            node.decompose()

        # Keep only paragraphs that still contain text after stripping
        chunks = []
        for paragraph in body.find_all('p'):
            stripped = paragraph.get_text().strip()
            if stripped:
                chunks.append(stripped)

        return page_title, '\n\n'.join(chunks)
    except Exception as e:
        print(f"   Error scraping article: {e}")
        return None, None

def scrape_wikipedia_by_keywords(keywords, max_articles=5):
    """
    Search Wikipedia by keywords and scrape the matching articles.

    Progress is printed as each article is processed.

    Args:
        keywords: Search query
        max_articles: Maximum number of articles to scrape. Values below
            zero are treated as zero. The count is also bounded by the number
            of results search_wikipedia returns.

    Returns:
        List of dictionaries with 'title', 'content' and 'url' keys, one per
        successfully scraped article (empty if the search found nothing).
    """
    print(f"üîç Searching Wikipedia for: '{keywords}'")
    results = search_wikipedia(keywords)

    if not results:
        print("No results found!")
        return []

    print(f"Found {len(results)} articles\n")

    # Clamp so a negative limit cannot slice from the end of the list.
    selected = results[:max(max_articles, 0)]
    total = len(selected)

    scraped_articles = []

    for i, (title, url) in enumerate(selected, 1):
        print(f"[{i}/{total}] Scraping: {title}")

        try:
            article_title, content = scrape_wikipedia_article(url)
        except Exception as e:
            print(f"   ‚úó Error scraping {title}: {e}")
        else:
            if article_title:
                scraped_articles.append({
                    'title': article_title,
                    'content': content,
                    'url': url
                })
                print(f"   ‚úì Scraped successfully")
            else:
                print(f"   ‚úó Failed to scrape article")

        # Be nice to Wikipedia servers: pause between requests, but do not
        # waste a second after the final one.
        if i < total:
            time.sleep(1)

    print(f"\n‚úÖ Successfully scraped {len(scraped_articles)} articles!")
    return scraped_articles

In [3]:
# Demo: search Wikipedia for a topic and summarise each scraped article.
banner = "=" * 80
print(banner)
print("Wikipedia Article Scraper - Example")
print(banner)

# First demo query: climate change, limited to three articles.
print("\nüìö Example 1: Climate Change")
articles = scrape_wikipedia_by_keywords("climate change", max_articles=3)

for entry in articles:
    body = entry['content']
    print(f"\nTitle: {entry['title']}")
    print(f"URL: {entry['url']}")
    print(f"Content length: {len(body)} characters")
    print(f"Preview: {body[:200]}...")
    print("-" * 80)

Wikipedia Article Scraper - Example

üìö Example 1: Climate Change
üîç Searching Wikipedia for: 'climate change'
Found 10 articles

[1/3] Scraping: Climate change
   ‚úì Scraped successfully
[2/3] Scraping: Climate change denial
   ‚úì Scraped successfully
[3/3] Scraping: Climate change mitigation
   ‚úì Scraped successfully

‚úÖ Successfully scraped 3 articles!

Title: Climate change
URL: https://en.wikipedia.org/wiki/Climate_change
Content length: 62051 characters
Preview: Present-day climate change includes both global warming‚Äîthe ongoing increase in global average temperature‚Äîand its wider effects on Earth's climate system. Climate change in a broader sense also inclu...
--------------------------------------------------------------------------------

Title: Climate change denial
URL: https://en.wikipedia.org/wiki/Climate_change_denial
Content length: 79658 characters
Preview: Climate change denial (also global warming denial) is a form of science denial characterized by reje

In [4]:
# Pretty-print the whole `articles` list to inspect the raw scraped data.
from pprint import pprint
pprint(articles)

[{'content': 'Present-day climate change includes both global warming‚Äîthe '
             'ongoing increase in global average temperature‚Äîand its wider '
             "effects on Earth's climate system. Climate change in a broader "
             "sense also includes previous long-term changes to Earth's "
             'climate. The current rise in global temperatures is driven by '
             'human activities, especially fossil fuel (coal, oil and natural '
             'gas) burning since the Industrial Revolution. Fossil fuel use, '
             'deforestation, and some agricultural and industrial practices '
             'release greenhouse gases. These gases absorb some of the heat '
             'that the Earth radiates after it warms from sunlight, warming '
             'the lower atmosphere. Carbon dioxide, the primary gas driving '
             'global warming, has increased in concentration by about 50% '
             'since the pre-industrial era to levels not seen for

In [5]:
# Second demo query: artificial intelligence, limited to two articles.
print("\n\nüìö Example 2: Artificial Intelligence")
ai_articles = scrape_wikipedia_by_keywords("artificial intelligence", max_articles=2)

for entry in ai_articles:
    snippet = entry['content'][:150]
    print(f"\nTitle: {entry['title']}")
    print(f"Content preview: {snippet}...")
    print("-" * 80)



üìö Example 2: Artificial Intelligence
üîç Searching Wikipedia for: 'artificial intelligence'
Found 10 articles

[1/2] Scraping: Artificial intelligence
   ‚úì Scraped successfully
[2/2] Scraping: Artificial intelligence in video games
   ‚úì Scraped successfully

‚úÖ Successfully scraped 2 articles!

Title: Artificial intelligence
Content preview: Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learnin...
--------------------------------------------------------------------------------

Title: Artificial intelligence in video games
Content preview: Artificial intelligence (AI) in video games refers to the computational systems that control non-player characters (NPCs), generate dynamic game behav...
--------------------------------------------------------------------------------


# Scraping arXiv Articles

Scrape academic papers from arXiv with optional PDF downloading

In [11]:
import os
from datetime import datetime
from typing import List, Dict, Optional
import io

# Try to import PDF reader
try:
    from pypdf import PdfReader
except ImportError:
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        PdfReader = None
        print("‚ö†Ô∏è  Warning: pypdf not installed. PDF content extraction will be disabled.")
        print("   Install with: pip install pypdf")

In [12]:
def extract_pdf_content(pdf_data: bytes) -> str:
    """Return the text of a PDF supplied as raw bytes.

    Pages whose text cannot be extracted are skipped with a printed warning.
    Instead of raising, the function returns a bracketed placeholder string
    when no PDF reader is available, when no text was found, or when reading
    the PDF fails.
    """
    if PdfReader is None:
        return "[PDF content extraction unavailable - pypdf not installed]"

    try:
        reader = PdfReader(io.BytesIO(pdf_data))

        page_texts = []
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception:
                print(f"   Warning: Could not extract text from page {page_number}")
                continue
            if page_text:
                page_texts.append(page_text)

        combined = "\n\n".join(page_texts)
        if not combined.strip():
            return "[No text could be extracted from PDF]"
        return combined

    except Exception as exc:
        return f"[Error extracting PDF content: {exc}]"

In [13]:
def scrape_arxiv_articles(query="ai for climate", max_results=5, save_pdf=False, 
                         extract_content=True, output_folder="pdfs", start_date=None, 
                         end_date=None, sort_by="relevance") -> List[Dict]:
    """
    Scrape articles from arXiv based on keywords.

    Queries the arXiv API, optionally filters the results by publication
    date, and optionally downloads each PDF to save it and/or extract its
    text. Progress is printed as papers are processed.

    Args:
        query: Search keywords
        max_results: Maximum number of papers to retrieve
        save_pdf: Whether to download and save PDFs (default: False)
        extract_content: Whether to extract text content from PDFs (default: True)
        output_folder: Folder to save PDFs (only used if save_pdf=True)
        start_date: Filter papers from this date, inclusive (YYYY-MM-DD)
        end_date: Filter papers until this date, inclusive (YYYY-MM-DD)
        sort_by: Sort results by 'relevance', 'updated', or 'submitted';
            any other value leaves the API's default ordering

    Returns:
        List of dictionaries containing article information, with keys
        'title', 'abstract', 'authors', 'pdf_url', 'published', 'arxiv_id',
        'content' and 'local_path'. 'content' and 'local_path' stay None
        when not requested or when the PDF download fails. An empty list is
        returned (and the error printed) if the API request itself fails.

    Raises:
        ValueError: If start_date or end_date is not in YYYY-MM-DD format.
    """
    # Parse the date bounds once, before any network traffic, so a malformed
    # date fails fast instead of part-way through the result list.
    date_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date, date_format).date() if start_date else None
    last_day = datetime.strptime(end_date, date_format).date() if end_date else None

    # Create output folder only if saving PDFs
    if save_pdf:
        Path(output_folder).mkdir(parents=True, exist_ok=True)

    # Pass the query as params so requests URL-encodes it; characters such as
    # '&' or '#' in the keywords would otherwise corrupt the query string.
    params = {"search_query": query, "max_results": max_results}

    # Add sorting parameter
    sort_options = {
        "relevance": "relevance",
        "updated": "lastUpdatedDate",
        "submitted": "submittedDate"
    }
    if sort_by in sort_options:
        params["sortBy"] = sort_options[sort_by]
        params["sortOrder"] = "descending"

    # Query arXiv API (with a timeout so a stalled connection cannot hang forever)
    try:
        response = requests.get(
            "https://export.arxiv.org/api/query", params=params, timeout=30
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error querying arXiv: {e}")
        return []

    soup = BeautifulSoup(response.text, "xml")

    entries = soup.find_all("entry")
    print(f"Found {len(entries)} papers for query: '{query}'")

    articles = []

    for i, entry in enumerate(entries, 1):
        # Get paper details
        title = entry.title.text.strip().replace('\n', ' ')

        # Take everything after '/abs/' so old-style identifiers that contain
        # a slash (e.g. 'hep-th/9901001v1') keep their category prefix.
        entry_url = entry.id.text.strip()
        _, marker, tail = entry_url.partition('/abs/')
        paper_id = tail if marker else entry_url.split('/')[-1]

        # Get publication date
        pub_date = datetime.strptime(entry.published.text.strip(), "%Y-%m-%dT%H:%M:%SZ")
        pub_day = pub_date.date()

        # Filter by date range if specified. Compare calendar days so that a
        # paper published at any time on end_date is still included.
        if first_day is not None and pub_day < first_day:
            print(f"Skipping (before {start_date}): {title[:60]}...")
            continue

        if last_day is not None and pub_day > last_day:
            print(f"Skipping (after {end_date}): {title[:60]}...")
            continue

        # Construct PDF URL
        pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"

        # Get abstract
        abstract = entry.summary.text.strip().replace('\n', ' ') if entry.summary else ""

        # Get authors
        authors = [author.find('name').text for author in entry.find_all('author')]

        article_data = {
            "title": title,
            "abstract": abstract,
            "authors": authors,
            "pdf_url": pdf_url,
            "published": pub_date.strftime('%Y-%m-%d'),
            "arxiv_id": paper_id,
            "content": None,  # Will be populated if extract_content=True
            "local_path": None
        }

        # Download PDF if requested OR if content extraction is needed
        if save_pdf or extract_content:
            print(f"[{i}/{len(entries)}] Downloading: {title[:80]}...")
            try:
                pdf_response = requests.get(pdf_url, timeout=60)
                pdf_response.raise_for_status()
                pdf_data = pdf_response.content

                # Save to file if requested
                if save_pdf:
                    # Create safe filename
                    safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
                    safe_title = safe_title[:100]  # Limit filename length
                    filename = f"{i}_{safe_title}.pdf"
                    filepath = os.path.join(output_folder, filename)

                    with open(filepath, 'wb') as f:
                        f.write(pdf_data)

                    print(f"   ‚úì Saved to: {filepath}")
                    article_data["local_path"] = filepath

                # Extract content if requested
                if extract_content and pdf_data:
                    print(f"   Extracting content...")
                    content = extract_pdf_content(pdf_data)
                    article_data["content"] = content
                    print(f"   ‚úì Extracted {len(content)} characters")

            except Exception as e:
                # Best effort: keep the metadata even when the PDF step fails.
                print(f"   ‚úó Failed to download: {e}")
        else:
            print(f"[{i}/{len(entries)}] {title[:80]}...")

        articles.append(article_data)

    if save_pdf:
        downloaded_count = sum(1 for a in articles if a.get("local_path"))
        print(f"\n{downloaded_count}/{len(articles)} PDFs downloaded successfully!")

    if extract_content:
        # Content starting with "[" is one of extract_pdf_content's placeholder
        # messages, so it is not counted as a successful extraction.
        extracted_count = sum(1 for a in articles if a.get("content") and not a.get("content", "").startswith("["))
        print(f"‚úì Extracted content from {extracted_count}/{len(articles)} articles")

    if not save_pdf and not extract_content:
        print(f"\n{len(articles)} articles retrieved successfully!")

    return articles

## Example 1: Extract PDF Content (Without Saving Files)

In [14]:
# Fetch papers and extract their PDF text; the PDFs themselves are not saved.
heavy_rule = "=" * 80
print(heavy_rule)
print("üìö arXiv Example 1: Extract PDF Content")
print(heavy_rule)

arxiv_articles = scrape_arxiv_articles(
    query="machine learning climate change",
    max_results=2,
    save_pdf=False,       # keep PDF files off disk
    extract_content=True  # but still pull out their text
)

print("\n" + heavy_rule)
print("RESULTS WITH CONTENT:")
print(heavy_rule)

for position, paper in enumerate(arxiv_articles, 1):
    author_names = paper['authors']
    # Only the first three authors are shown; flag when more exist.
    author_suffix = " et al." if len(author_names) > 3 else ""
    full_text = paper['content']

    print(f"\n[Article {position}]")
    print(f"Title: {paper['title'][:80]}...")
    print(f"Authors: {', '.join(author_names[:3])}" + author_suffix)
    print(f"Published: {paper['published']}")
    print(f"arXiv ID: {paper['arxiv_id']}")

    print(f"\nAbstract (first 200 chars):")
    print(f"{paper['abstract'][:200]}...")

    if full_text:
        flattened = full_text[:400].replace('\n', ' ')
        print(f"\nüìÑ PDF Content (first 400 chars):")
        print(f"{flattened}...")
        print(f"\n   Total content length: {len(full_text):,} characters")
        # Rough page estimate: one page per 3000 characters.
        print(f"   Estimated pages: ~{len(full_text) // 3000}")

    print("-" * 80)

üìö arXiv Example 1: Extract PDF Content
Found 2 papers for query: 'machine learning climate change'
[1/2] Downloading: Trend and Thoughts: Understanding Climate Change Concern using Machine   Learnin...
   Extracting content...
   ‚úì Extracted 15364 characters
[2/2] Downloading: The Human Effect Requires Affect: Addressing Social-Psychological   Factors of C...
   Extracting content...
   ‚úì Extracted 17118 characters
‚úì Extracted content from 2/2 articles

RESULTS WITH CONTENT:

[Article 1]
Title: Trend and Thoughts: Understanding Climate Change Concern using Machine   Learnin...
Authors: Zhongkai Shangguan, Zihe Zheng, Lei Lin
Published: 2021-11-06
arXiv ID: 2111.14929v1

Abstract (first 200 chars):
Nowadays social media platforms such as Twitter provide a great opportunity to understand public opinion of climate change compared to traditional survey methods. In this paper, we constructed a massi...

üìÑ PDF Content (first 400 chars):
Trend and Thoughts: Understanding Climate C

## Example 2: Metadata Only (Fastest - No Download)

In [None]:
# Metadata-only run: no PDF is downloaded and no text is extracted.
heavy_rule = "=" * 80
print(heavy_rule)
print("‚ö° arXiv Example 2: Metadata Only (Fastest)")
print(heavy_rule)

metadata_only = scrape_arxiv_articles(
    query="artificial intelligence",
    max_results=3,
    save_pdf=False,
    extract_content=False  # no extraction, so no PDF download either
)

print("\n" + heavy_rule)
print("METADATA RESULTS:")
print(heavy_rule)

for position, paper in enumerate(metadata_only, 1):
    author_names = paper['authors']
    # Show at most two authors and mark the list as truncated if needed.
    more_marker = "..." if len(author_names) > 2 else ""
    has_content = paper['content'] is not None

    print(f"\n[{position}] {paper['title'][:80]}...")
    print(f"    Authors: {', '.join(author_names[:2])}" + more_marker)
    print(f"    Published: {paper['published']}")
    print(f"    PDF URL: {paper['pdf_url']}")
    print(f"    Has content extracted: {has_content}")
    print("-" * 40)

## Example 3: Filter by Date Range

In [None]:
# Date-filtered run: keep only papers published on or after the start date.
heavy_rule = "=" * 80
print(heavy_rule)
print("üìÖ arXiv Example 3: Filter by Publication Date")
print(heavy_rule)

recent_papers = scrape_arxiv_articles(
    query="deep learning",
    max_results=5,
    save_pdf=False,
    start_date="2024-01-01"  # drop anything published before 2024
)

print("\n" + heavy_rule)
print("RECENT PAPERS (2024+):")
print(heavy_rule)

if not recent_papers:
    print("No papers found matching the date criteria.")
else:
    for position, entry in enumerate(recent_papers, 1):
        author_names = entry['authors']
        more_marker = "..." if len(author_names) > 2 else ""
        print(f"\n[{position}] {entry['title'][:80]}...")
        print(f"    Published: {entry['published']}")
        print(f"    Authors: {', '.join(author_names[:2])}" + more_marker)

## Inspect Article Data Structure

In [10]:
# Show one article dictionary in full, then list the keys it carries.
from pprint import pprint

if arxiv_articles:
    heavy_rule = "=" * 80
    sample = arxiv_articles[0]

    print(heavy_rule)
    print("EXAMPLE ARTICLE DATA STRUCTURE:")
    print(heavy_rule)
    pprint(sample)

    print("\n" + heavy_rule)
    print("AVAILABLE FIELDS:")
    print(heavy_rule)
    for field_name in sample:
        print(f"  - {field_name}")

EXAMPLE ARTICLE DATA STRUCTURE:
{'abstract': 'Nowadays social media platforms such as Twitter provide a great '
             'opportunity to understand public opinion of climate change '
             'compared to traditional survey methods. In this paper, we '
             'constructed a massive climate change Twitter dataset and '
             'conducted comprehensive analysis using machine learning. By '
             'conducting topic modeling and natural language processing, we '
             'show the relationship between the number of tweets about climate '
             'change and major climate events; the common topics people '
             'discuss climate change; and the trend of sentiment. Our dataset '
             'was published on Kaggle '
             '(\\url{https://www.kaggle.com/leonshangguan/climate-change-tweets-ids-until-aug-2021}) '
             'and can be used in further research.',
 'arxiv_id': '2111.14929v1',
 'authors': ['Zhongkai Shangguan', 'Zihe Zheng', 'Le

## Combined Example: Compare Wikipedia & arXiv Data

In [None]:
# Run both scrapers on one topic and print a side-by-side summary.
topic = "artificial intelligence"
heavy_rule = "=" * 80

print(heavy_rule)
print(f"üìä Comparing Data Sources for: '{topic}'")
print(heavy_rule)

# Wikipedia side
print("\nüåê Wikipedia Articles:")
wiki_data = scrape_wikipedia_by_keywords(topic, max_articles=2)

# arXiv side
print("\n\nüìÑ arXiv Papers:")
arxiv_data = scrape_arxiv_articles(query=topic, max_results=2, save_pdf=False)

# Summary of how many items each source produced
print("\n" + heavy_rule)
print("COMPARISON:")
print(heavy_rule)
print(f"Wikipedia articles: {len(wiki_data)}")
print(f"arXiv papers: {len(arxiv_data)}")

print("\nüìù Wikipedia gives general knowledge")
if wiki_data:
    first_wiki = wiki_data[0]
    print(f"   Example: {first_wiki['title']}")
    print(f"   Content length: {len(first_wiki['content'])} characters")

print("\nüî¨ arXiv gives academic research")
if arxiv_data:
    first_paper = arxiv_data[0]
    print(f"   Example: {first_paper['title']}")
    print(f"   Authors: {', '.join(first_paper['authors'][:3])}")
    print(f"   Published: {first_paper['published']}")

In [15]:
# Extract content without saving PDFs (DEFAULT)
articles = scrape_arxiv_articles(
    query="machine learning",
    extract_content=True,  # Extract PDF text
    save_pdf=False  # Don't save files
)

# Access the extracted content. The result list can be empty, and "content"
# stays None when a paper's PDF could not be downloaded, so guard both cases
# instead of indexing blindly.
first_content = articles[0]['content'] if articles else None
if first_content:
    print(first_content[:500])  # First 500 chars
    print(f"Total: {len(first_content)} characters")
else:
    print("No extracted content available for the first article.")

Found 5 papers for query: 'machine learning'
[1/5] Downloading: Lecture Notes: Optimization for Machine Learning...
   Extracting content...
   Extracting content...
   ‚úì Extracted 159003 characters
[2/5] Downloading: An Optimal Control View of Adversarial Machine Learning...
   ‚úì Extracted 159003 characters
[2/5] Downloading: An Optimal Control View of Adversarial Machine Learning...
   Extracting content...
   Extracting content...
   ‚úì Extracted 22210 characters
[3/5] Downloading: Minimax deviation strategies for machine learning and recognition with   short l...
   ‚úì Extracted 22210 characters
[3/5] Downloading: Minimax deviation strategies for machine learning and recognition with   short l...
   Extracting content...
   Extracting content...
   ‚úì Extracted 36241 characters
[4/5] Downloading: Machine Learning for Clinical Predictive Analytics...
   ‚úì Extracted 36241 characters
[4/5] Downloading: Machine Learning for Clinical Predictive Analytics...
   Extracting conten