# Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from newspaper import Article
from datetime import datetime
from dateutil import parser
import readability
import time
import random
import re
from urllib.parse import urljoin

# Define Core Scraping Functions, Article Extraction Function, Scrape Article List Page & Execute the Scraper

In [44]:
def get_page(url):
    """Fetch webpage with proper headers and error handling"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
def extract_metadata(soup, url):
    """Enhanced metadata extraction with multiple fallback methods"""
    metadata = {
        'title': '',
        'date': '',
        'author': '',
        'image': '',
        'description': '',
        'keywords': [],
        'canonical_url': url,
        'category': ''
    }

    # 1. TITLE EXTRACTION
    title_tag = (soup.find('meta', property='og:title') or 
                soup.find('meta', {'name': 'title'}) or
                soup.find('h1'))
    metadata['title'] = (
        title_tag['content'] if title_tag and title_tag.get('content')
        else title_tag.text.strip() if title_tag
        else soup.title.string.strip() if soup.title 
        else ''
    )

    # 2. DATE EXTRACTION (multiple fallbacks)
    date_sources = [
        soup.find('meta', property='article:published_time'),
        soup.find('meta', {'name': 'date'}),
        soup.find('time'),
        soup.find('span', class_=re.compile('date|time|published', re.I)),
        soup.find('div', class_=re.compile('date|timestamp', re.I))
    ]
    
    for source in date_sources:
        if source:
            date_str = (source.get('content') or 
                       source.get('datetime') or 
                       source.text.strip())
            if date_str:
                try:
                    metadata['date'] = parser.parse(date_str).isoformat()
                    break
                except:
                    continue
    
    # URL date fallback (e.g., /2024/03/15/article-slug)
    if not metadata['date'] and re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url):
        match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
        metadata['date'] = f"{match.group(1)}-{match.group(2)}-{match.group(3)}T00:00:00"

    # 3. AUTHOR EXTRACTION (multiple fallbacks)
    author_sources = [
        soup.find('meta', property='article:author'),
        soup.find('meta', {'name': 'author'}),
        soup.find('a', class_=re.compile('author|byline', re.I)),
        soup.find('span', class_=re.compile('author|byline', re.I)),
        soup.find('div', class_=re.compile('author-info', re.I)),
        soup.find('p', class_=re.compile('author|byline', re.I))
    ]
    
    for source in author_sources:
        if source:
            metadata['author'] = (
                source.get('content', '').strip() if source.name == 'meta'
                else source.text.strip()
            )
            if metadata['author']:
                # Clean author string (remove "By " prefix, etc.)
                metadata['author'] = re.sub(r'^[Bb]y\s+', '', metadata['author']).split(',')[0].split(' and ')[0]
                break

    # 4. KEYWORDS EXTRACTION (multiple fallbacks)
    keyword_sources = [
        soup.find('meta', {'name': 'keywords'}),
        soup.find('meta', {'property': 'article:tag'}),
        soup.find('meta', {'name': 'news_keywords'}),
        soup.find('div', class_=re.compile('tags|keywords', re.I)),
        soup.find('ul', class_=re.compile('tags', re.I))
    ]
    
    for source in keyword_sources:
        if source:
            if source.name == 'meta':
                content = source.get('content', '')
                metadata['keywords'] = [k.strip() for k in content.split(',') if k.strip()]
            else:
                metadata['keywords'] = [a.text.strip() for a in source.find_all('a') if a.text.strip()]
            if metadata['keywords']:
                break
    
    # Category fallback as keyword
    if not metadata['keywords']:
        metadata['keywords'] = [metadata['category']] if metadata['category'] else []

    # 5. IMAGE EXTRACTION
    image_tag = (soup.find('meta', property='og:image') or
                soup.find('meta', {'name': 'image'}) or
                soup.find('img', class_=re.compile('featured|main', re.I)))
    metadata['image'] = (
        image_tag['content'] if image_tag and image_tag.get('content')
        else image_tag['src'] if image_tag and image_tag.get('src')
        else ''
    )

    # 6. DESCRIPTION EXTRACTION
    desc_tag = (soup.find('meta', property='og:description') or
               soup.find('meta', {'name': 'description'}) or
               soup.find('p', class_=re.compile('excerpt|summary', re.I)))
    metadata['description'] = (
        desc_tag['content'] if desc_tag and desc_tag.get('content')
        else desc_tag.text.strip() if desc_tag
        else ''
    )

    # 7. CATEGORY EXTRACTION
    category_sources = [
        soup.find('meta', property='article:section'),
        soup.find('a', class_=re.compile('category|section', re.I)),
        soup.find('span', class_=re.compile('category', re.I))
    ]
    
    for source in category_sources:
        if source:
            metadata['category'] = (
                source.get('content', '').strip() if source.name == 'meta'
                else source.text.strip()
            )
            if metadata['category']:
                break
    
    # URL category fallback (e.g., /politics/article-slug)
    if not metadata['category'] and len(url.split('/')) > 3:
        potential_category = url.split('/')[3]
        if potential_category and not re.search(r'^\d+$', potential_category):
            metadata['category'] = potential_category.capitalize()

    return metadata
def clean_text(text):
    """Clean extracted article text"""
    if not text:
        return ''
    # Remove multiple whitespaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing whitespace
    return text.strip()

In [5]:
def extract_article(url):
    """Extract complete article information"""
    print(f"Processing: {url}")
    html = get_page(url)
    if not html:
        return None
    
    # Parse with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Get metadata
    article_data = extract_metadata(soup, url)
    
    # Extract main content using readability
    doc = readability.Document(html)
    content_html = doc.summary()
    content_soup = BeautifulSoup(content_html, 'html.parser')
    article_data['content'] = clean_text(content_soup.get_text())
    
    # Extract category from URL or breadcrumbs
    article_data['category'] = extract_category(url, soup)
    
    # Estimate reading time (200 words per minute)
    word_count = len(article_data['content'].split())
    article_data['reading_time'] = max(1, round(word_count / 200))
    
    # Add processing timestamp
    article_data['scraped_at'] = datetime.now().isoformat()
    
    return article_data

def extract_category(url, soup):
    """Extract category from URL or breadcrumbs"""
    # Try breadcrumbs first
    breadcrumbs = soup.find_all('a', class_=re.compile('breadcrumb|category', re.I))
    if breadcrumbs and len(breadcrumbs) > 1:
        return breadcrumbs[-2].text.strip()
    
    # Fallback to URL path
    parts = url.split('/')
    if len(parts) > 3:
        return parts[3].capitalize()
    
    return ''

In [40]:
def scrape_news_list(base_url, num_pages=1):
    """Scrape multiple pages of news listings with robust pagination handling"""
    all_articles = []
    next_page_url = base_url  # Start with the base URL
    
    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}... Current URL: {next_page_url}")
        html = get_page(next_page_url)
        if not html:
            print(f"Failed to fetch page {page}")
            break
            
        soup = BeautifulSoup(html, 'html.parser')
        
        # 1. Find all article elements (multiple possible selectors)
        articles = (soup.find_all('article') or 
                   soup.find_all('div', class_=re.compile('article|post|item', re.I)) or
                   soup.find_all('h3', class_=re.compile('title|headline', re.I)))
        
        if not articles:
            print("No articles found on this page. Check selectors.")
            break
        
        # 2. Process each article
        for item in articles:
            link = item.find('a', href=True)
            if not link:
                continue
                
            article_url = urljoin(base_url, link['href'])
            
            # Check for duplicates
            if not any(a.get('canonical_url') == article_url for a in all_articles):
                try:
                    article_data = extract_article(article_url)
                    if article_data:
                        all_articles.append(article_data)
                        print(f"✓ Added: {article_data.get('title', 'No title')[:50]}...")
                except Exception as e:
                    print(f"Error processing {article_url}: {str(e)}")
                
                # Respectful delay between article requests
                time.sleep(random.uniform(1, 3))
        
        # 3. Find next page URL (handling multiple pagination patterns)
        next_page_url = None
        
        # Pattern 1: Standard /page/X pattern
        if f"/page/{page + 1}" in html:
            next_page_url = urljoin(base_url, f"/page/{page + 1}")
        
        # Pattern 2: "Next" button link
        next_button = (soup.find('a', class_=re.compile('next|page-next', re.I)) or
                     soup.find('a', string=re.compile('next|suivant|»', re.I)))
        if next_button and next_button.get('href'):
            next_page_url = urljoin(base_url, next_button['href'])
        
        # Pattern 3: Infinite scroll with data attributes
        if not next_page_url:
            load_more = soup.find('div', attrs={'data-page': str(page + 1)})
            if load_more:
                next_page_url = f"{base_url}?page={page + 1}"
        
        if not next_page_url:
            print(f"No next page found after page {page}")
            break
        
        # Respectful delay between page requests
        time.sleep(random.uniform(2, 4))
    
    print(f"\nCompleted scraping. Found {len(all_articles)} unique articles.")
    return all_articles

# Application of the algorithme for example usage with telquel.ma

In [15]:
base_url = "https://telquel.ma"
articles = scrape_news_list(base_url, num_pages=2)

# Convert to DataFrame
df = pd.DataFrame(articles)

# Display results
print(f"Scraped {len(df)} articles")
df.head()

Scraping page 1... Current URL: https://telquel.ma
Processing: https://telquel.ma/2025/08/05/smartphones-a-29-dirhams-larnaque-usurpant-le-logo-delectroplanet-qui-prolifere-sur-facebook_1945346
✓ Added:   Smartphones à 29 dirhams : l’arnaque usurpant le...
Processing: https://telquel.ma/categorie/maroc/societe
✓ Added:   Société...
Processing: https://telquel.ma/2025/08/05/hicham-ait-menna-fragilise-vers-un-possible-changement-de-presidence-au-wac_1945289
✓ Added:   Hicham Aït Menna fragilisé : vers un possible ch...
Processing: https://telquel.ma/categorie/sport
✓ Added:   Sport...
Processing: https://telquel.ma/2025/08/05/subvention-pour-une-ecloserie-de-palourdes-la-reponse-de-zakia-driouich-ravive-la-controverse_1945264
✓ Added:   Subvention pour une écloserie de palourdes : la ...
Processing: https://telquel.ma/categorie/maroc/politique
✓ Added:   Politique...
Processing: https://telquel.ma/instant-t/2025/08/05/la-medina-de-fes-inscrite-au-registre-de-lalecso-du-patrimoine-archite

Unnamed: 0,title,date,author,image,description,keywords,canonical_url,category,content,reading_time,scraped_at
0,Smartphones à 29 dirhams : l’arnaque usurpan...,2025-08-05T00:00:00,Amine Belghazi,https://cdn.telquel.ma/content/uploads/2025/08...,Des faux sites clonant Electroplanet inondent ...,"[arnaques, Electroplanet, facebook, fraude, Om...",https://telquel.ma/2025/08/05/smartphones-a-29...,Diplomatie,"Depuis plusieurs mois, des dizaines de pages f...",1,2025-08-05T15:50:58.234034
1,Société,,,https://cdn.telquel.ma/content/themes/telquel/...,L'actualité du Maroc tel qu'il est,[],https://telquel.ma/categorie/maroc/societe,Instant T,,1,2025-08-05T15:51:00.624521
2,Hicham Aït Menna fragilisé : vers un possibl...,2025-08-05T00:00:00,Younes Saoury,https://cdn.telquel.ma/content/uploads/2025/08...,"Un an après son élection, Hicham Aït Menna, pr...","[Al-Aïn, CAF, Coupe du monde des clubs, footba...",https://telquel.ma/2025/08/05/hicham-ait-menna...,Instant T,Le Wydad Athletic Club est à l’arrêt. Sportive...,1,2025-08-05T15:51:03.933983
3,Sport,,,https://cdn.telquel.ma/content/themes/telquel/...,L'actualité du Maroc tel qu'il est,[],https://telquel.ma/categorie/sport,Instant T,,1,2025-08-05T15:51:07.143142
4,Subvention pour une écloserie de palourdes :...,2025-08-05T00:00:00,Younes Saoury,https://cdn.telquel.ma/content/uploads/2025/08...,Zakia Driouich a défendu dans une réponse écri...,"[Aquaculture, clientélisme, écloserie de palou...",https://telquel.ma/2025/08/05/subvention-pour-...,Économie,Zakia Driouich a défendu dans une réponse écri...,1,2025-08-05T15:51:09.378114


## Save to CSV

In [18]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"news_articles_{timestamp}.csv"
df.to_csv(filename, index=False)
print(f"Saved to {filename}")

# Basic analysis
# %%
print("\nCategories distribution:")
print(df['category'].value_counts())

print("\nTop authors:")
print(df['author'].value_counts().head(5))

Saved to news_articles_20250805_155807.csv

Categories distribution:
category
Économie                    33
Instant T                   28
Conseils de Qitab            6
TelQuel English              4
Diaspora                     4
gigalab                      4
Com d'entreprise             4
Diplomatie                   2
Football                     2
Édito                        1
Contenu partenaire           1
Tourisme                     1
News tech                    1
Who’s Who DRH                1
Etudes                       1
Communication financière     1
Name: count, dtype: int64

Top authors:
author
                  31
La Rédaction      29
Telquel Impact     5
Telquel            5
Younes Saoury      4
Name: count, dtype: int64


# Save as JSON

In [48]:
import json
from datetime import datetime
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

class DateTimeEncoder(json.JSONEncoder):
    """Custom JSON encoder for datetime objects"""
    def default(self, obj):
        if isinstance(obj, (datetime, pd.Timestamp)):
            return obj.isoformat()
        elif isinstance(obj, pd.Timedelta):
            return str(obj)
        elif isinstance(obj, pd.NA):
            return None
        elif isinstance(obj, (pd.Series, pd.DataFrame)):
            return obj.to_dict()
        return super().default(obj)

def save_news_data(df, base_filename="news_articles"):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_filename = f"{base_filename}_{timestamp}.json"
    
    # Convert datetime columns to strings
    datetime_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns
    for col in datetime_cols:
        df[col] = df[col].apply(lambda x: x.isoformat() if pd.notna(x) else None)
    
    # Prepare metadata with safe date handling
    date_range = {}
    if 'date' in df.columns:
        try:
            if is_datetime64_any_dtype(df['date']):
                valid_dates = df[df['date'].notna()]
                if not valid_dates.empty:
                    date_range = {
                        "start": str(valid_dates['date'].min()),
                        "end": str(valid_dates['date'].max())
                    }
        except Exception as e:
            print(f"Warning: Could not process date range - {str(e)}")
    
    output_data = {
        "metadata": {
            "source": "Web Scraper",
            "version": "1.0",
            "export_date": datetime.now().isoformat(),
            "article_count": len(df),
            "columns": list(df.columns),
            "date_range": date_range
        },
        "articles": json.loads(df.to_json(orient='records', date_format='iso'))
    }
    
    # Save with custom encoder
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, 
                 ensure_ascii=False, 
                 indent=2,
                 cls=DateTimeEncoder)
    
    print(f" Successfully saved {len(df)} articles to {json_filename}")
    return json_filename

def print_analysis(df):
    """Safe analysis printing with robust date handling"""
    print("\n Data Analysis Summary")
    print("=" * 40)
    
    # Categories
    if 'category' in df.columns:
        print("\n Categories distribution:")
        print(df['category'].value_counts(dropna=False).head(10))
    
    # Authors
    if 'author' in df.columns:
        print("\n Top Authors:")
        print(df['author'].value_counts(dropna=False).head(5))
    
    # Dates
    if 'date' in df.columns:
        try:
            # Ensure we have datetime type
            if not is_datetime64_any_dtype(df['date']):
                df['date'] = pd.to_datetime(df['date'], errors='coerce')
            
            valid_dates = df[df['date'].notna()]
            if not valid_dates.empty:
                print("\n Date Range:")
                print(f"Earliest: {valid_dates['date'].min()}")
                print(f"Latest: {valid_dates['date'].max()}")
                
                print("\n Articles by Month:")
                monthly = valid_dates['date'].dt.to_period('M').value_counts().sort_index()
                print(monthly.head(12))
            else:
                print("\n No valid dates found")
        except Exception as e:
            print(f"\n Date analysis failed: {str(e)}")
    
    print("=" * 40)

# Usage example:
if __name__ == "__main__":
    json_file = save_news_data(df)
    print_analysis(df)

 Successfully saved 94 articles to news_articles_20250805_165308.json

 Data Analysis Summary

 Categories distribution:
category
Économie             33
Instant T            28
Conseils de Qitab     6
TelQuel English       4
Diaspora              4
gigalab               4
Com d'entreprise      4
Diplomatie            2
Football              2
Édito                 1
Name: count, dtype: int64

 Top Authors:
author
                  31
La Rédaction      29
Telquel Impact     5
Telquel            5
Younes Saoury      4
Name: count, dtype: int64

 Date Range:
Earliest: 2023-04-27 00:00:00
Latest: 2025-08-05 00:00:00

 Articles by Month:
date
2023-04     1
2023-07     1
2023-11     1
2025-05     1
2025-06     4
2025-07    26
2025-08    32
Freq: M, Name: count, dtype: int64


# Remark 
For JavaScript-heavy sites (requires selenium)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def js_scrape(url):
    """Use when sites require JavaScript rendering"""
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    html = driver.page_source
    driver.quit()
    return html