In [7]:
import requests
import time
import csv
import os
from bs4 import BeautifulSoup
import pandas as pd

In [8]:
def read_urls(file_path):
    """Read URLs from a text file, one URL per line.

    Leading/trailing whitespace is removed from every line and lines that
    are empty after stripping are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' decodes ordinary UTF-8 and additionally drops a leading
    # byte-order mark. Without it a BOM stays attached to the first URL
    # (str.strip() does not treat U+FEFF as whitespace) and the decoding
    # would otherwise depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]

def scrape_article(url, timeout=10):
    """Scrape one article page and return its text and meta description.

    The text of the page's <title> and of every <p>, <h2>, <h3> and <h4>
    found inside the first <article> element are combined into one string.
    The value of <meta name="description"> is returned separately and is not
    part of that string.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled connection would block the whole run.

    Returns:
        A dict with the keys "id" (the URL), "article" and "description",
        or None if downloading or parsing failed (the error is printed).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the title
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag is not None else 'Title not found'

        # Extract the description BEFORE the article subtree is modified:
        # the clean-up below removes description <meta> tags located inside
        # <article>, which would otherwise make this lookup miss them.
        # .get() avoids a KeyError (and thus losing the whole article) when
        # the tag has no "content" attribute.
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag is not None:
            description = description_tag.get('content', 'No description found')
        else:
            description = 'No description found'

        # Find the article content, excluding the description
        article_tag = soup.find('article')
        if article_tag is not None:
            # Remove potential description elements from the article content
            for meta in article_tag.find_all('meta', attrs={'name': 'description'}):
                meta.decompose()

            # Collect all paragraphs and header tags within the article tag
            text_nodes = article_tag.find_all(['p', 'h2', 'h3', 'h4'])
            content = '\n'.join(node.get_text(strip=True) for node in text_nodes)
        else:
            content = 'Article content not found'

        # Combine title and content into one full article
        full_article = f"{title}\n\n{content}"

        return {"id": url, "article": full_article, "description": description}
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole batch,
        # so the failure is reported and signalled to the caller with None.
        print(f"Failed to scrape {url}: {str(e)}")
        return None

def write_to_csv(data, csv_file, output_dir='output_data'):
    """Write a list of dictionaries to a UTF-8 CSV file.

    The file always has the columns "id", "article" and "description", in
    that order, preceded by a header row. An existing file with the same
    name is overwritten.

    Args:
        data: Iterable of dicts keyed by the column names above.
        csv_file: Name of the CSV file to create inside ``output_dir``.
        output_dir: Directory that receives the file; it is created
            (including missing parents) when it does not exist yet.

    Raises:
        ValueError: If a dict contains a key that is not one of the columns
            (default ``csv.DictWriter`` behaviour).
        OSError: If the directory or file cannot be created.
    """
    # exist_ok=True replaces the separate "exists?" check, which could fail
    # if the directory appeared between the check and the creation.
    # An empty output_dir means "current directory": nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    csv_path = os.path.join(output_dir, csv_file)
    # newline='' lets the csv module control line endings itself, so
    # multi-line article text is quoted and written correctly.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        fieldnames = ["id", "article", "description"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def main():
    """Run the whole pipeline: read URLs, scrape them, save and show a CSV.

    URLs come from '../data/url_leparisien.txt'. Every URL is scraped with
    scrape_article(); failed scrapes (falsy results) are dropped. The
    collected rows are written to 'output_data/scraped_data.csv', which is
    then loaded back with pandas and printed.
    """
    source_path = '../data/url_leparisien.txt'
    output_name = 'scraped_data.csv'

    scraped = []
    for link in read_urls(source_path):
        record = scrape_article(link)
        if record:
            scraped.append(record)
        # Short pause after every request, successful or not.
        time.sleep(0.1)

    write_to_csv(scraped, output_name)

    # Reload what was just written so the printed table reflects the file.
    saved_path = os.path.join('output_data', output_name)
    frame = pd.read_csv(saved_path)
    print(frame)

In [9]:
# Entry point: run the scrape-and-save pipeline defined in main() when this
# code executes as the main module.
if __name__ == "__main__":
    main()


                                                   id  \
0   https://www.leparisien.fr/jo-paris-2024/tennis...   
1   https://www.leparisien.fr/jo-paris-2024/jo-par...   
2   https://www.leparisien.fr/sports/pour-le-kyks-...   
3   https://www.leparisien.fr/jo-paris-2024/tennis...   
4   https://www.leparisien.fr/jo-paris-2024/jo-par...   
5   https://www.leparisien.fr/jo-paris-2024/jo-par...   
6   https://www.leparisien.fr/jo-paris-2024/jo-par...   
7   https://www.leparisien.fr/jo-paris-2024/le-rel...   
8   https://www.leparisien.fr/jo-paris-2024/il-sag...   
9   https://www.leparisien.fr/essonne-91/les-ulis-...   
10  https://www.leparisien.fr/essonne-91/les-ulis-...   

                                              article  \
0   JO Paris 2024 : les pongistes Alexis Lebrun et...   
1   JO Paris 2024 : Marchand, Mayer, Mossely, Rine...   
2   « Pour le Kyks » : l’inattendue célébration du...   
3   JO Paris 2024 : « Deux champions qui n’ont pas...   
4   JO Paris 2024 : « On peut 