In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import re

def get_article_content(url):
    # Send a GET request to the article URL
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the content of the request with BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Extract the main article content
        article_body = soup.find('article')
        if article_body:
            paragraphs = article_body.find_all('p')
        else:
            # Fallback to a different class if the article tag is not found
            article_body = soup.find('div', class_='ssrcss-uf6wea-RichTextComponentWrapper e1xue1i86')
            if article_body:
                paragraphs = article_body.find_all('p')
            else:
                return "Failed to retrieve the article content.", "Unknown", "Unknown", "Unknown"
        
        # Join the paragraph texts into a single string
        article_text = ' '.join([paragraph.get_text() for paragraph in paragraphs])
        
        # Extract additional metadata
        author_tag = soup.find('span', {'data-testid': 'byline-name'})
        if not author_tag:
            author_tag = soup.find('div', class_='ssrcss-68pt20-Text-TextContributorName e8mq1e96')
        author = author_tag.get_text().replace('By', '').strip() if author_tag else 'Unknown'
        
        time_tag = soup.find('time')
        if time_tag:
            time_text = time_tag.get_text()
            pub_date = calculate_pub_date(time_text)
        else:
            pub_date = 'Unknown'
        
        section_tag = soup.find('a', class_='ssrcss-14gqoev-InSectionLink')
        section = section_tag.get_text() if section_tag else 'Unknown'
        
        return article_text, author, pub_date, section
    else:
        return "Failed to retrieve the article content.", "Unknown", "Unknown", "Unknown"

def calculate_pub_date(relative_time):
    # Get the current time
    current_time = datetime.now()
    # Match the relative time format (e.g., "3 hours ago")
    match = re.match(r'(\d+)\s*(\w+)', relative_time)
    if match:
        quantity = int(match.group(1))
        unit = match.group(2)
        # Subtract the relative time from the current time
        if 'hour' in unit:
            pub_date = current_time - timedelta(hours=quantity)
        elif 'minute' in unit:
            pub_date = current_time - timedelta(minutes=quantity)
        elif 'second' in unit:
            pub_date = current_time - timedelta(seconds=quantity)
        elif 'day' in unit:
            pub_date = current_time - timedelta(days=quantity)
        else:
            pub_date = current_time
        return pub_date.strftime('%Y-%m-%d %H:%M:%S')
    return 'Unknown'

# URL of the BBC News website
url = "https://www.bbc.com/news"

# Send a GET request to the URL
response = requests.get(url)
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all headlines with the specific data-testid
    headlines = soup.find_all('h2', {'data-testid': 'card-headline'})
    
    # Prepare a list to hold the data
    data = []
    
    if headlines:
        # Iterate through each headline and extract the article content
        for i, headline in enumerate(headlines, 1):
            headline_text = headline.get_text().strip()
            article_url = headline.find_parent('a')['href']
            # Ensure the URL is absolute
            if not article_url.startswith('http'):
                article_url = 'https://www.bbc.com' + article_url
            
            # Get the article content and metadata
            article_content, author, pub_date, section = get_article_content(article_url)
            # Get the current timestamp
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Append the data to the list
            data.append([headline_text, article_content, timestamp, article_url, author, pub_date, section])
    else:
        print("No headlines found. Here's the HTML content for debugging:")
        print(soup.prettify())
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

# Create a DataFrame with the scraped data
df = pd.DataFrame(data, columns=["headline", "content", "timestamp", "url", "author", "publication_date", "section"])

# Display the DataFrame
df


Unnamed: 0,headline,content,timestamp,url,author,publication_date,section
0,Biden says he 'screwed up' debate but vows to ...,"US President Joe Biden has admitted he ""screwe...",2024-07-04 12:44:08,https://www.bbc.com/news/articles/crgrwgnvqgvo,"Gareth Evans, Courtney Subramanian and Kayla E...",2024-07-04 11:53:08,Unknown
1,Hamas faces growing public dissent as Gaza war...,"The man in the video is beside himself, a mask...",2024-07-04 12:44:08,https://www.bbc.com/news/articles/c0vewvp14zdo,"Lucy Williamson & Rushdi Aboualouf,",2024-07-04 01:44:08,Unknown
2,Many Jamaicans without power after Hurricane B...,Hundreds of thousands of homes in Jamaica are ...,2024-07-04 12:44:08,https://www.bbc.com/news/articles/ckdg7rp7vk9o,"Nick Davis,",2024-07-04 12:09:08,Unknown
3,India preacher denies blame for crush deaths,The preacher who led an overcrowded gathering ...,2024-07-04 12:44:08,https://www.bbc.com/news/articles/c2lk5q27jd4o,"Anbarasan Ethirajan,",2024-07-04 11:44:08,Unknown
4,An iconic wildlife park has banned koala cuddl...,"For what seems like time immemorial, giving a ...",2024-07-04 12:44:08,https://www.bbc.com/news/articles/c3ge1vn783eo,"Tiffanie Turnbull,",2024-07-04 10:44:08,Unknown
...,...,...,...,...,...,...,...
56,Anderson's record-breaking career - in his own...,James Anderson has taken 700 Test wickets - mo...,2024-07-04 12:44:12,https://www.bbc.com/sport/cricket/articles/c3g...,Unknown,2024-07-04 07:44:12,Unknown
57,Man Utd manager Ten Hag signs contract extensi...,This video can not be played 'Where's this Man...,2024-07-04 12:44:12,https://www.bbc.com/sport/football/live/cyr7k0...,Unknown,Unknown,Unknown
58,Ten Hag signs new Manchester United deal until...,Erik ten Hag was appointed Manchester United m...,2024-07-04 12:44:12,https://www.bbc.com/sport/football/articles/cd...,Unknown,2024-07-04 12:01:12,Unknown
59,Foden 'feels sorry' for under-pressure Southgate,This video can not be played 'We all want it' ...,2024-07-04 12:44:12,https://www.bbc.com/sport/football/articles/cg...,Unknown,2024-07-04 12:44:12,Unknown


In [39]:
# Check for incomplete data
# Filter rows where the headline or content retrieval failed
failed_rows = df[(df['headline'] == "") | (df['content'] == "Failed to retrieve the article content.")]

# Display the filtered rows
failed_rows

Unnamed: 0,headline,content,timestamp,url,author,publication_date,section
7,How BBC's poll guru John Curtice prepares for ...,Failed to retrieve the article content.,2024-07-04 12:44:09,https://www.bbc.com/news/videos/c99wl9e2d0zo,Unknown,Unknown,Unknown
11,Watch: Mount Etna spits lava into the night sky,Failed to retrieve the article content.,2024-07-04 12:44:09,https://www.bbc.com/news/videos/cw0yjgwzldyo,Unknown,Unknown,Unknown
16,When will result be known and other key questions,Failed to retrieve the article content.,2024-07-04 12:44:09,https://www.bbc.com/news/videos/c6p23pl0j0zo,Unknown,Unknown,Unknown
28,Watch: Mount Etna spits lava into the night sky,Failed to retrieve the article content.,2024-07-04 12:44:10,https://www.bbc.com/news/videos/cw0yjgwzldyo,Unknown,Unknown,Unknown
29,How does Sir John Curtice prepare for election...,Failed to retrieve the article content.,2024-07-04 12:44:10,https://www.bbc.com/news/videos/c99wl9e2d0zo,Unknown,Unknown,Unknown
30,'When will we get the result?' and other key q...,Failed to retrieve the article content.,2024-07-04 12:44:10,https://www.bbc.com/news/videos/c6p23pl0j0zo,Unknown,Unknown,Unknown
31,Road rage ramming incident caught on CCTV,Failed to retrieve the article content.,2024-07-04 12:44:10,https://www.bbc.com/news/videos/cg64z7wx6wyo,Unknown,Unknown,Unknown
32,What is the polling day weather forecast?,Failed to retrieve the article content.,2024-07-04 12:44:10,https://www.bbc.com/news/videos/ckkg3g07gn5o,Unknown,Unknown,Unknown
35,Blazing wildfires force evacuations in California,Failed to retrieve the article content.,2024-07-04 12:44:11,https://www.bbc.com/news/videos/cllyzngqp0go,Unknown,Unknown,Unknown
41,Blazing wildfires force evacuations in California,Failed to retrieve the article content.,2024-07-04 12:44:11,https://www.bbc.com/news/videos/cllyzngqp0go,Unknown,Unknown,Unknown


In [35]:
existing_data = pd.read_csv("articles.csv")
# filter out incomplete data
filtered_df = df[~((df['headline'] == "") | (df['content'] == "Failed to retrieve the article content."))]
new_data = pd.concat([existing_data, filtered_df], ignore_index=True, ).drop(columns=['Unnamed: 0'])
new_data


Unnamed: 0,headline,content,timestamp,url,author,publication_date,section
0,Biden says he 'screwed up' debate but vows to ...,"US President Joe Biden has admitted he ""screwe...",2024-07-04 12:44:08,https://www.bbc.com/news/articles/crgrwgnvqgvo,"Gareth Evans, Courtney Subramanian and Kayla E...",2024-07-04 11:53:08,Unknown
1,Hamas faces growing public dissent as Gaza war...,"The man in the video is beside himself, a mask...",2024-07-04 12:44:08,https://www.bbc.com/news/articles/c0vewvp14zdo,"Lucy Williamson & Rushdi Aboualouf,",2024-07-04 01:44:08,Unknown
2,Many Jamaicans without power after Hurricane B...,Hundreds of thousands of homes in Jamaica are ...,2024-07-04 12:44:08,https://www.bbc.com/news/articles/ckdg7rp7vk9o,"Nick Davis,",2024-07-04 12:09:08,Unknown
3,India preacher denies blame for crush deaths,The preacher who led an overcrowded gathering ...,2024-07-04 12:44:08,https://www.bbc.com/news/articles/c2lk5q27jd4o,"Anbarasan Ethirajan,",2024-07-04 11:44:08,Unknown
4,An iconic wildlife park has banned koala cuddl...,"For what seems like time immemorial, giving a ...",2024-07-04 12:44:08,https://www.bbc.com/news/articles/c3ge1vn783eo,"Tiffanie Turnbull,",2024-07-04 10:44:08,Unknown
...,...,...,...,...,...,...,...
117,Anderson's record-breaking career - in his own...,James Anderson has taken 700 Test wickets - mo...,2024-07-04 12:44:12,https://www.bbc.com/sport/cricket/articles/c3g...,Unknown,2024-07-04 07:44:12,Unknown
118,Man Utd manager Ten Hag signs contract extensi...,This video can not be played 'Where's this Man...,2024-07-04 12:44:12,https://www.bbc.com/sport/football/live/cyr7k0...,Unknown,Unknown,Unknown
119,Ten Hag signs new Manchester United deal until...,Erik ten Hag was appointed Manchester United m...,2024-07-04 12:44:12,https://www.bbc.com/sport/football/articles/cd...,Unknown,2024-07-04 12:01:12,Unknown
120,Foden 'feels sorry' for under-pressure Southgate,This video can not be played 'We all want it' ...,2024-07-04 12:44:12,https://www.bbc.com/sport/football/articles/cg...,Unknown,2024-07-04 12:44:12,Unknown


In [28]:
new_data.to_csv("articles.csv")