In [None]:
# !pip install beautifulsoup4
# !pip install requests
# !pip install pandas

### Importing libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os


### Establishing Reuters Home page as the url to scrape:

In [None]:
# Base address of the site; the scraping functions below prepend it to the
# relative links they collect.
url = "https://www.reuters.com"

In [None]:
# Download and parse the home page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the notebook
# here instead of silently parsing an HTTP error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

### Scrape Categories:

The getCategories() function uses the BeautifulSoup library to extract all the categories available on the Reuters website. It extracts the category name and its corresponding link, and stores them in a list called categoriesLinks.


In [None]:
def getCategories(url, pageSoup=None):
    """Return the navigation-bar categories of the page at ``url``.

    Args:
        url: Address of the page whose navigation bar is scraped.
        pageSoup: Optional, already parsed page (any object exposing
            BeautifulSoup's ``find_all``). When omitted, the page is downloaded
            from ``url`` so the function does not depend on a module-level
            ``soup`` variable created by another cell.

    Returns:
        A list of ``{"category": <stripped link text>, "link": <href>}`` dicts,
        one per navigation link. Anchors without an ``href`` are skipped.

    Raises:
        requests.RequestException: if the page has to be downloaded and the
            request fails, times out, or returns an HTTP error status.
    """
    if pageSoup is None:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        pageSoup = BeautifulSoup(response.content, "html.parser")
    # NOTE(review): these class names look build-generated; confirm they still
    # match the live site when the function starts returning an empty list.
    anchors = pageSoup.find_all("a", {
        "class": "text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__default__UPMUu nav-bar__link__3mja8"
    })
    return [
        {"category": anchor.text.strip(), "link": anchor["href"]}
        for anchor in anchors
        if anchor.get("href") is not None
    ]

In [None]:
# Collect the navigation-bar categories (name and link) of the home page.
categories = getCategories(url)

### Scrape Topics and Menus:

The getTopicsAndMenus() function scrapes news topics and their menus from each category. For each category, the function first appends the category link to the main Reuters website link. It then requests the category link, scrapes the HTML content of the page, and extracts all the news topics available. It then loops through each topic and appends it to a pandas DataFrame called df as a new row.

In [None]:
def getTopicsAndMenus(url,categoriesLinks, df):
    """Scrape the topic buttons of every category page into a DataFrame.

    Args:
        url: Base address that is prepended to relative category links and to
            each topic's ``data-id`` to build ``topicLink``.
        categoriesLinks: List of ``{"category": ..., "link": ...}`` dicts.
            Each dict is updated in place: ``"link"`` becomes absolute and
            ``"topics"`` receives the topic names scraped for that category.
        df: DataFrame with the columns ``Category``, ``CategoryLink``,
            ``Menus`` and ``Topics``. It is not modified.

    Returns:
        A new DataFrame holding the rows of ``df`` followed by one row per
        topic button, plus a ``topicLink`` column equal to ``url + Menus``.

    Raises:
        requests.RequestException: if a category page cannot be downloaded
            or the request times out.
    """
    records = []
    for category in categoriesLinks:
        # Only prefix relative links, so re-running the cell on the same list
        # does not produce "url + url + link".
        if not category["link"].startswith(("http://", "https://")):
            category["link"] = url+category["link"]
        request = requests.get(category["link"], timeout=30)
        categorySoup = BeautifulSoup(request.content, "html.parser")
        buttons = categorySoup.find_all(
            "button",
            {"class":"button__button__2Ecqi button__secondary__18moI button__pill__2LA8V text-button__container__3q3zX"}
            )
        category["topics"] = []
        for button in buttons:
            topic = button.find(
                "span",
                {"class":"text__text__1FZLe text__inherit-color__3208F text__bold__2-8Kc text__default__UPMUu text-button__medium__113uZ"}
                )
            menu = button.get("data-id")
            # A button without a label or a data-id cannot be turned into a
            # topic link, so skip it instead of raising.
            if topic is None or menu is None:
                continue
            topicName = topic.text.strip()
            category["topics"].append(topicName)
            records.append({
                "Category": category["category"],
                "CategoryLink": category["link"],
                "Menus": menu,
                "Topics": topicName,
            })
    # Build the new rows once instead of concatenating inside the loop.
    if records:
        df = pd.concat([df, pd.DataFrame(records)], ignore_index=True)
    else:
        # Copy so that adding the column below never touches the caller's frame.
        df = df.copy()
    df["topicLink"] = url+df["Menus"]
    return df

In [None]:
# Start from an empty frame with the expected columns, then let
# getTopicsAndMenus() return it filled with the scraped category/topic rows.
df = pd.DataFrame(columns=["Category","CategoryLink", "Menus", "Topics"])
df = getTopicsAndMenus(url,categories, df)

### Scrape Article Links, Titles, Categories, and Topics:

The getArtciles() function (spelled this way in the code) scrapes article links, titles, and topics. For each topic link produced by getTopicsAndMenus(), it reads every story card on the page and extracts the article link, the title, and the topic label shown on the card. It then outer-merges these records into the df pandas DataFrame on the Topics column, which adds the Title and ArticleLink columns.

In [None]:
def getArtciles(df,url):
    """Scrape the story cards of every topic page and merge them into ``df``.

    Args:
        df: DataFrame with at least the columns ``topicLink`` (pages to
            download) and ``Topics`` (merge key).
        url: Base address that is prepended to each card's relative link.

    Returns:
        The outer merge of ``df`` with the scraped records on ``Topics``.
        The scraped records contribute the columns ``Title`` and
        ``ArticleLink``. If nothing was scraped, the merge still succeeds and
        those columns are simply empty.

    Raises:
        requests.RequestException: if a topic page cannot be downloaded or
            the request times out.
    """
    data = []
    # Skip missing links and download each distinct topic page only once.
    for topicLink in df["topicLink"].dropna().unique():
        request = requests.get(topicLink, timeout=30)
        topicSoup = BeautifulSoup(request.content, "html.parser")
        topicCards = topicSoup.find_all(
            "div",
            {"class":"media-story-card__hub__3mHOR story-card"}
        )
        for card in topicCards:
            topicAnchor = card.find("a",{
                "class" : "text__text__1FZLe text__inherit-color__3208F text__inherit-font__1Y8w3 text__inherit-size__1DZJi link__underline_on_hover__2zGL4"
            })
            titleAnchor = card.find("a",{
                "class":"text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__heading_5_and_half__3YluN heading__base__2T28j heading_5_half media-story-card__heading__eqhp9"
            })
            # A card without the headline anchor carries neither a title nor a
            # link, so there is nothing to record.
            if titleAnchor is None:
                continue
            titleSpan = titleAnchor.find("span")
            href = titleAnchor.get("href")
            data.append({
                "Title": titleSpan.text.strip() if titleSpan is not None else None,
                "ArticleLink": (url+href) if href is not None else None,
                # next(..., None) avoids StopIteration on an anchor with no text.
                "Topics": next(topicAnchor.stripped_strings, None) if topicAnchor is not None else None
            })
    # Naming the columns keeps the merge key present even when no card was found.
    data = pd.DataFrame(data, columns=["Title", "ArticleLink", "Topics"])
    df = pd.merge(df, data, on=['Topics'], how='outer')
    return df

In [None]:
# Add the articles (title and link) found on each topic page to the frame.
df = getArtciles(df,url)

### Clean the Data:

The tempDf DataFrame is used to extract website, category, and topic information from the article links. It then updates the df DataFrame by replacing null values with the extracted information.

In [None]:
# Split every article URL on "/" and keep piece 2 (the host) and pieces 3 and 4
# (the first two path segments) as 'website', 'category' and 'topic'.
tempDf = pd.DataFrame()
tempDf[['website', 'category', 'topic']] = df['ArticleLink'].str.split('/', expand=True)[[2,3,4]]
# Rows in which all four category/menu columns are missing.
# NOTE(review): presumably these are the rows the outer merge in getArtciles()
# added for cards whose topic label matched no scraped topic - confirm.
mask = df[['Category', 'CategoryLink', 'Menus', 'topicLink']].isna().all(axis=1)
# Rebuild the missing values from the URL pieces; the assignments align on the
# row index. 'Menus' has to be filled first because the 'topicLink' line reads
# it back.
df.loc[mask, 'Menus'] = '/' + tempDf['category'] + '/'+ tempDf['topic'] +'/'
df.loc[mask, 'Category'] = tempDf['category']
df.loc[mask, 'CategoryLink'] = 'https://'+tempDf["website"]+"/"+tempDf["category"]+"/"
df.loc[mask, 'topicLink'] = 'https://'+tempDf["website"]+df[mask]['Menus']

In [None]:
# Keep only the rows that actually point at an article.
df = df.dropna(subset=['ArticleLink'])

### Scrape Article Details:

The getArticleDetails() function scrapes the article details for each article link in the df DataFrame. It extracts the article image, article text, authors, and date.

In [None]:
def getArticleDetails(df):
    """Download every article in ``df`` and merge its details into the frame.

    Args:
        df: DataFrame with an ``ArticleLink`` column. Missing links are
            ignored and each distinct link is downloaded once.

    Returns:
        The outer merge of ``df`` with one detail record per distinct link,
        joined on ``ArticleLink``. The detail records carry the columns
        ``Title``, ``Author``, ``Date``, ``Time``, ``Image`` and ``Article``.
        Any element that is missing from a page yields ``None`` (``""`` for
        ``Article``). When ``df`` already has a ``Title`` column, pandas'
        default merge suffixes apply (``Title_x`` / ``Title_y``).

    Raises:
        requests.RequestException: if an article page cannot be downloaded
            or the request times out.
    """
    data = []
    # Deduplicate the links: the same article can appear in several rows, and
    # one detail record per row would both repeat the download and multiply
    # the rows in the merge below.
    for articleLink in df['ArticleLink'].dropna().unique():
        request = requests.get(articleLink, timeout=30)
        articleSoup = BeautifulSoup(request.content, "html.parser")
        imageContainer = articleSoup.find("div",{
            "class":"styles__image-container__skIG1 styles__fill__3xCr1 styles__center_center__1AaPV styles__apply-ratio__1_FYQ"
        })
        timeContainer = articleSoup.find("time",{
            "class":"text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__extra_small__1Mw6v article-header__dateline__4jE04"
        })
        title = articleSoup.find("div",{
            "class":"article-header__heading__15OpQ"
        })
        author = articleSoup.find("a",{
            "class":"author-name__author__1gx5k"
        })
        articleBodyContainer = articleSoup.find("div",{
            "class":"article-body__content__17Yit paywall-article"
        })

        # Every paragraph is followed by a newline, including the last one.
        paragraphs = articleBodyContainer.find_all("p") if articleBodyContainer is not None else []
        wholeBody = "".join(paragraph.text+"\n" for paragraph in paragraphs)

        # The second and third <span> of the dateline are used as date and
        # time; guard the indexes so a shorter dateline gives None.
        timeSpans = timeContainer.find_all("span") if timeContainer is not None else []
        date = timeSpans[1].text if len(timeSpans) > 1 else None
        publishedTime = timeSpans[2].text if len(timeSpans) > 2 else None

        imageTag = imageContainer.find("img") if imageContainer is not None else None
        heading = title.find("h1") if title is not None else None
        data.append({
            "ArticleLink":articleLink,
            "Title":heading.text if heading is not None else None,
            "Author":author.text if author is not None else None,
            "Date":date,
            "Time":publishedTime,
            "Image":imageTag.get("src") if imageTag is not None else None,
            "Article":wholeBody
            })
    # Naming the columns keeps the merge key present even when no article was
    # downloaded.
    data = pd.DataFrame(
        data,
        columns=["ArticleLink", "Title", "Author", "Date", "Time", "Image", "Article"],
    )
    df = pd.merge(df, data, on=['ArticleLink'], how='outer')
    return df

In [None]:
# Fetch every article page and merge its details (title, author, date, time,
# image, body text) into the frame.
df = getArticleDetails(df)

In [None]:
# Back-fill missing topic labels with the fifth "/"-separated piece of the
# topic link; the assignment aligns on the row index.
tempDf = pd.DataFrame({'topic': df['topicLink'].str.split('/', expand=True)[4]})
mask = df['Topics'].isna()
df.loc[mask, 'Topics'] = tempDf['topic']

In [None]:
# Load the previously saved scrape. The CSV is only written by the last cell,
# so on the first run it does not exist yet; fall back to an empty frame so
# the concatenation in the next cell still works.
if os.path.exists("./reutersData.csv"):
    data = pd.read_csv("./reutersData.csv")
else:
    data = pd.DataFrame()

In [None]:
# Stack the stored rows under the fresh scrape and drop rows that are exact
# duplicates across all columns.
df_concat = pd.concat([df, data]).drop_duplicates()
df = df_concat

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
df = df.reset_index(drop=True)

In [None]:
# Save the combined data set; index=False keeps the row index out of the CSV.
df.to_csv('./reutersData.csv', index=False)