In [329]:
# !pip install beautifulsoup4
# !pip install selenium
# !pip install requests
# !pip install pandas

### Importing libraries

In [330]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os


### Establishing Reuters Home page as the url to scrape:

In [331]:
url = "https://www.reuters.com"

In [332]:
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

In [333]:
def getCategories(url):
    categories = soup.find_all("a",{
        "class":"text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__default__UPMUu nav-bar__link__3mja8"
    })
    categoriesLinks =  []
    for category in categories:
        categoriesLinks.append({
            "category":category.text.strip(),
            "link":category['href']
        })
    return categoriesLinks

In [334]:
categories = getCategories(url)
categories

[{'category': 'World', 'link': '/world/'},
 {'category': 'Business', 'link': '/business/'},
 {'category': 'Legal', 'link': '/legal/'},
 {'category': 'Markets', 'link': '/markets/'},
 {'category': 'Technology', 'link': '/technology/'}]

In [335]:
def getTopicsAndMenus(url,categoriesLinks, df):
    for category in categoriesLinks:
        category["link"] = url+category["link"]
        request = requests.get(category["link"])
        categorySoup= BeautifulSoup(request.content, "html.parser")
        buttons = categorySoup.find_all(
            "button",
            {"class":"button__button__2Ecqi button__secondary__18moI button__pill__2LA8V text-button__container__3q3zX"}
            )
        category["topics"] = []
        for button in buttons: 
            topic = button.find(
                "span",
                {"class":"text__text__1FZLe text__inherit-color__3208F text__bold__2-8Kc text__default__UPMUu text-button__medium__113uZ"}
                )
            df = pd.concat([df,pd.DataFrame([{"Category": category["category"],"CategoryLink": category["link"], "Menus": button["data-id"], "Topics": topic.text.strip()}])],ignore_index=True)
    df["topicLink"] = url+df["Menus"]
    return df

In [336]:
df = pd.DataFrame(columns=["Category","CategoryLink", "Menus", "Topics"])
df = getTopicsAndMenus(url,categories, df)
df

Unnamed: 0,Category,CategoryLink,Menus,Topics,topicLink
0,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/
1,World,https://www.reuters.com/world/,/world/americas/,Americas,https://www.reuters.com/world/americas/
2,World,https://www.reuters.com/world/,/world/asia-pacific/,Asia Pacific,https://www.reuters.com/world/asia-pacific/
3,World,https://www.reuters.com/world/,/world/china/,China,https://www.reuters.com/world/china/
4,World,https://www.reuters.com/world/,/world/europe/,Europe,https://www.reuters.com/world/europe/
5,World,https://www.reuters.com/world/,/world/india/,India,https://www.reuters.com/world/india/
6,World,https://www.reuters.com/world/,/world/middle-east/,Middle East,https://www.reuters.com/world/middle-east/
7,World,https://www.reuters.com/world/,/world/uk/,United Kingdom,https://www.reuters.com/world/uk/
8,World,https://www.reuters.com/world/,/world/us/,United States,https://www.reuters.com/world/us/
9,World,https://www.reuters.com/world/,/world/reuters-next/,Reuters Next,https://www.reuters.com/world/reuters-next/


In [337]:
def getArtciles(df,url):
    data = []
    for topicLink in df["topicLink"]:
        request = requests.get(topicLink)
        topicSoup= BeautifulSoup(request.content, "html.parser")
        topicCards = topicSoup.find_all(
            "div",
            {"class":"media-story-card__hub__3mHOR story-card"}
        )
        for card in topicCards:
            categories = card.find("a",{
                "class" : "text__text__1FZLe text__inherit-color__3208F text__inherit-font__1Y8w3 text__inherit-size__1DZJi link__underline_on_hover__2zGL4"
            })
            linksOfTitles = card.find("a",{
                "class":"text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__heading_5_and_half__3YluN heading__base__2T28j heading_5_half media-story-card__heading__eqhp9"
            })
            time = card.find("time",{"class":"text__text__1FZLe text__inherit-color__3208F text__regular__2N1Xr text__extra_small__1Mw6v label__label__f9Hew label__small__274ei media-story-card__time__2i9EK"})
            data.append({
                "Title":linksOfTitles.find("span").text.strip(),
                "ArticleLink":(url+linksOfTitles["href"]) if linksOfTitles["href"] is not None else None,
                # "Date":time.text.strip(),
                "Topics":next(categories.stripped_strings) if categories is not None else None
            })
    data = pd.DataFrame(data)
    df = pd.merge(df, data, on=['Topics'], how='outer')
    return df

In [338]:
df = getArtciles(df,url)
df

Unnamed: 0,Category,CategoryLink,Menus,Topics,topicLink,Title,ArticleLink
0,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,South Africa's Ramaphosa adds electricity mini...,https://www.reuters.com/world/africa/south-afr...
1,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,Uganda considers bill to criminalise identifyi...,https://www.reuters.com/world/africa/uganda-co...
2,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,"In shadow of conflict nearby, rebel upsurge hi...",https://www.reuters.com/world/africa/shadow-co...
3,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,South Africa current account deficit at 2.6%/G...,https://www.reuters.com/world/africa/south-afr...
4,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,Tunisian president to dissolve municipal counc...,https://www.reuters.com/world/africa/tunisian-...
...,...,...,...,...,...,...,...
552,,,,Social Impact,,French firms team up for low-carbon truck fuel...,https://www.reuters.com/business/sustainable-b...
553,,,,Clean Energy,,Energy crisis seen posing 'existential threat'...,https://www.reuters.com/business/energy/reuter...
554,,,,Clean Energy,,Reducing energy waste key to meeting climate g...,https://www.reuters.com/business/energy/reuter...
555,,,,Governance,,"Buying Uniper was 'a mistake', Finnish finance...",https://www.reuters.com/business/energy/buying...


In [339]:

df[df['ArticleLink'].notnull()]['ArticleLink']

0      https://www.reuters.com/world/africa/south-afr...
1      https://www.reuters.com/world/africa/uganda-co...
2      https://www.reuters.com/world/africa/shadow-co...
3      https://www.reuters.com/world/africa/south-afr...
4      https://www.reuters.com/world/africa/tunisian-...
                             ...                        
552    https://www.reuters.com/business/sustainable-b...
553    https://www.reuters.com/business/energy/reuter...
554    https://www.reuters.com/business/energy/reuter...
555    https://www.reuters.com/business/energy/buying...
556    https://www.reuters.com/business/sustainable-b...
Name: ArticleLink, Length: 548, dtype: object

In [340]:
def getArticleDetails(df):
    data = []
    for articleLink in df[df['ArticleLink'].notnull()]['ArticleLink']:
        if articleLink is not None:
            request = requests.get(articleLink)
            articleSoup= BeautifulSoup(request.content, "html.parser")
            image = articleSoup.find("div",{
                "class":"styles__image-container__skIG1 styles__fill__3xCr1 styles__center_center__1AaPV styles__apply-ratio__1_FYQ"
            })
            timeContainer = articleSoup.find("time",{
                "class":"text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__extra_small__1Mw6v article-header__dateline__4jE04"
            })
            title = articleSoup.find("div",{
                "class":"article-header__heading__15OpQ"
            })
            author = articleSoup.find("a",{
                "class":"author-name__author__1gx5k"
            })
            articleBodyContainer = articleSoup.find("div",{
                "class":"article-body__content__17Yit paywall-article"
            })
            
            artcileBody  = articleBodyContainer.find_all("p") if articleBodyContainer is not None else None
            wholeBody = ""
            if artcileBody is not None:
                for body in artcileBody:
                    wholeBody+=body.text+"\n"
            if timeContainer is not None:
                date = timeContainer.find_all("span")[1].text
                time = timeContainer.find_all("span")[2].text
            else:
                date = None
                time = None
            image = image.find("img")["src"] if image is not None else None
            data.append({
                "Title":title.find("h1").text if title is not None else None,
                "Author":author.text if author is not None else None,
                "Date":date,
                "Time":time,
                "Image":image,
                "Article":wholeBody
                })
    data = pd.DataFrame(data)
    df = pd.merge(df, data, on=['Title'], how='outer')
    return df

In [342]:
df = getArticleDetails(df)
df

Unnamed: 0,Category,CategoryLink,Menus,Topics,topicLink,Title,ArticleLink,Author,Date,Time,Image,Article
0,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,South Africa's Ramaphosa adds electricity mini...,https://www.reuters.com/world/africa/south-afr...,Bhargav Acharya,"March 6, 2023",8:51 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"JOHANNESBURG, March 6 (Reuters) - South Africa..."
1,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,Uganda considers bill to criminalise identifyi...,https://www.reuters.com/world/africa/uganda-co...,Reuters,"March 9, 2023",2:41 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"KAMPALA, March 9 (Reuters) - Uganda's parliame..."
2,World,https://www.reuters.com/world/,/world/africa/,Africa,https://www.reuters.com/world/africa/,Uganda considers bill to criminalise identifyi...,https://www.reuters.com/world/africa/uganda-co...,Reuters,"March 9, 2023",2:41 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"KAMPALA, March 9 (Reuters) - Uganda's parliame..."
3,,,,World,,Uganda considers bill to criminalise identifyi...,https://www.reuters.com/world/africa/uganda-co...,Reuters,"March 9, 2023",2:41 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"KAMPALA, March 9 (Reuters) - Uganda's parliame..."
4,,,,World,,Uganda considers bill to criminalise identifyi...,https://www.reuters.com/world/africa/uganda-co...,Reuters,"March 9, 2023",2:41 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"KAMPALA, March 9 (Reuters) - Uganda's parliame..."
...,...,...,...,...,...,...,...,...,...,...,...,...
945,,,,,,"Wall St climbs as jobless claims rise, payroll...",,Amruta Khandekar,"March 9, 2023",3:36 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,March 9 (Reuters) - Wall Street's main indexes...
946,,,,,,"Wall Street advances, Treasury yields dip on j...",,Stephen Culp,"March 9, 2023",3:48 PM UTC,,"NEW YORK, March 9 (Reuters) - Wall Street gain..."
947,,,,,,"Oil rises as weaker dollar, French strike bala...",,Alex Lawler,"March 9, 2023",3:28 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"LONDON, March 9 (Reuters) - Oil rose on Thursd..."
948,,,,,,Dutch to restrict semiconductor tech exports t...,,Toby Sterling,"March 8, 2023",11:24 PM UTC,https://cloudfront-us-east-2.images.arcpublish...,"AMSTERDAM/WASHINGTON, March 8 (Reuters) - The ..."
