<div class="alert alert-box alert-info">
    <h2> Trying Pulse Scraping:</h2>
</div>

In [49]:
from bs4 import BeautifulSoup
import requests
import json

In [2]:
# URL of the Zerodha Pulse page that the cells below fetch and parse.
link = "https://pulse.zerodha.com"

### Studying Page:
- Pulse gives last 24 hours of news
- Page structure (useful things only) is as follows:

```html
<html>
  <ul id="news">                                                <!-- Has many news <li> -->
    <li class="box item">                                       <!-- News Item -->

      <h2 class="title">                                        <!-- News Title -->
        <a href="link">Title</a>                                <!-- Main web link -->
      </h2>
      <div class="desc">Description</div>                       <!-- News Description -->
      <span class="date" title="11:32 PM, 20 Jan 2025"></span>  <!-- News Date -->
      <span class="feed">— Bloomberg Quint</span>               <!-- News Source -->

      <ul class="similar">                                      <!-- Similar News (SN) -->
        <li>
          <a class="title2" href="link">News Title</a>          <!-- SN link, title -->
          <span class="date" title="11:32...(same)"></span>     <!-- SN Date -->
          <span class="feed">— Bloomberg Quint</span>           <!-- SN Source -->
          <!-- There is no description for the similar news -->
        </li>
        ... such more related news in li's ...
      </ul>
      
    </li>
    ... such multiple li's for news ...
  </ul>
</html>
```

### Output Format:

In [139]:
# Illustrative example of the target record schema (placeholder values, not real data).
# NOTE(review): "page_content" is not produced by any extraction function visible in
# this notebook, and those functions build "data_search" as "<title> <description>"
# (space-separated), not joined with "+".
list_of_dicts = [{
    "title": "this is news headline",
    "link": "https://www.link_to_website.com",
    "description": "this is the description of the news",
    "date": "09:02 PM, 21 Jan 2025",
    "source": "website name",
    "data_search": "{title}+{description}",
    "page_content": "full content of actual news page (e.g. hindustan times, times of india, etc.)"
}]

### Fetch Page:

In [3]:
# Fetch the content of the page given in `link`.
# An explicit timeout (seconds) is passed because requests otherwise waits
# indefinitely when the server stops responding, which would hang the kernel.
page = requests.get(link, timeout=30)
print(page)

<Response [200]>


In [9]:
# dir(page)

In [4]:
# Parse the raw response bytes with Python's built-in HTML parser, then print the
# re-indented markup of the whole page for inspection.
page_content = BeautifulSoup(page.content, 'html.parser')
print(page_content.prettify())

<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Pulse by Zerodha - Latest financial and market news from all major Indian news sources aggregated in one place - Pulse
  </title>
  <meta content="Indian business, market, and finance news aggregated from all major news sources in realtime in one place" name="description"/>
  <meta content="india news, india financial news, india market news, realtime indian business news, latest indian financial updates, latest market indian updates, indian news aggregator" name="keywords"/>
  <meta content="width=device-width, initial-scale=1, minimum-scale=1" name="viewport"/>
  <link href="https://fonts.googleapis.com/css?family=Roboto:400,900" rel="stylesheet" type="text/css"/>
  <link href="https://pulse.zerodha.com/public/style.css" rel="stylesheet" type="text/css">
   <link href="https://pulse.zerodha.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
   <script src="https://ajax.googleapi

In [5]:
# Save the fetched content in a file.
# The encoding is set explicitly: the page is served as UTF-8 and contains
# non-ASCII characters (e.g. the em dash before each source name), which can fail
# to encode when open() falls back to the platform's default codec.
with open("h_pulse_full.html", "w", encoding="utf-8") as file:
    file.write(page_content.prettify())

### Get news:

In [6]:
# Locate the <ul id="news"> container that holds every news <li>.
news_all = page_content.find('ul', attrs={'id': 'news'})

In [7]:
# Save only the news-list markup for offline inspection.
# The encoding is set explicitly because the markup contains non-ASCII characters
# (e.g. the em dash before each source name) that can fail to encode under the
# platform's default codec.
with open("h_all_news.html", "w", encoding="utf-8") as file:
    file.write(news_all.prettify())

### Work on single news:

In [69]:
# Collect the <li class="box item"> entries (one per main news item) and print
# the third one as a sample of the per-item markup.
news_items = news_all.find_all('li', attrs={'class': 'box item'})
print(news_items[2].prettify())

<li class="box item" id="item-11471718">
 <h2 class="title">
  <a data-id="11471718" href="https://economictimes.indiatimes.com/markets/stocks/news/essar-oil-and-gas-plans-275-mn-overseas-bond-sale-post-restructuring/articleshow/117434000.cms" rel="nofollow">
   Essar Oil and Gas plans $275 mn overseas bond sale post-restructuring
  </a>
 </h2>
 <div class="desc">
  Essar Oil and Gas Exploration &amp; Production Ltd. is exploring its first post-restructuring fundraising initiative by considering a $275 million overseas bond issuance to fund expansion and refinance existing debt. Talks remain preliminary and non-binding.
 </div>
 <span class="date" title="09:29 PM, 21 Jan 2025">
  1 hour ago
 </span>
 <span class="feed">
  — Economic Times
 </span>
 <p class="share">
  <a class="fb" href="http://facebook.com/share.php?u=https%3A%2F%2Feconomictimes.indiatimes.com%2Fmarkets%2Fstocks%2Fnews%2Fessar-oil-and-gas-plans-275-mn-overseas-bond-sale-post-restructuring%2Farticleshow%2F117434000.cms

In [70]:
# Index into news_items of the sample item inspected in the following cells.
i = 2

In [71]:
# Headline: text content of the sample item's <h2 class="title">.
news_title = news_items[i].find('h2', attrs={'class': 'title'}).text
news_title

'Essar Oil and Gas plans $275 mn overseas bond sale post-restructuring '

In [72]:
# Article URL: href of the <a> nested inside the item's <h2 class="title">.
news_link = news_items[i].find('h2', attrs={'class': 'title'}).find('a')['href']
news_link

'https://economictimes.indiatimes.com/markets/stocks/news/essar-oil-and-gas-plans-275-mn-overseas-bond-sale-post-restructuring/articleshow/117434000.cms'

In [73]:
# Description: text content of the item's <div class="desc">.
news_desc = news_items[i].find("div", attrs={"class": "desc"}).text
news_desc

'Essar Oil and Gas Exploration & Production Ltd. is exploring its first post-restructuring fundraising initiative by considering a $275 million overseas bond issuance to fund expansion and refinance existing debt. Talks remain preliminary and non-binding. '

In [74]:
# Date: read from the "title" attribute of <span class="date">; the span's own
# text is only a relative time such as "1 hour ago".
news_date = news_items[i].find("span", attrs={"class": "date"})["title"]
news_date

'09:29 PM, 21 Jan 2025'

In [75]:
# Source: text content of <span class="feed">; it includes the leading em dash.
news_source = news_items[i].find("span", attrs={"class": "feed"}).text
news_source

'— Economic Times'

### Function to extract all required things from news list-item:

In [119]:
def get_news_content_from_li(news_list_item):
    """Extract the fields of one main news ``<li>`` into a dict.

    Args:
        news_list_item: Parsed ``<li class="box item">`` element exposing
            ``find(tag, attrs=...)``.

    Returns:
        dict with keys ``title``, ``link``, ``description``, ``date``,
        ``source`` and ``data_search`` (title and description joined by a
        single space).

    Raises:
        AttributeError / TypeError: if an expected child element is missing,
            because no lookup here is guarded.
    """
    # The headline element provides both the title text and the article link.
    heading = news_list_item.find('h2', attrs={'class': 'title'})
    headline = heading.text
    url = heading.find('a')['href']

    summary = news_list_item.find("div", attrs={"class": "desc"}).text
    # The date is stored in the span's "title" attribute, not in its text.
    published = news_list_item.find("span", attrs={"class": "date"})["title"]
    publisher = news_list_item.find("span", attrs={"class": "feed"}).text

    record = {
        "title": headline,
        "link": url,
        "description": summary,
        "date": published,
        "source": publisher,
    }
    record["data_search"] = headline + " " + summary
    return record

In [121]:
# Run the extractor on the sample item and pretty-print the resulting dict
# (json.dumps escapes non-ASCII characters by default, hence "\u2014" in the output).
resp = get_news_content_from_li(news_items[2])
print(json.dumps(resp, indent=4))

{
    "title": "Essar Oil and Gas plans $275 mn overseas bond sale post-restructuring ",
    "link": "https://economictimes.indiatimes.com/markets/stocks/news/essar-oil-and-gas-plans-275-mn-overseas-bond-sale-post-restructuring/articleshow/117434000.cms",
    "description": "Essar Oil and Gas Exploration & Production Ltd. is exploring its first post-restructuring fundraising initiative by considering a $275 million overseas bond issuance to fund expansion and refinance existing debt. Talks remain preliminary and non-binding. ",
    "date": "09:29 PM, 21 Jan 2025",
    "source": "\u2014 Economic Times",
    "data_search": "Essar Oil and Gas plans $275 mn overseas bond sale post-restructuring  Essar Oil and Gas Exploration & Production Ltd. is exploring its first post-restructuring fundraising initiative by considering a $275 million overseas bond issuance to fund expansion and refinance existing debt. Talks remain preliminary and non-binding. "
}


#### Modified to handle cases where some data is missing:

In [120]:
def get_news_content_from_li_2(news_list_item):
    """Extract the fields of one main news ``<li>``, tolerating missing elements.

    Any element or attribute that cannot be found yields an empty string
    instead of raising.

    Args:
        news_list_item: Parsed ``<li class="box item">`` element exposing
            ``find(tag, attrs)``.

    Returns:
        dict with keys ``title``, ``link``, ``description``, ``date``,
        ``source`` and ``data_search`` (title and description joined by a
        space, then stripped).
    """
    def safe_find_text(element, tag, attrs):
        # Text of the first matching descendant, or "" when there is none.
        found = element.find(tag, attrs)
        return found.text if found is not None else ""

    def safe_find_attr(element, tag, attrs, attr_name):
        # Attribute of the first matching descendant, or "" when the element
        # or the attribute is absent.
        found = element.find(tag, attrs)
        if found is None or attr_name not in found.attrs:
            return ""
        return found[attr_name]

    # Scope the link lookup to the headline. Searching the whole item for the
    # first <a> would return a share link or a similar-news link whenever the
    # headline is missing.
    title_heading = news_list_item.find('h2', {'class': 'title'})
    if title_heading is None:
        news_title, news_link = "", ""
    else:
        news_title = title_heading.text
        news_link = safe_find_attr(title_heading, 'a', {}, 'href')

    news_desc = safe_find_text(news_list_item, "div", {"class": "desc"})
    # The date is stored in the span's "title" attribute, not in its text.
    news_date = safe_find_attr(news_list_item, "span", {"class": "date"}, "title")
    news_source = safe_find_text(news_list_item, "span", {"class": "feed"})
    news_data_search = f"{news_title} {news_desc}".strip()

    return {
        "title": news_title,
        "link": news_link,
        "description": news_desc,
        "date": news_date,
        "source": news_source,
        "data_search": news_data_search
    }

In [122]:
# Run the missing-data-tolerant extractor on the same sample item and print ITS
# result (resp2), so the output actually reflects the function under test.
resp2 = get_news_content_from_li_2(news_items[2])
print(json.dumps(resp2, indent=4))

{
    "title": "Essar Oil and Gas plans $275 mn overseas bond sale post-restructuring ",
    "link": "https://economictimes.indiatimes.com/markets/stocks/news/essar-oil-and-gas-plans-275-mn-overseas-bond-sale-post-restructuring/articleshow/117434000.cms",
    "description": "Essar Oil and Gas Exploration & Production Ltd. is exploring its first post-restructuring fundraising initiative by considering a $275 million overseas bond issuance to fund expansion and refinance existing debt. Talks remain preliminary and non-binding. ",
    "date": "09:29 PM, 21 Jan 2025",
    "source": "\u2014 Economic Times",
    "data_search": "Essar Oil and Gas plans $275 mn overseas bond sale post-restructuring  Essar Oil and Gas Exploration & Production Ltd. is exploring its first post-restructuring fundraising initiative by considering a $275 million overseas bond issuance to fund expansion and refinance existing debt. Talks remain preliminary and non-binding. "
}


### Logic to handle related news:

In [137]:
def get_similar_news_content_from_li(similar_news_list_item):
    """Extract the fields of one "similar news" ``<li>`` into a dict.

    Missing elements or attributes yield empty strings rather than errors.

    Args:
        similar_news_list_item: Parsed ``<li>`` from a ``<ul class="similar">``
            list, exposing ``find(tag, attrs)``.

    Returns:
        dict with keys ``title``, ``link``, ``description``, ``date``,
        ``source`` and ``data_search`` (title and description joined by a
        space, then stripped).
    """
    def text_of(tag, attrs):
        # Text of the first matching descendant, or "" if nothing matches.
        node = similar_news_list_item.find(tag, attrs)
        if not node:
            return ""
        return node.text

    def attr_of(tag, attrs, attr_name):
        # Named attribute of the first matching descendant, or "" if either
        # the element or the attribute is missing.
        node = similar_news_list_item.find(tag, attrs)
        if node and attr_name in node.attrs:
            return node[attr_name]
        return ""

    record = {
        "title": text_of('a', {'class': 'title2'}),
        "link": attr_of('a', {'class': 'title2'}, 'href'),
        # Looked up like any other field; resolves to "" when the item has no
        # <div class="desc">.
        "description": text_of("div", {"class": "desc"}),
        "date": attr_of("span", {"class": "date"}, "title"),
        "source": text_of("span", {"class": "feed"}),
    }
    record["data_search"] = f"{record['title']} {record['description']}".strip()
    return record

In [101]:
# Switch the sample index to another item and display its markup.
i = 7
news_items[i]

<li class="box item" id="item-11471712">
<h2 class="title"><a data-id="11471712" href="https://www.ndtvprofit.com/quarterly-earnings/icici-prudential-life-insurance-q3-earnings-net-profit-climbs-432-to-rs-325-crore" rel="nofollow">ICICI Prudential Life Insurance Q3 Earnings: Net Profit Climbs 43.2% To Rs 325 Crore</a></h2>
<div class="desc"> For nine months ended Dec. 2024, the firm registered a net profit of Rs 803 crore, an 18.3% year-on-year growth over Rs 679 crore seen in previous fiscal's similar period.</div>
<span class="date" title="09:02 PM, 21 Jan 2025">1.5 hours ago</span>
<span class="feed">— Bloomberg Quint</span>
<p class="share">
<a class="fb" href="http://facebook.com/share.php?u=https%3A%2F%2Fwww.ndtvprofit.com%2Fquarterly-earnings%2Ficici-prudential-life-insurance-q3-earnings-net-profit-climbs-432-to-rs-325-crore"></a>
<a class="tw" href="http://twitter.com/share?text=ICICI+Prudential+Life+Insurance+Q3+Earnings%3A+Net+Profit+Climbs+43.2%25+To+Rs+325+Crore&amp;url=htt

In [138]:
# Demo on the sample item: when it carries a <ul class="similar"> list, extract
# each related-news <li> as its own record and print it.
# NOTE: despite the printed message, this cell only prints the records; nothing
# is appended to a list here.
if news_items[i].find("ul", attrs={"class": "similar"}):
    print("Similar news exist. Appending it as separate news item")

    similar_news = news_items[i].find("ul", attrs={"class": "similar"}).find_all("li")
    # print(similar_news[0].prettify())
    for news in similar_news:
        resp = get_similar_news_content_from_li(news)
        print(json.dumps(resp, indent=4))
else:
    print("similar news not exist")

Similar news exist. Appending it as separate news item
{
    "title": "PNB Housing Finance Q3 Results: Net profit jumps 43% YoY to Rs 483 crore on strong home loan demand ",
    "link": "https://economictimes.indiatimes.com/markets/stocks/earnings/pnb-housing-finance-q3-results-net-profit-jumps-43-yoy-to-rs-483-crore-on-strong-home-loan-demand/articleshow/117431845.cms",
    "description": "",
    "date": "09:02 PM, 21 Jan 2025",
    "source": "\u2014 Economic Times",
    "data_search": "PNB Housing Finance Q3 Results: Net profit jumps 43% YoY to Rs 483 crore on strong home loan demand"
}
{
    "title": "Karur Vysya Bank Q3 Result: Reports 20.39% Rise In Net Profit To Rs 496 Crore",
    "link": "https://www.ndtvprofit.com/quarterly-earnings/karur-vysya-bank-q3-result-reports-2039-rise-in-net-profit-to-rs-496-crore",
    "description": "",
    "date": "09:02 PM, 21 Jan 2025",
    "source": "\u2014 Bloomberg Quint",
    "data_search": "Karur Vysya Bank Q3 Result: Reports 20.39% Rise In N

## `Final Code:`

- See the structure study and output format before this code.
- The related and main news are both combined in the same list.
- The two functions are merged into a single function with an `is_similar_news` flag.

In [1]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import json

In [2]:
# Configuration: the page to scrape and the search term used later to filter the
# scraped records (it is matched case-insensitively against "data_search").
link = "https://pulse.zerodha.com"
keyword = "zomato"

In [3]:
# Get page.
# An explicit timeout (seconds) is passed because requests otherwise waits
# indefinitely when the server stops responding, which would hang the kernel.
page = requests.get(link, timeout=30)

# Stop early on any non-200 response so later cells never parse an error page.
if page.status_code != 200:
    raise Exception(f"Failed to fetch page. Status code: {page.status_code}")
print("Page fetched successfully...")

# Parse the raw response bytes with Python's built-in HTML parser.
page_content = BeautifulSoup(page.content, 'html.parser')

Page fetched successfully...


In [4]:
def timestamp(filename_safe=True, spaces=False):
    """Return the current local time as a formatted string (12-hour clock).

    Args:
        filename_safe: When truthy, the time parts are separated by "-"
            instead of ":" so the result can be used in a file name.
        spaces: Only consulted when ``filename_safe`` is truthy. Truthy keeps
            spaces between date, time and AM/PM; falsy uses "_" instead.

    Returns:
        str in one of the formats ``YYYY-MM-DD_HH-MM-SS_AM`` (default),
        ``YYYY-MM-DD HH-MM-SS AM`` or ``YYYY-MM-DD HH:MM:SS AM``.
    """
    if not filename_safe:
        pattern = "%Y-%m-%d %I:%M:%S %p"
    elif spaces:
        pattern = "%Y-%m-%d %I-%M-%S %p"
    else:
        pattern = "%Y-%m-%d_%I-%M-%S_%p"

    return datetime.now().strftime(pattern)

# timestamp()

In [5]:
# Save the prettified page markup to a timestamp-prefixed file in the current
# directory (written as UTF-8), then report the path.
filename = f"./{timestamp()}_full.html"
with open(filename, "w", encoding="utf-8") as file:
    file.write(page_content.prettify())
print(f'Page content saved to file: "{filename}"')

In [6]:
# Get the news list.
news_all = page_content.find('ul', attrs={'id': 'news'})

# find() returns None when nothing matches; fail with a clear message instead of
# an opaque "'NoneType' object has no attribute 'find_all'" on the next line.
if news_all is None:
    raise RuntimeError('Could not find <ul id="news"> in the fetched page; the page structure may have changed.')

# One <li class="box item"> per main news item.
news_items = news_all.find_all('li', attrs={'class': 'box item'})

In [7]:
# Function to get news content from a news list item:
def get_news_content(news_list_item, is_similar_news=False):
    """Extract one news record from a main or "similar news" ``<li>``.

    Missing elements or attributes yield empty strings rather than errors.

    Args:
        news_list_item: Parsed ``<li>`` element exposing ``find(tag, attrs)``.
        is_similar_news: False (default) for a main ``<li class="box item">``,
            whose headline is ``<h2 class="title"><a href=...>``. True for an
            ``<li>`` from ``<ul class="similar">``, whose headline is
            ``<a class="title2" href=...>`` and which has no description.

    Returns:
        dict with keys ``title``, ``link``, ``description``, ``date``,
        ``source`` and ``data_search`` (title and description joined by a
        space, then stripped).
    """
    def safe_find_text(element, tag, attrs):
        # Text of the first matching descendant, or "" when there is none.
        found = element.find(tag, attrs)
        return found.text if found is not None else ""

    def safe_find_attr(element, tag, attrs, attr_name):
        # Attribute of the first matching descendant, or "" when the element
        # or the attribute is absent.
        found = element.find(tag, attrs)
        if found is None or attr_name not in found.attrs:
            return ""
        return found[attr_name]

    if not is_similar_news:
        # Scope the link lookup to the headline. Searching the whole item for
        # the first <a> would return a share link or a similar-news link
        # whenever the headline is missing.
        title_heading = news_list_item.find('h2', {'class': 'title'})
        if title_heading is None:
            news_title, news_link = "", ""
        else:
            news_title = title_heading.text
            news_link = safe_find_attr(title_heading, 'a', {}, 'href')
        news_desc = safe_find_text(news_list_item, "div", {"class": "desc"})
    else:
        news_title = safe_find_text(news_list_item, 'a', {'class': 'title2'})
        news_link = safe_find_attr(news_list_item, 'a', {'class': 'title2'}, 'href')
        news_desc = ""  # No description available for similar news

    # The date is stored in the span's "title" attribute, not in its text.
    news_date = safe_find_attr(news_list_item, "span", {"class": "date"}, "title")
    news_source = safe_find_text(news_list_item, "span", {"class": "feed"})
    news_data_search = f"{news_title} {news_desc}".strip()

    return {
        "title": news_title,
        "link": news_link,
        "description": news_desc,
        "date": news_date,
        "source": news_source,
        "data_search": news_data_search
    }

In [None]:
# Flatten the page into one list: each main news item is followed directly by
# its related ("similar") news items, all sharing the same record format.
final_news_list = []

for news in news_items:
    final_news_list.append(get_news_content(news))

    # Related news, when present, live in a nested <ul class="similar">.
    similar_block = news.find("ul", attrs={"class": "similar"})
    if not similar_block:
        continue

    similar_news = similar_block.find_all("li")
    for similar_news_item in similar_news:
        final_news_list.append(get_news_content(similar_news_item, is_similar_news=True))

# Summary: every record beyond the main items came from a similar-news list.
print("\n\nProcessing completed successfully...")
print("Total main news scraped :".rjust(40), len(news_items))
print("Total related news scraped :".rjust(40), len(final_news_list) - len(news_items))
print("Total news scraped :".rjust(40), len(final_news_list))

Scraping completed successfully...
               Total news items scraped: 233
       Total related news items scraped: 122
               Total news items scraped: 355


In [9]:
# Save the final news list as indented JSON in a timestamp-prefixed file.
# `filename` is reassigned here, so it now holds the all-news path.
filename = f"./{timestamp()}_news.json"
with open(filename, "w", encoding="utf-8") as file:
    json.dump(final_news_list, file, indent=4)

In [12]:
# Case-insensitive keyword search over the scraped records:
keyword = keyword.lower()

# Keep every record whose searchable text (title + description) contains the keyword:
keyword_news = [
    record
    for record in final_news_list
    if keyword in record['data_search'].lower()
]

In [13]:
# Report how many records matched the keyword.
print("Total news items with keyword :".rjust(40), len(keyword_news), end="\n\n")
# `filename` is read from the notebook's global state: this message is only
# accurate while it still holds the path written when the full news list was saved.
print(f'Saved all news in file: "{filename}"')

          Total news items with keyword: 10


In [14]:
# Save the keyword-matched records as indented JSON in a timestamp-prefixed file,
# then report the path. `filename` is reassigned to the query-file path here.
filename = f"./{timestamp()}_query.json"
with open(filename, "w", encoding="utf-8") as file:
    json.dump(keyword_news, file, indent=4)
print(f'Saved queried news in file: "{filename}"')