In [5]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time

In [6]:
# URL template for the Groww mutual-fund listing; the '{}' slot is filled
# with the page number via str.format when each page is requested.
base_url = (
    'https://groww.in/mutual-funds/filter'
    '?q=&fundSize=&pageNo={}&sortBy=0'
)

In [7]:
# One accumulator list per scraped column. Each list is a distinct object and
# receives one entry per fund row, so all six stay index-aligned.
name, return_1yr, return_3yr, return_5yr, risk_type, category = (
    [] for _ in range(6)
)

In [8]:
# Scrape every listing page and append one entry per fund to each of the
# module-level column lists (name, risk_type, category, return_1yr/3yr/5yr).
# A field that cannot be found in a row is recorded as NaN so that all six
# lists always keep the same length.
TOTAL_PAGES = 107        # number of listing pages to fetch (adjust if needed)
REQUEST_TIMEOUT = 30     # seconds; without a timeout requests.get can block forever
PAUSE_SECONDS = 2        # delay between requests

# Start from empty lists so that re-running this cell does not append a second
# copy of every row. clear() keeps the same list objects other cells refer to.
for column in (name, return_1yr, return_3yr, return_5yr, risk_type, category):
    column.clear()

for page_num in range(TOTAL_PAGES):
    # Get the URL for the current page (page numbers start at 0)
    url = base_url.format(page_num)
    print(f"Scraping page {page_num + 1} / {TOTAL_PAGES}: {url}")

    # Send a request to the page; a failed or non-2xx response skips this page
    # instead of aborting the whole run or parsing an error document.
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping page {page_num + 1}: {exc}")
        time.sleep(PAUSE_SECONDS)
        continue

    # Parse the page content using BeautifulSoup
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find all fund containers on the page
    fund = soup.find_all('tr', class_='f22Card')

    # Loop through each fund on the current page
    for card in fund:
        try:
            # Fund name
            name.append(card.find('div', class_='contentPrimary f22LH34 f22Mb4 truncate bodyBaseHeavy').text.strip())
        except AttributeError:
            # find() returned None: the element is missing from this row
            name.append(np.nan)

        try:
            # Risk type: first div carrying the "secondary" class string
            risk_type_text = card.find('div', class_='contentSecondary f22Ls2 bodySmallHeavy').text.strip()
            risk_type.append(risk_type_text)
        except AttributeError:
            risk_type.append(np.nan)

        try:
            # Fund category (e.g., "Equity"): second div with that same class string
            category_text = card.find_all('div', class_='contentSecondary f22Ls2 bodySmallHeavy')[1].text.strip()
            category.append(category_text)
        except (IndexError, AttributeError):
            category.append(np.nan)

        try:
            # Returns: 1Y, 3Y, 5Y. Read all three BEFORE appending anything, so
            # a row with fewer than three cells cannot leave one list longer
            # than the others.
            returns = card.find_all('div', class_='contentPrimary center-align f22Mb4 bodyBaseHeavy')
            one_year = returns[0].text.strip()
            three_year = returns[1].text.strip()
            five_year = returns[2].text.strip()
        except (IndexError, AttributeError):
            one_year = three_year = five_year = np.nan
        return_1yr.append(one_year)    # 1Y return
        return_3yr.append(three_year)  # 3Y return
        return_5yr.append(five_year)   # 5Y return

    # Pause between pages to avoid hammering the server
    time.sleep(PAUSE_SECONDS)

Scraping page 1 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=0&sortBy=0
Scraping page 2 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=1&sortBy=0
Scraping page 3 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=2&sortBy=0
Scraping page 4 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=3&sortBy=0
Scraping page 5 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=4&sortBy=0
Scraping page 6 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=5&sortBy=0
Scraping page 7 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=6&sortBy=0
Scraping page 8 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=7&sortBy=0
Scraping page 9 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=8&sortBy=0
Scraping page 10 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=9&sortBy=0
Scraping page 11 / 107: https://groww.in/mutual-funds/filter?q=&fundSize=&pageNo=10&sortB

In [9]:
# Map each output column label to the list scraped for it. The dict holds
# references to the lists, in the column order the DataFrame will use.
column_labels = [
    'Mutual Fund Name',
    '1Y Return',
    '3Y Return',
    '5Y Return',
    'Risk Type',
    'Category',
]
column_values = [name, return_1yr, return_3yr, return_5yr, risk_type, category]
d = dict(zip(column_labels, column_values))

In [11]:
# Build the results table from the column dict and preview the first five rows.
df = pd.DataFrame(data=d)
df.head(n=5)

Unnamed: 0,Mutual Fund Name,1Y Return,3Y Return,5Y Return,Risk Type,Category
0,Aditya Birla Sun Life PSU Equity Fund Direct G...,66.83%,40.68%,NA%,Very High Risk,Equity
1,SBI PSU Direct Plan Growth,67.03%,39.97%,28.86%,Very High Risk,Equity
2,ICICI Prudential BHARAT 22 FOF Direct Growth,55.63%,39.49%,27.44%,Very High Risk,Equity
3,Motilal Oswal Midcap Fund Direct Growth,70.70%,39.33%,36.11%,Very High Risk,Equity
4,IDBI Small Cap Fund Direct Growth,24.28%,39.12%,16.68%,Very High Risk,Equity
