In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

In [2]:
# Fetching all the policy names and address to policy documents page for all insurance plans.

url = "https://licindia.in/web/guest/insurance-plan"

# GET request to fetch the webpage content
response = requests.get(url)
response.raise_for_status()  # Raise an error for bad status codes

soup = BeautifulSoup(response.text, "html.parser")

accordion = soup.find("div", id="accordionExample")

tables = accordion.find_all("table")

data = []
for table in tables:
    links = table.find_all("a")
    for link in links:
        text = link.text.strip()
        href = link.get("href")
        data.append({"Text": text, "URL": href})

df = pd.DataFrame(data)

df.to_csv("accordion_tables_links.csv", index=False)

print("Data saved to 'accordion_tables_links.csv'")


Data saved to 'accordion_tables_links.csv'


In [3]:
# Fetching link of all the pdf documents for each policy for each plan
base_url = "https://licindia.in"

df = pd.read_csv("accordion_tables_links.csv")

all_links_data = []

for index, row in df.iterrows():
    page_url = base_url + row["URL"]  
    try:

        response = requests.get(page_url)
        response.raise_for_status()  

        soup = BeautifulSoup(response.text, "html.parser")

        main_content = soup.select_one("#maincontent > div > div")

        if main_content:
            links = main_content.find_all("a")
            for link in links:
                text = link.text.strip()
                href = link.get("href")
                all_links_data.append({"Source URL": page_url, "Text": text, "URL": href})
    except Exception as e:
        print(f"Failed to process URL: {page_url} | Error: {e}")

all_links_df = pd.DataFrame(all_links_data)

all_links_df.to_csv("all_page_links.csv", index=False)

print("Data saved to 'all_page_links.csv'")

Data saved to 'all_page_links.csv'


In [4]:
# Downloading the pdf files from the link and naming them appropriately

df_accordion = pd.read_csv("accordion_tables_links.csv")
df_all_links = pd.read_csv("all_page_links.csv")

os.makedirs("policy_documents", exist_ok=True)

for index, row in df_all_links.iterrows():
    pdf_url = base_url + row["URL"]  
    policy_name = df_accordion.loc[df_accordion["URL"] == row["Source URL"].replace(base_url, ""), "Text"].values[0]
    pdf_name = f"{policy_name} - {row['Text']}.pdf".replace("/", "-")  

    try:
        response = requests.get(pdf_url)
        response.raise_for_status()  

        pdf_path = os.path.join("policy_documents", pdf_name)
        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(response.content)

        print(f"Downloaded: {pdf_name}")
    except Exception as e:
        print(f"Failed to download {pdf_name} from {pdf_url} | Error: {e}")

print("All PDF files have been downloaded to the 'policy_documents' folder.")

Downloaded: LIC's Single Premium Endowment Plan - Sales Brochure  (Content is in English).pdf
Downloaded: LIC's Single Premium Endowment Plan - Policy Document  (Content is in English).pdf
Downloaded: LIC's Single Premium Endowment Plan - CIS LIC's Single Premium Endowment Plan  (Content is in English).pdf
Downloaded: LIC's New Endowment Plan - Sales brochure  (Content is in English).pdf
Downloaded: LIC's New Endowment Plan - Policy Document  (Content is in English).pdf
Downloaded: LIC's New Endowment Plan - CIS LIC's New Endowment Plan  (Content is in English).pdf
Downloaded: LIC's New Jeevan Anand - Sales brochure  (Content is in English).pdf
Downloaded: LIC's New Jeevan Anand - Policy Document  (Content is in English).pdf
Downloaded: LIC's New Jeevan Anand - CIS LIC's New Jeevan Anand  (Content is in English).pdf
Downloaded: LIC's Jeevan Lakshya - Sales Brochure  (Content is in English).pdf
Downloaded: LIC's Jeevan Lakshya - Policy Document  (Content is in English).pdf
Downloaded: L

In [5]:
def clean_policy_document_names(folder_path):
    """
    Removes the text "(Content is in English)" from all file names in the specified folder.
    
    Args:
        folder_path (str): The path to the folder containing the policy documents.
    """
    for filename in os.listdir(folder_path):
        new_name = re.sub(r" \(Content is in English\)", "", filename)

        if new_name != filename:
            old_path = os.path.join(folder_path, filename)
            new_path = os.path.join(folder_path, new_name)
            os.rename(old_path, new_path)
            print(f"Renamed: {filename} -> {new_name}")

policy_documents_folder = "policy_documents"
clean_policy_document_names(policy_documents_folder)

print("File names cleaned.")

Renamed: LIC's New Endowment Plan - Policy Document  (Content is in English).pdf -> LIC's New Endowment Plan - Policy Document .pdf
Renamed: LIC's New Children's Money Back Plan - Sales Brochure  (Content is in English).pdf -> LIC's New Children's Money Back Plan - Sales Brochure .pdf
Renamed: LIC's New Tech-Term - Policy Document  (Content is in English).pdf -> LIC's New Tech-Term - Policy Document .pdf
Renamed: LIC’s Yuva Term - Policy Document  (Content is in English).pdf -> LIC’s Yuva Term - Policy Document .pdf
Renamed: LIC's Accidental Death & Disability Benefit Rider - Policy Document  (Content is in English).pdf -> LIC's Accidental Death & Disability Benefit Rider - Policy Document .pdf
Renamed: LIC's New Money Back Plan-25 years - Policy Document  (Content is in English).pdf -> LIC's New Money Back Plan-25 years - Policy Document .pdf
Renamed: LIC's Bima Ratna - Sales Brochure  (Content is in English).pdf -> LIC's Bima Ratna - Sales Brochure .pdf
Renamed: LIC's Jeevan Tarun - 