### Importing required libraries

In [11]:
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

### Scraping Script

In [14]:
class IPO_Script:

    def __init__(self,wait_time,year):
        self.driver = self._initialize_driver()
        self.year = year
        self.wait = WebDriverWait(self.driver, timeout=wait_time)
        self.company_counter = 0
        self.companies_financial_data = []
        self.companies_primary_data = []
        self.companies_secondary_data = []

    def _initialize_driver(self):
        driver = webdriver.Chrome()
        driver.maximize_window()
        return driver

    def wait_to_page_load(self):
        self.wait.until(
            lambda d:d.execute_script('return document.readyState') == 'complete'
        )


    def access_page(self):

        self.driver.get(f'https://trendlyne.com/ipo/screener/year-{self.year}/')

    def rows_to_loaded(self):
        for _ in range(10):
            try:
                self.wait.until(
                        ec.presence_of_element_located((By.CLASS_NAME,'recent_detail'))
                    )
                
            except Exception as e:
                print(e)
                time.sleep(1)


    def click_all(self):
        # Selecting all to show all data in present year
        for _ in range(10):
            try:
                select_feild = self.wait.until(
                    ec.element_to_be_clickable((By.XPATH,'//*[@id="ipo-section"]/main/div[2]/div[3]/div[2]/div/div/select'))
                )
            except:
                time.sleep(1)
        select_button = Select(select_feild)
        select_button.select_by_visible_text('All')

        last_count = 0
        stable_count = 0

        for _ in range(20):  # check up to 20 times (i.e., max 20 seconds)
            # self.rows = self.driver.find_elements(By.XPATH, '//*[@id="ipo-section"]/main/div[2]/div[3]/div[1]/table/tbody/tr')
            try:
                self.rows = self.driver.find_elements(By.CSS_SELECTOR, 'div.recent_detail div.panel_table table tbody tr[role="row"]')
    
                current_count = len(self.rows)
                
                if current_count == last_count:
                    stable_count += 1
                    if stable_count >= 6:  # count stable 6 times in a row
                        break
                else:
                    stable_count = 0
                    last_count = current_count
                
                time.sleep(1)  # wait 1 second before checking again
            
                self.company_count = len(self.rows)
                if self.company_count == 0:
                    raise ValueError('Page does not load properly.\nRefreshing the page')

            except Exception as e :
                print(e)
                self.driver.refresh()
                for _ in range(5):
                    try:
                        select_feild = self.wait.until(
                            ec.element_to_be_clickable((By.XPATH,'//*[@id="ipo-section"]/main/div[2]/div[3]/div[2]/div/div/select'))
                        )
                    except:
                        time.sleep(1)
                select_button = Select(select_feild)
                select_button.select_by_visible_text('All')
                
        print(f"{self.company_count} companies loaded.")

    
    def get_mainboard_data(self,issue_type):

        for _ in range(5):
            try:
                main_board_button = self.wait.until(
                    ec.element_to_be_clickable((By.CSS_SELECTOR,'#ipo-section > main > div:nth-child(2) > div.ipo_filter > div > div.disflex > div > div:nth-child(3) > a'))
                )
                break

            except:
                time.sleep(1)
        # main_board_button = self.wait_until_element_to_be_clickable('XPATH','//*[@id="ipo-section"]/main/div[2]/div[1]/div/div[2]/div/div[3]/a')
        self.driver.execute_script("arguments[0].click();", main_board_button) 
        self.wait_to_page_load()
        self.click_all()
        self.wait_to_page_load()
        self.rows_to_loaded()
        self.scrape_basic_data(issue_type = issue_type)


    def get_sme_data(self,issue_type):

        for _ in range(5):
            try:
                sme_button = self.wait.until(
                    ec.element_to_be_clickable((By.CSS_SELECTOR,'#ipo-section > main > div:nth-child(2) > div.ipo_filter > div > div.disflex > div > div:nth-child(2) > a'))
                )
            except:
                time.sleep(1)
        self.driver.execute_script("arguments[0].click();", sme_button) 
        self.wait_to_page_load()
        self.click_all()
        self.wait_to_page_load()
        self.rows_to_loaded()
        self.scrape_basic_data(issue_type = issue_type)


    def scrape_basic_data(self,issue_type):

        for _ in range(5):
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.TAG_NAME,'table'))
                )
                page_html = self.driver.page_source
                break
            except:
                time.sleep(1)
        soup = BeautifulSoup(page_html,'html.parser')
        for _ in range(5):
            try:
                rows = soup.find('div',class_='recent_detail').find('table').find('tbody').find_all('tr')
                break

            except:
                time.sleep(1)

        
        for row in rows:
            values = row.find_all('td')
            data = {
                'Company':values[0].text,
                'LTP':values[1].text,
                'Market Cap(in CR)':values[2].text,
                'Listing date':values[3].text,
                'Issue size':values[4].text,
                'Issue price':values[5].text,
                'QIB subscription':values[6].text,
                'HNI subscription':values[7].text,
                'Retail subscription':values[8].text,
                'Total subscription':values[9].text,
                'Listing open':values[10].text,
                'Listing close':values[11].text,
                'Listing gain':values[12].text,
                'Current gain':values[13].text
                }
            if issue_type.lower() == 'sme':
                data['Issue type'] = 'SME'
            elif issue_type.lower() == 'mainboard':
                data['Issue type'] = 'MainBoard'

            else:
                print('Pass Issue type among these [SME,MainBoard]')
                break
            self.companies_primary_data.append(data)
            self.company_counter += 1
    
    def each_company_data(self,type_of_data):
        self.type_of_data = type_of_data
        for row in self.rows:
            # self.wait_to_page_load()
            values = row.find_element(By.TAG_NAME,'td')

            # Accessing Company name and company detail link

            success = False
            for _ in range(5):
                try:
                    self.wait.until(
                        ec.presence_of_element_located((By.CSS_SELECTOR, 'div.recent_detail div.panel_table table tbody tr[role="row"]'))
                    )
                    # self.wait_to_page_load()
                    link = values.find_element(By.TAG_NAME, 'a')
                    self.company_name = values.text.strip()
                    success = True
                    break

                except Exception as e:
                    print(e)
                    time.sleep(1)

            if not success:
                print(f"Skipping row {self.company_counter}")
                self.company_counter += 1
                continue

            self.type_of_data = self.type_of_data.lower().strip()
            self.driver.execute_script("arguments[0].click();", link) 
            if self.type_of_data == 'secondary data':
                self.get_subscription_and_secondary_data()
                
            elif self.type_of_data == 'annual result':
                self.click_fundamental()
                self.scrape_financial_data()

            elif self.type_of_data == 'cash flow':
                self.click_fundamental()
                self.click_cash_flow()
                self.scrape_financial_data()

            elif self.type_of_data == 'balance sheet':
                self.click_fundamental()
                self.click_balance_sheet()
                self.scrape_financial_data()
            elif self.type_of_data == 'financial ratio':
                self.click_fundamental()
                self.click_financial_ratio()
                self.scrape_financial_data()

            else:
                print('Enter input among these [Secondary data,Annual result,Cash Flow,Balance Sheet,Financiall ratio]')


    def get_subscription_and_secondary_data(self):
        Subscription_loaded = False
        for _ in range(5): # Check 5 times
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.XPATH,'//*[@id="ipo-section"]/main/section[1]/div[1]/table[2]/tbody/tr'))
                )
                subscription_status_feild = self.driver.find_element(By.XPATH,'//*[@id="ipo-section"]/main/section[1]/div[1]/table[2]/tbody/tr')
                subscription_status = subscription_status_feild.find_element(By.TAG_NAME,'td').text.strip()
                if '-' == subscription_status:
                    raise ValueError('Instead of value hypen(-) is present.\nRefreshing the page....')

                Subscription_loaded = True
                break
            except Exception as e:
                print(f'Re Checking the presence of subscritpion status loaded or not for company {self.company_name}. Error : {e}')
                self.driver.refresh()
                time.sleep(1)

        
        if not Subscription_loaded:
            print('Subscription status not available')
            subscription_status = np.nan


        secondary_data_available = False
        for _ in range(5):
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.XPATH,'//*[@id="ipo-section"]/main/section[1]/div[2]/table'))
                )
                secondary_data_rows = self.driver.find_element(By.XPATH,'//*[@id="ipo-section"]/main/section[1]/div[2]/table/tbody/tr')
                secondary_data_rows_value = secondary_data_rows.find_elements(By.TAG_NAME,'td')
                minimum_investment = secondary_data_rows_value[0].text.strip()
                lot_size = secondary_data_rows_value[1].text.strip()
                no_of_shares = secondary_data_rows_value[2].text.strip()
                price_range = secondary_data_rows_value[3].text.strip()
                post_issue_promoter_holding = secondary_data_rows_value[4].text.strip()
                issue_size = secondary_data_rows_value[5].text.strip()
                secondary_data_available = True
                if '-' in [minimum_investment,lot_size,no_of_shares,price_range,post_issue_promoter_holding,issue_size]:
                    raise ValueError('Instead of value hypen(-) is present.\nRefreshing the page....')
                break

            except Exception as e:
                print(f'Re Checking the presence of secondary data in table for company {self.company_name}. Error : {e} ')
                self.driver.refresh()
                time.sleep(1)

        

        headers_available = False
        for _ in range(5):
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.CLASS_NAME,'ipo-stock-name'))
                )
                further_detail_feild = self.driver.find_element(By.CLASS_NAME,'ipo-stock-name')
                further_detail = further_detail_feild.find_element(By.TAG_NAME,'a')
                headers_available = True

            except Exception as e:
                print(f'Retrying to find link for further details of company {self.company_name} . Error : {e}')
                time.sleep(1)

        
        further_detail.click()
        # print('Entered in Company detail page')

        for _ in range(5):
            try:
                sec_ind_field = self.wait.until(
                    ec.presence_of_element_located((By.CLASS_NAME,'tl_breadcrumb'))
                )
                sec_ind_info = sec_ind_field.find_elements(By.TAG_NAME,'li')
                try:
                    sector = sec_ind_info[1].text.strip()
                except:
                    sector = np.nan
                try:
                    industry = sec_ind_info[2].text.strip()
                except:
                    industry = np.nan

                if '-' in [sector,industry]:
                    raise ValueError('Instead of value hypen(-) is present.\nRefreshing the page....')

            except Exception as e:
                print(f'Setor Industry info is missing of compnay {self.company_name}. Error{e}')
                self.driver.refresh()
                time.sleep(2)

        for _ in range(5):
            try:
                nse_bse = self.wait.until(
                    ec.presence_of_element_located((By.CLASS_NAME,'stock_info_details'))
                )

                nse_bse = nse_bse.find_element(By.CLASS_NAME,'stock_exchange_details')
                
                nse_symbol_bse_code = nse_bse.text.strip()

                if nse_symbol_bse_code == '-':
                    raise ValueError('Content of page is not loaded fully./nRefreshing the page....')

            except Exception as e:
                print(f'Retrying to fetch nse and bse detail of company {self.company_name}. Error{e}')
                self.driver.refresh()
                time.sleep(2)
                


        data = {
                'Company':self.company_name,
                'Subscritpion Status':subscription_status,
                'Minimum Investment': minimum_investment,
                'Lot size':lot_size,
                'Issue size':issue_size,
                'No. of Shares':no_of_shares,
                'Price Range':price_range,
                'Post issue promoter holding':post_issue_promoter_holding,
                'Sector':sector,
                'Industry':industry,
                'Nse_symbol_Bse_Code':nse_symbol_bse_code
            }
        self.company_counter += 1
        
        self.companies_secondary_data.append(data)

        # time.sleep(3)
        self.driver.back()
        self.wait_to_page_load()
        self.driver.back()
        self.wait_to_page_load()
        

    def click_fundamental(self):
        counter = 1
        for _ in range(5):
            try:
                fundamental = self.wait.until(
                    ec.presence_of_element_located((By.XPATH,'//*[@id="ipo-section"]/main/nav/div/a[2]'))
                )
                fundamental.click()
                break
            except:
                if counter:
                    self.driver.refresh()
                time.sleep(1)
    def click_balance_sheet(self):
        for _ in range(5):
            try:
                for i in range(3,7):
                    balance_sheet = self.wait.until(
                        ec.presence_of_element_located((By.XPATH,f'//*[@id="fundamental_tables"]/div/div[1]/div/div[1]/div/div[{i}]'))
                    )
    
                    if balance_sheet.text.strip() == 'Balance Sheet':
                        balance_sheet.click()
                        break
            except Exception as e:
                print(e)
                time.sleep(1)

    def click_cash_flow(self):
        for _ in range(5):
            clicked = False
            try:
                for i in range(3,8):
                    cash_flows = self.wait.until(
                        ec.presence_of_element_located((By.XPATH,f'//*[@id="fundamental_tables"]/div/div[1]/div/div[1]/div/div[{i}]'))
                    )
    
                    if cash_flows.text.strip() == 'Cash Flow':
                        cash_flows.click()
                        break
    
            except:
                time.sleep(1)

    def click_financial_ratio(self):
        for _ in range(5):
            try:
                for i in range(3,8):
                    financial_ratio = self.wait.until(
                        ec.presence_of_element_located((By.XPATH,f'//*[@id="fundamental_tables"]/div/div[1]/div/div[1]/div/div[{i}]'))
                    )
    
                    if financial_ratio.text.strip() == 'Financial Ratios':
                        financial_ratio.click()
                        break
            except:
                time.sleep(1)

    def scrape_financial_data(self):
        universal_header = ["CAGR 3 Yrs","CAGR 5 Yrs","TTM",
                            "Mar '25","Mar '24","Mar '23",
                            "Mar '22","Mar '21","Mar '20",
                            "Mar '19","Mar '18","Mar '17",
                            "Mar '16","Mar '15"]

        
        for _ in range(5):
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.CSS_SELECTOR,'#fundamental_tables > div > div:nth-child(1) > div > div.hidden.tl__tab_section_active--piz9_ > div > div:nth-child(2) > div > div:nth-child(3) > div.table-responsive.react-table-v7-container > table > tbody > tr:nth-child(1) > td:nth-child(5) > div'))
                )                                                    
                break
            except Exception as e:
                status = False
                for _ in range(5):
                    try:
                        self.wait.until(
                            ec.presence_of_element_located((By.CSS_SELECTOR,'#fundamental_tables > div > div:nth-child(1) > div > div.hidden.tl__tab_section_active--piz9_ > div > div:nth-child(2) > div > div:nth-child(3) > div.table-responsive.react-table-v7-container > table > tbody > tr:nth-child(1) > td:nth-child(3) > div'))
                        ) 
                        status = True
                        break
                    except:
                        time.sleep(1)
                print(e)
                time.sleep(1)

        for _ in range(5):
            try:
                header = self.wait.until(
                    ec.visibility_of_any_elements_located((By.CSS_SELECTOR,'#fundamental_tables > div > div:nth-child(1) > div > div.hidden.tl__tab_section_active--piz9_ > div > div:nth-child(2) > div > div:nth-child(3) > div.table-responsive.react-table-v7-container > table > thead > tr > th'))
                )
                header_value = [head.text for head in header]
        
                rows = self.wait.until(
                    ec.visibility_of_any_elements_located((By.CSS_SELECTOR,'#fundamental_tables > div > div:nth-child(1) > div > div.hidden.tl__tab_section_active--piz9_ > div > div:nth-child(2) > div > div:nth-child(3) > div.table-responsive.react-table-v7-container > table > tbody > tr'))
                )
                break
            except:
                time.sleep(1)

        ipo_data = dict()
        for row in rows:
            ipo_data = dict()
            table_val = row.find_elements(By.TAG_NAME,'td')
            indicator = table_val[0].text.strip()
            ipo_data['Company'] = self.company_name
            ipo_data['Indicator'] = indicator
            
            value_count = 2
            
            for head in universal_header:
                if head in header_value:
                    ipo_data[head] = table_val[value_count].text.strip()
                    value_count += 1
                else:
                    ipo_data[head] = np.nan

            self.companies_financial_data.append(ipo_data)       

        self.driver.back()
        self.wait_to_page_load()

    def access_each_company_data(self,type_of_data):
        self.access_page()
        self.wait_to_page_load()
        self.click_all()
        self.each_company_data(type_of_data=type_of_data)

    def access_primary_data(self,type_of_ipo='SME'):

        self.access_page()
        self.wait_to_page_load()
        if type_of_ipo.lower() == 'sme':
            self.get_sme_data(issue_type=type_of_ipo)
        elif type_of_ipo.lower() == 'mainboard':
            self.get_mainboard_data(issue_type=type_of_ipo)

        else:
            print('Pass these [SME,MainBoard] parameter in type of IPO')

### Data Collection

In [17]:
for i in range(2019,2026):
    for j in ['Secondary data','Annual result','Cash Flow','Balance Sheet','Financial ratio']:
        ipo = IPO_Script(10,i)
        ipo.access_each_company_data(j)
        if j == 'Secondary data':
            df = pd.DataFrame(ipo.companies_secondary_data)
        else:
            df = pd.DataFrame(ipo.companies_financial_data)
        if ipo.company_count == df.Company.nunique():
            df.to_csv(f'{j}-{i}.csv',index=False)

62 companies loaded.
Re Checking the presence of subscritpion status loaded or not for company Prince Pipes. Error : Instead of value hypen(-) is present.
Refreshing the page....
Retrying to fetch nse and bse detail of company Gensol Engineering. Error'NoneType' object has no attribute 'strip'


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=138.0.7204.169)
Stacktrace:
	GetHandleVerifier [0x0x7ff734dfe415+77285]
	GetHandleVerifier [0x0x7ff734dfe470+77376]
	(No symbol) [0x0x7ff734bc9a6a]
	(No symbol) [0x0x7ff734ba1f21]
	(No symbol) [0x0x7ff734c4f9be]
	(No symbol) [0x0x7ff734c70192]
	(No symbol) [0x0x7ff734c483e3]
	(No symbol) [0x0x7ff734c11521]
	(No symbol) [0x0x7ff734c122b3]
	GetHandleVerifier [0x0x7ff7350e1efd+3107021]
	GetHandleVerifier [0x0x7ff7350dc29d+3083373]
	GetHandleVerifier [0x0x7ff7350fbedd+3213485]
	GetHandleVerifier [0x0x7ff734e1884e+184862]
	GetHandleVerifier [0x0x7ff734e2055f+216879]
	GetHandleVerifier [0x0x7ff734e07084+113236]
	GetHandleVerifier [0x0x7ff734e07239+113673]
	GetHandleVerifier [0x0x7ff734dee298+11368]
	BaseThreadInitThunk [0x0x7ff80a8ee8d7+23]
	RtlUserThreadStart [0x0x7ff80b17c34c+44]


In [10]:
for i in range(2019,2026):
    for j in ['SME','MAINBOARD']:
        ipo = IPO_Script(10,i)
        ipo.access_primary_data(j)
        df = pd.DataFrame(ipo.companies_primary_data)
        if ipo.company_count == df.Company.nunique():
            df.to_csv(f'{j}-{i}.csv',index=False)

### Concatinating collected data

In [None]:
annual_result_df = pd.DataFrame(columns=["Company","Indicator","CAGR 3 Yrs","CAGR 5 Yrs","TTM","Mar '25","Mar '24","Mar '23","Mar '22","Mar '21","Mar '20","Mar '19","Mar '18","Mar '17","Mar '16","Mar '15"])
balance_sheet_df = pd.DataFrame(columns=["Company","Indicator","CAGR 3 Yrs","CAGR 5 Yrs","TTM","Mar '25","Mar '24","Mar '23","Mar '22","Mar '21","Mar '20","Mar '19","Mar '18","Mar '17","Mar '16","Mar '15"])
cash_flow_df = pd.DataFrame(columns=["Company","Indicator","CAGR 3 Yrs","CAGR 5 Yrs","TTM","Mar '25","Mar '24","Mar '23","Mar '22","Mar '21","Mar '20","Mar '19","Mar '18","Mar '17","Mar '16","Mar '15"])
financial_ratio_df = pd.DataFrame(columns=["Company","Indicator","CAGR 3 Yrs","CAGR 5 Yrs","TTM","Mar '25","Mar '24","Mar '23","Mar '22","Mar '21","Mar '20","Mar '19","Mar '18","Mar '17","Mar '16","Mar '15"])
mainboard_df = pd.DataFrame(columns=["Company","LTP","Market Cap(in CR)","Listing date","Issue size","Issue price","QIB subscription","HNI subscription","Retail subscription","Total subscription","Listing open","Listing close","Listing gain","Current gain","Issue type"])
sme_df = pd.DataFrame(columns=["Company","LTP","Market Cap(in CR)","Listing date","Issue size","Issue price","QIB subscription","HNI subscription","Retail subscription","Total subscription","Listing open","Listing close","Listing gain","Current gain","Issue type"])
secondary_df = pd.DataFrame(columns=['Company', 'Subscritpion Status', 'Minimum Investment', 'Lot size','Issue size', 'No. of Shares', 'Price Range','Post issue promoter holding', 'Sector', 'Industry','Nse_symbol_Bse_Code'])

In [None]:
for i in range(2019,2026):
    df_ = pd.read_csv(f'Annual result-{i}.csv')
    annual_result_df = pd.concat([annual_result_df,df_],ignore_index=True)

In [None]:
for i in range(2019,2026):
    df_ = pd.read_csv(f'Balance Sheet-{i}.csv')
    balance_sheet_df = pd.concat([balance_sheet_df,df_],ignore_index=True)

In [None]:
for i in range(2019,2026):
    df_ = pd.read_csv(f'Financial ratio-{i}.csv')
    financial_ratio_df = pd.concat([financial_ratio_df,df_],ignore_index=True)

In [None]:
for i in range(2019,2026):
    df_ = pd.read_csv(f'Cash flow-{i}.csv')
    cash_flow_df = pd.concat([cash_flow_df,df_],ignore_index=True)

In [None]:
for year in range(2019,2026):
    df_ = pd.read_csv(f'SME-{year}.csv')
    sme_df = pd.concat([sme_df,df_])

In [None]:
for year in range(2019,2026):
    df_ = pd.read_csv(f'MAINBOARD-{year}.csv')
    mainboard_df = pd.concat([mainboard_df,df_])

In [None]:
for year in range(2019,2026):
    df_ = pd.read_csv(f'Secondary data-{year}.csv')
    secondary_df = pd.concat([secondary_df,df_],ignore_index=True)

In [None]:
annual_result_df.to_csv("Annual-Result-2019-2025.csv")
balance_sheet_df.to_csv("Balance-Sheet-2019-2025.csv")
cash_flow_df.to_csv("Cash-Flow-2019-2025.csv")
financial_ratio_df.to_csv("Financial_Ratio-2019-2025.csv")
mainboard_df.to_csv('MainBoard-Data-2019-2025.csv')
sme_df.to_csv('SME-Data-2019-2025.csv')
secondary_df.to_csv('Secondary-Data-2019-2025.csv')

In [None]:
print(pd.read_csv('Annual-Result-2019-2025.csv').Company.nunique())
print(pd.read_csv('Balance-Sheet-2019-2025.csv').Company.nunique())
print(pd.read_csv('Cash-Flow-2019-2025.csv').Company.nunique())
print(pd.read_csv('Financial_Ratio-2019-2025.csv').Company.nunique())
print(pd.read_csv('SME-Data-2019-2025.csv').Company.nunique() + pd.read_csv('MainBoard-Data-2019-2025.csv').Company.nunique())
print(pd.read_csv('Secondary-Data-2019-2025.csv').Company.nunique())