In [1]:
import pandas as pd
import numpy as np


# Read the symbol table from symbols.csv in the working directory, then leave
# the frame as the cell's last expression so the notebook renders it.
companies_list = pd.read_csv(filepath_or_buffer="symbols.csv")
companies_list

Unnamed: 0,Symbol,Name
0,RELIANCE,Reliance Industries
1,TCS,Tata Consultancy Services
2,HDFCBANK,HDFC Bank
3,ICICIBANK,ICICI Bank
4,BHARTIARTL,Bharti Airtel
...,...,...
815,LGBBROSLTD,LG Balakrishnan & Bros
816,NSIL,Nalwa Sons Investments
817,CARERATING,CARE Ratings
818,MEDIASSIST,Medi Assist Healthcare


In [17]:
import requests
from bs4 import BeautifulSoup
import os
import time

# Company symbols whose screener.in pages are scanned for transcript links.
companies = ['RELIANCE']

# Seconds to wait on any single HTTP request. requests applies no timeout by
# default, so an unresponsive server would otherwise block the cell forever.
REQUEST_TIMEOUT = 10

# Create the output folder; exist_ok makes re-running the cell a no-op here.
os.makedirs('screener_pdfs', exist_ok=True)

# This configuration is necessary for creating a session with bse
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Referer': 'https://www.bseindia.com/',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1'
})

# Extra BSE-specific headers sent with every PDF request. They never change,
# so the dict is built once instead of once per link.
headers = {
    'Origin': 'https://www.bseindia.com',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin'
}

print("Creating a BSE India session...")
try:
    # Visit the BSE home page first with the shared session
    # (presumably to pick up cookies the PDF endpoint expects - TODO confirm).
    session.get("https://www.bseindia.com/", timeout=REQUEST_TIMEOUT)
except requests.RequestException as e:
    # Best effort: report the problem and still try the downloads below.
    print(f"⚠️ Could not reach BSE India: {e}")

for company in companies:
    print(f"\nChecking {company}...")
    url = f'https://www.screener.in/company/{company}/consolidated/'

    try:
        page = session.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"⚠️ Page load failed: {e}")
        continue

    soup = BeautifulSoup(page.text, 'html.parser')
    pdf_links = soup.find_all('a', class_='concall-link', title="Raw Transcript")

    if not pdf_links:
        print(f"No PDF links found for {company}")
        continue

    print(f"Successfully found {len(pdf_links)} PDFs for {company}")
    print(f"{'-#-'*30}")

    for i, link in enumerate(pdf_links, 1):
        # .get() instead of ['href']: an anchor without href must not abort
        # the whole run with a KeyError.
        pdf_url = link.get('href')
        if not pdf_url:
            print(f"Skipping link {i}: no href attribute")
            continue

        print(f"Attempting PDF {i}: {pdf_url}")

        if not pdf_url.endswith('.pdf'):
            print("Skipping non-PDF link")
            continue

        try:
            response = session.get(pdf_url, headers=headers, timeout=REQUEST_TIMEOUT)

            # Verify successful response
            if response.status_code != 200:
                print(f"Failed with status code: {response.status_code}")
                continue

            # Only save the body when the server labels it as a PDF.
            if 'application/pdf' not in response.headers.get('Content-Type', ''):
                print("Response is not a PDF")
                continue

            # Generate filename (index i keeps the order of links on the page)
            filename = f"screener_pdfs/{company}_concall_{i}.pdf"

            # Save PDF
            with open(filename, 'wb') as f:
                f.write(response.content)

            print(f"✅ Successfully saved: {filename}")

            # Add delay to avoid rate limiting
            time.sleep(1)

        except (requests.RequestException, OSError) as e:
            # Network failures and file-write failures are reported per link
            # so the remaining links are still attempted.
            print(f"Download failed: {e}")

print("\n🏁 Done")

Creating a BSE India session...

Checking RELIANCE...
Successfully found 11 PDFs for RELIANCE
-#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#--#-
Attempting PDF 1: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=aa0547ba-09ef-404d-b57d-c216d1513f17.pdf
✅ Successfully saved: screener_pdfs/RELIANCE_concall_1.pdf
Attempting PDF 2: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=89c33a3e-5683-4a1d-b8f9-1952ff4df63f.pdf
✅ Successfully saved: screener_pdfs/RELIANCE_concall_2.pdf
Attempting PDF 3: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=0e6309fa-450f-4db6-88b2-d76d9a3ff756.pdf
✅ Successfully saved: screener_pdfs/RELIANCE_concall_3.pdf
Attempting PDF 4: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=191e6ab9-4751-4017-ba5f-bb63daf2d871.pdf
✅ Successfully saved: screener_pdfs/RELIANCE_concall_4.pdf
Attempting PDF 5: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=06b05d06-8c38-4aee-aa5b-5e7108ae3871.pdf

In [None]:
import pdfplumber
import re

def extract_dialogues_from_pdf(pdf_path: str, top_margin: int = 70, bottom_margin: int = 100,
                               left_margin: int = 30, right_margin: int = 30) -> str:
    """
    Extract dialogues from a PDF, excluding headers and footers by cropping pages. This function also skips the first
    page and drops a line that is repeated across a page break.

    Args:
    pdf_path (str): Path to the PDF file.
    top_margin (int): Number of points to exclude from the top of each page.
    bottom_margin (int): Number of points to exclude from the bottom of each page.
    left_margin (int): Number of points to exclude from the left of each page.
    right_margin (int): Number of points to exclude from the right of each page.

    Returns:
    str: A string containing all extracted dialogues, one "Speaker: Dialogue" entry per line
    (each entry ends with a newline). Text that appears before the first recognised speaker is discarded.
    """
    # More lenient regex to capture speaker names including initials or single names
    speaker_pattern = re.compile(r'^([A-Z][a-zA-Z.]+(?: [A-Z][a-zA-Z.]+)*):')

    dialogues = []            # finished "Speaker: text\n" entries, joined once at the end
    previous_last_line = ""   # last non-blank line of the previous page
    current_speaker = None
    current_parts = []        # text fragments of the dialogue being assembled

    with pdfplumber.open(pdf_path) as pdf:
        # Skip the first page
        for page in pdf.pages[1:]:
            # pdfplumber bounding boxes are (x0, top, x1, bottom), with both
            # vertical values measured from the TOP of the page. So the top
            # margin is the `top` coordinate and the bottom margin is
            # subtracted from the page height.
            bbox = (
                left_margin,
                top_margin,
                page.width - right_margin,
                page.height - bottom_margin,
            )
            page_text = page.crop(bbox).extract_text() or ""

            # Normalise once: strip every line and drop blank ones so they do
            # not inject extra spaces into the joined dialogue.
            lines = [raw.strip() for raw in page_text.split("\n")]
            lines = [text for text in lines if text]

            # Drop a first line that merely repeats the previous page's last line.
            if lines and previous_last_line and lines[0] == previous_last_line:
                lines = lines[1:]

            # Process each line to extract speaker and dialogue
            for line in lines:
                speaker_match = speaker_pattern.match(line)
                if speaker_match:
                    # A new speaker starts: flush the previous speaker's entry.
                    if current_speaker:
                        dialogue = " ".join(current_parts)
                        dialogues.append(f"{current_speaker}: {dialogue}\n")
                    current_speaker = speaker_match.group(1)
                    # Slice the same (stripped) string the regex matched, so
                    # the match offset is valid.
                    opening = line[speaker_match.end():].strip()
                    current_parts = [opening] if opening else []
                elif current_speaker:
                    current_parts.append(line)

            # Remember the last line of this page for the next page-break check
            previous_last_line = lines[-1] if lines else ""

        # Include the final speaker's dialogue if any remains
        if current_speaker and current_parts:
            dialogue = " ".join(current_parts)
            dialogues.append(f"{current_speaker}: {dialogue}\n")

    return "".join(dialogues)

# Run the extractor on the transcript saved as RELIANCE_concall_1.pdf, passing
# the crop margins explicitly, and print the resulting "Speaker: Dialogue" text.
pdf_path = "screener_pdfs/RELIANCE_concall_1.pdf"
extracted_dialogues = extract_dialogues_from_pdf(pdf_path, top_margin=70, bottom_margin=100, left_margin=30, right_margin=30)
print(extracted_dialogues)

In [None]:
def create_dialogue_chunks(extracted_text):
    """
    Build structured dialogue records from "Speaker: Dialogue" text.

    The input is stripped and split on newlines. Each line that contains a colon is split at its FIRST colon
    into a speaker (left part) and the spoken text (right part), both whitespace-trimmed. Blank lines and lines
    without a colon are ignored.

    Args:
    extracted_text (str): Extracted dialogues as a single string.

    Returns:
    list: A list of dictionaries with the keys "chunk_id", "speaker" and "text". "chunk_id" is the 1-based
    position of the line in the split input, so ignored lines leave gaps in the numbering.
    """
    records = []

    for position, raw_line in enumerate(extracted_text.strip().split("\n"), start=1):
        # partition() yields an empty separator when no colon exists, which
        # covers both blank lines and lines without a speaker prefix.
        speaker, colon, spoken = raw_line.strip().partition(":")
        if not colon:
            continue

        records.append({
            "chunk_id": position,
            "speaker": speaker.strip(),
            "text": spoken.strip()
        })

    return records

# Convert the extracted "Speaker: Dialogue" text into structured records
chunks = create_dialogue_chunks(extracted_dialogues)

# Show every record as three labelled lines followed by a "---" separator
for chunk in chunks:
    print(
        f"Chunk ID: {chunk['chunk_id']}",
        f"Speaker: {chunk['speaker']}",
        f"Text: {chunk['text']}",
        "---",
        sep="\n",
    )