In [None]:
!pip install selenium

In [None]:
import glob
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import os
import time

# Tickers whose annual reports will be fetched.
stock_codes = [
    'ABB', 'POWERINDIA', 'ABMINTLTD', 'ACC', 'ACCELYA',
    'ACCORD', 'ACCURACY', 'ACEINTEG', 'ACE', 'ADANIENT',
    'ADANIGAS', 'ADANIGREEN', 'ADANIPORTS', 'ADANIPOWER', 'ADANITRANS',
]

# PDFs land in ./downloads relative to the current working directory;
# create the folder up front so the browser has somewhere to write.
download_dir = os.path.join(os.getcwd(), "downloads")
os.makedirs(download_dir, exist_ok=True)

# Headless Firefox whose download preferences point at download_dir and
# save PDFs to disk instead of opening them in the built-in viewer.
firefox_options = Options()
firefox_options.add_argument("--headless")
for _pref_name, _pref_value in (
    ("browser.download.folderList", 2),
    ("browser.download.dir", download_dir),
    ("browser.helperApps.neverAsk.saveToDisk", "application/pdf"),
    ("pdfjs.disabled", True),  # Disable PDF viewer
):
    firefox_options.set_preference(_pref_name, _pref_value)

service = Service()
driver = webdriver.Firefox(service=service, options=firefox_options)
driver.set_window_size(1920, 1080)

def download_pdf(pdf_link_element, stock_code, financial_year):
    """Click a PDF link and rename the file it downloads.

    Uses the module-level ``driver`` (Selenium WebDriver) and ``download_dir``.

    Args:
        pdf_link_element: Selenium element for the link; its ``href`` is
            logged and the element is scrolled into view and clicked.
        stock_code: Ticker used in log messages and in the target file name.
        financial_year: Label used in log messages and in the target file name.

    Returns:
        None. On success the new file is moved to
        ``<download_dir>/<stock_code>_<financial_year>.pdf``. Any exception is
        caught and reported with ``print`` so one failed link does not stop
        the caller's loop.
    """
    try:
        pdf_url = pdf_link_element.get_attribute("href")
        print(f"Attempting download for {stock_code} - Financial Year: {financial_year}")
        print(f"Link found: {pdf_url}")

        # Remember which PDFs exist before clicking. Without this snapshot a
        # failed download would cause an older file (e.g. the previous stock's
        # report) to be picked as "latest" and renamed.
        pdf_pattern = os.path.join(download_dir, "*.pdf")
        files_before = set(glob.glob(pdf_pattern))

        # Scroll to element to ensure it's in the viewport
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", pdf_link_element)
        time.sleep(1)

        ActionChains(driver).move_to_element(pdf_link_element).click().perform()
        time.sleep(5)  # Wait for download

        # Only files that appeared after the click count as this download.
        new_files = [path for path in glob.glob(pdf_pattern) if path not in files_before]
        if new_files:
            latest_file = max(new_files, key=os.path.getctime)  # Get the most recent new file
            target_file = os.path.join(download_dir, f"{stock_code}_{financial_year}.pdf")
            # os.replace overwrites an existing target on every platform,
            # so re-running the notebook refreshes the report instead of failing.
            os.replace(latest_file, target_file)
            print(f"Downloaded PDF saved as {target_file}")
        else:
            print(f"PDF for {stock_code} ({financial_year}) did not download as expected.")
    except Exception as e:
        print(f"Error downloading PDF for {stock_code} ({financial_year}): {e}")

# Process each stock code.
# The outer try/finally guarantees the browser is shut down even when the run
# is interrupted (e.g. KeyboardInterrupt from "Interrupt Kernel"), which the
# per-stock `except Exception` does not catch.
try:
    for stock_code in stock_codes:
        try:
            url = f'https://www.screener.in/company/{stock_code}/consolidated/'
            driver.get(url)
            time.sleep(5)  # Wait for page load
            pdf_links = driver.find_elements(By.CSS_SELECTOR, "div.documents.annual-reports a[href*='.pdf']")

            if not pdf_links:
                print(f"No PDF links found for stock {stock_code}")
                continue

            for pdf_link in pdf_links[:2]:  # Limit downloads per stock
                # Only spaces are replaced; any other whitespace in the link
                # text (such as a line break) ends up in the file name as is.
                financial_year = pdf_link.text.strip().replace(" ", "_")
                download_pdf(pdf_link, stock_code, financial_year)
        except Exception as e:
            # One failing stock must not abort the remaining ones.
            print(f"An error occurred for stock {stock_code}: {e}")
finally:
    driver.quit()


In [None]:
!pip install pdfplumber

In [None]:
!pip install nltk

In [None]:
import os
import pdfplumber
from tqdm import tqdm
from pdfminer.pdfparser import PDFSyntaxError  # Import the error for explicit handling

def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF with each page's text followed by a newline.

    The file is opened with pdfplumber; pages for which ``extract_text()``
    yields nothing (``None`` or an empty string) contribute nothing.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Joined inside the `with` so pages are read while the PDF is open.
        return "".join(f"{content}\n" for content in page_texts if content)

# Convert every PDF in input_dir to a UTF-8 text file of the same base name
# in output_dir. Already-converted files are skipped, so the cell can be re-run.
input_dir = "./downloads"
output_dir = "./textfiles"
os.makedirs(output_dir, exist_ok=True)

pdf_files = [f for f in os.listdir(input_dir) if f.endswith(".pdf")]

print(f"Found {len(pdf_files)} PDF files to process.")

for filename in tqdm(pdf_files, desc="Extracting text from PDFs", unit="pdf"):
    # Swap only the trailing ".pdf" (guaranteed by the filter above);
    # str.replace would also rewrite ".pdf" appearing elsewhere in the name.
    txt_filename = filename[:-len(".pdf")] + ".txt"
    txt_path = os.path.join(output_dir, txt_filename)

    # Check if text file already exists
    if os.path.exists(txt_path):
        continue

    pdf_path = os.path.join(input_dir, filename)
    try:
        text = extract_text_from_pdf(pdf_path)
    except PDFSyntaxError:
        print(f"Skipping {filename}: Not a valid PDF file or corrupted.")
        continue
    except Exception as e:
        print(f"Skipping {filename}: An unexpected error occurred: {e}")
        continue

    # Written only after extraction succeeded, so a failed PDF leaves no
    # text file behind and is retried on the next run.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)



In [None]:
!pip install transformers tqdm

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from tqdm import tqdm

In [None]:
import re
import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, logging
from tqdm import tqdm
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import textstat
from sentence_transformers import SentenceTransformer, util
from collections import Counter

# Keep transformers quiet: only error-level messages are logged.
logging.set_verbosity_error()
# Fetch NLTK's 'punkt' sentence-tokenizer data without download chatter.
nltk.download('punkt', quiet=True)
def extract_metrics(text):
    """Pull headline financial figures out of free text.

    For each metric the first occurrence of its label that is directly
    followed by an amount is used (case-insensitive). Between label and
    amount an optional ``:``/``–``/``-`` separator and an optional currency
    glyph are allowed.

    Args:
        text: Text to search.

    Returns:
        dict mapping "Revenue", "Profit After Tax (PAT)" and "EBITDA" to
        ``"<number>"`` or ``"<number> <unit>"`` (unit being one of
        crore/lakh/million/billion as written in the text), or ``"N/A"``
        when no amount follows the label anywhere in ``text``.
    """
    # Group 1 is the number alone and group 2 the unit, so joining them does
    # not repeat the unit. At least one digit is required, so bare
    # punctuation such as the "." in "Revenue." is not taken as a value.
    amount = r"(\d+(?:,\d+)*(?:\.\d+)?|\.\d+)(?:\s*(crore|lakh|million|billion))?"
    # Optional separator and currency glyph. The backtick is accepted because
    # extracted report text can show the rupee sign as "`" (e.g. "` 40,129 crore").
    lead_in = r"\s*[:–-]?\s*[₹`]?\s*"
    metric_labels = {
        "Revenue": r"Revenue",
        "Profit After Tax (PAT)": r"Profit\s*After\s*Tax\s*\(PAT\)",
        "EBITDA": r"EBITDA",
    }

    extracted = {}
    for metric, label in metric_labels.items():
        match = re.search(label + lead_in + amount, text, flags=re.IGNORECASE)
        if match:
            value = match.group(1)
            unit = match.group(2) or ""
            extracted[metric] = f"{value} {unit}".strip()
        else:
            extracted[metric] = "N/A"
    return extracted

def extract_relevant_text(file_path):
    """Collect the lines of a UTF-8 text file that mention a financial keyword.

    A line qualifies when its lower-cased form contains any keyword as a
    substring. Qualifying lines are stripped of surrounding whitespace and
    joined with single spaces; an empty string is returned if none qualify.
    """
    keywords = ("revenue", "profit", "ebitda", "expenses", "cash flow", "equity", "debt")

    def mentions_keyword(line):
        lowered = line.lower()
        return any(term in lowered for term in keywords)

    with open(file_path, 'r', encoding='utf-8') as report:
        return " ".join(line.strip() for line in report if mentions_keyword(line))
# Pipelines already built, keyed by (model name, device), so repeated calls
# do not reload the model weights.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model):
    """Return the summarization pipeline for ``model``, building it on first use."""
    device = 0 if torch.cuda.is_available() else -1
    cache_key = (model, device)
    if cache_key not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[cache_key] = pipeline("summarization", model=model, device=device)
    return _SUMMARIZER_CACHE[cache_key]


def summarize_text(text, model="sshleifer/distilbart-cnn-12-6", chunk_size=1024):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    Args:
        text: Text to summarize.
        model: Hugging Face model name passed to the summarization pipeline.
        chunk_size: Chunk length in characters (not tokens); the text is cut
            at fixed offsets, regardless of word or sentence boundaries.

    Returns:
        The per-chunk summaries joined with single spaces, or ``""`` when
        ``text`` is empty or whitespace only (no model is loaded in that case).
    """
    if not text or not text.strip():
        return ""

    summarizer = _get_summarizer(model)
    # Split text into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    summaries = []

    for chunk in tqdm(chunks, desc="Summarizing", unit="chunk"):
        if not chunk.strip():
            continue  # nothing to summarize in a whitespace-only chunk
        # max_length is about double the input token count, capped at 150.
        input_tokens = len(summarizer.tokenizer.encode(chunk))
        max_len = max(2, min(150, input_tokens * 2))
        # A short trailing chunk can push max_len below the usual minimum of
        # 20; keep min_length strictly below max_length so the length
        # constraints stay satisfiable.
        min_len = min(20, max_len - 1)
        summary = summarizer(chunk, max_length=max_len, min_length=min_len, truncation=True)[0]['summary_text']
        summaries.append(summary)
    return " ".join(summaries)
# Sentence encoders already loaded, keyed by model name, so repeated
# evaluations do not reload the weights.
_SENTENCE_ENCODERS = {}


def _get_sentence_encoder(name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer called ``name``, loading it only once."""
    if name not in _SENTENCE_ENCODERS:
        _SENTENCE_ENCODERS[name] = SentenceTransformer(name)
    return _SENTENCE_ENCODERS[name]


def _safe_ratio(numerator, denominator):
    """Divide two word counts; a zero denominator gives inf (or 0.0 for 0/0)."""
    if denominator:
        return numerator / denominator
    return float("inf") if numerator else 0.0


def evaluate_summary_with_original(summary, original_text):
    """Score a summary against the text it was produced from.

    Args:
        summary: Generated summary.
        original_text: Source text the summary is compared with.

    Returns:
        dict with:
          - "Readability Scores": dict of four textstat scores for the summary.
          - "Semantic Similarity": cosine similarity of the two embeddings.
          - "Conciseness Ratio": summary words / original words.
          - "Compression Ratio": original words / summary words.
          - "Redundancy (Repeated Words)": number of repeat occurrences of
            whitespace-separated tokens in the summary.
        A ratio with a zero denominator is ``float("inf")`` (``0.0`` when the
        numerator is zero too) instead of raising ZeroDivisionError.
    """
    # Readability metrics for the summary
    readability_scores = {
        "Flesch Reading Ease": textstat.flesch_reading_ease(summary),
        "Gunning Fog Index": textstat.gunning_fog(summary),
        "Smog Index": textstat.smog_index(summary),
        "Automated Readability Index": textstat.automated_readability_index(summary),
    }

    # Semantic similarity using SentenceTransformer.
    # NOTE(review): the whole report is encoded as a single string; the encoder
    # presumably truncates input beyond its maximum sequence length, so only
    # the start of a long report may influence this score — confirm.
    model = _get_sentence_encoder()
    original_embedding = model.encode(original_text, convert_to_tensor=True)
    summary_embedding = model.encode(summary, convert_to_tensor=True)
    semantic_similarity = util.pytorch_cos_sim(original_embedding, summary_embedding).item()

    # Conciseness and compression are reciprocal word-count ratios.
    original_word_count = len(original_text.split())
    words = summary.split()
    summary_word_count = len(words)
    conciseness_ratio = _safe_ratio(summary_word_count, original_word_count)
    compression_ratio = _safe_ratio(original_word_count, summary_word_count)

    # Redundancy: every occurrence of a token beyond its first counts once.
    redundancy = sum(count - 1 for count in Counter(words).values() if count > 1)

    return {
        "Readability Scores": readability_scores,
        "Semantic Similarity": semantic_similarity,
        "Conciseness Ratio": conciseness_ratio,
        "Compression Ratio": compression_ratio,
        "Redundancy (Repeated Words)": redundancy,
    }

def process_financial_report_with_eval(file_path):
    """Summarize one extracted report, evaluate the summary and print the results.

    The keyword-bearing lines of the file are summarized and searched for
    financial metrics; the summary is then evaluated against the complete
    file contents. Everything is written to stdout and nothing is returned.
    If no keyword-bearing lines exist, a notice is printed and the function
    returns early.
    """
    # The complete report is the reference text for the evaluation.
    with open(file_path, 'r', encoding='utf-8') as report:
        full_text = report.read()

    # Only the keyword-bearing lines are summarized.
    relevant_text = extract_relevant_text(file_path)
    if relevant_text.strip() == "":
        print("No relevant sections identified.")
        return None

    summary = summarize_text(relevant_text)
    metrics = extract_metrics(relevant_text)
    evaluation = evaluate_summary_with_original(summary, full_text)

    # Build the report line by line, then emit it in one go.
    report_lines = [
        "\n==================== Evaluation ====================",
        "Summary Evaluation Metrics:",
    ]
    for metric, score in evaluation.items():
        if isinstance(score, dict):
            # Nested score groups are flattened to one line per sub-metric.
            report_lines.extend(f"{sub_metric}: {sub_score}" for sub_metric, sub_score in score.items())
        else:
            report_lines.append(f"{metric}: {score}")

    report_lines.append("\n==================== Results ====================")
    report_lines.append("Extracted Financial Metrics:")
    report_lines.extend(f"{k}: {v}" for k, v in metrics.items())

    report_lines.append("\nGenerated Summary:")
    report_lines.append(summary)
    report_lines.append("====================================================")

    print("\n".join(report_lines))


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fetch NLTK's 'punkt_tab' resource (download output is not suppressed here).
# NOTE(review): presumably needed by newer NLTK releases for sentence
# tokenization, in addition to 'punkt' — confirm it is still required.
nltk.download('punkt_tab')

In [None]:
# Example: summarize and evaluate one extracted report.
# The "\n" is intentional: this file name contains a line break
# (presumably carried over from the scraped link text — confirm on disk).
file_path = "./textfiles/ADANIGREEN_Financial_Year_2024\nfrom_bse.txt"
process_financial_report_with_eval(file_path)

Summarizing: 100%|██████████| 76/76 [05:31<00:00,  4.36s/chunk]



Summary Evaluation Metrics:
Flesch Reading Ease: 44.14
Gunning Fog Index: 10.45
Smog Index: 14.4
Automated Readability Index: 12.8
Semantic Similarity: 0.4513867199420929
Conciseness Ratio: 0.01371094537391655
Compression Ratio: 72.93443104969127
Redundancy (Repeated Words): 2295

Extracted Financial Metrics:
Revenue: .
Profit After Tax (PAT): N/A
EBITDA: 25

Generated Summary:
 Adani family's equity stake in the Adani portfolio companies . NDTV profit industry-leading profitability . EBITDA and PAT of AWL was impacted on account of hedges .  This exceptional financial performance drove our PAT to a record high EBITDA in FY 2023-24 of ` 40,129 crore, marking a substantial 70.8% growth . We have continued to deploy latest 33% growth in revenue from power supply to ` 7,735 crore .  During the year, we tapped into diversified sources to raise equity and debt equity . We increased the debt funding pool with a clear roadmap aligned with the project cash flows . We received an equity invest