# **Installations**

In [8]:
! pip install paddlepaddle paddleocr

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.10.0-py3-none-any.whl.metadata (12 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Download

In [30]:
! pip install reportlab

Collecting reportlab
  Downloading reportlab-4.3.1-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.3.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.3.1


# **Extracting text using PaddleOCR**

In [2]:
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
import cv2
import numpy as np
import re
# Load the PDF
file_path = '/content/ir-featured-content-slides-jan-2016-160112130634.pdf'
doc = fitz.open(file_path)

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# OpenCV conversion to apply for each pixmap channel count so that every page
# reaches the OCR engine in BGR order. PyMuPDF pixmaps without alpha have
# 3 channels in RGB order, which the original code passed through unconverted.
# NOTE(review): assumes PaddleOCR expects BGR arrays like its OpenCV-based loaders — confirm.
_BGR_CONVERSIONS = {
    4: cv2.COLOR_RGBA2BGR,
    3: cv2.COLOR_RGB2BGR,
    1: cv2.COLOR_GRAY2BGR,
}

# Extract text from each page
recognized_words = []
try:
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pix = page.get_pixmap()

        # Convert to OpenCV format (height x width x channels, uint8)
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

        # Convert to BGR format
        if pix.n in _BGR_CONVERSIONS:
            img = cv2.cvtColor(img, _BGR_CONVERSIONS[pix.n])

        # Run PaddleOCR on the image
        result = ocr.ocr(img, cls=True)
        print(result)
        for line in result or []:
            # An entry can be empty/None when nothing was detected on the page;
            # skip it instead of failing with "NoneType is not iterable".
            if not line:
                continue
            for word in line:
                # word[1] is the recognition pair; its first element is the text.
                recognized_words.append(word[1][0])
finally:
    # Release the PDF file handle even if OCR fails part-way through.
    doc.close()

# Same format as before: every recognized fragment followed by a single space.
extracted_text = "".join(f"{fragment} " for fragment in recognized_words)



# Cleaning the extracted text
def preprocess_text(text):
    """Normalise OCR text for keyword matching.

    Removes every character that is not a word character, whitespace, '.' or
    ',', collapses runs of whitespace into a single space, trims the ends and
    lowercases the result.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Strip special characters first: removing a symbol that sits between two
    # spaces ("a & b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
    return text.strip().lower()

# Clean the OCR output, then show the raw text followed by the cleaned text.
cleaned_text = preprocess_text(text=extracted_text)
print(extracted_text, cleaned_text, sep="\n")

[2025/03/20 17:10:20] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_l

# **Feature Engineering and Assigning Weights**

In [3]:
# Keywords that signal each pitch-deck section. Note that 'revenue' is listed
# under both 'Business Model' and 'Financials'.
_SECTION_KEYWORDS = {
    'Problem': ('problem', 'pain point', 'challenge'),
    'Solution': ('solution', 'approach', 'how it works'),
    'Market': ('market', 'industry', 'competition'),
    'Business Model': ('business model', 'revenue', 'monetization'),
    'Financials': ('financials', 'funding', 'revenue', 'expenses'),
    'Team': ('team', 'founder', 'experience'),
}

# Section name -> regex: a single group alternating over that section's keywords.
SECTION_PATTERNS = {
    name: '(' + '|'.join(keywords) + ')'
    for name, keywords in _SECTION_KEYWORDS.items()
}

def extract_sections(text, patterns=None):
    """Flag which sections are mentioned in ``text``.

    Args:
        text: Text to search. ``None`` or an empty string yields all zeros.
        patterns: Optional mapping of section name -> regex pattern. Defaults
            to the module-level ``SECTION_PATTERNS``.

    Returns:
        A dict with one entry per section (in the mapping's order): 1 if the
        section's pattern matches anywhere in ``text`` (case-insensitively),
        otherwise 0.
    """
    if patterns is None:
        patterns = SECTION_PATTERNS
    # re.search raises TypeError on None; treat missing text as "nothing found".
    haystack = text or ""
    return {
        section: 1 if re.search(pattern, haystack, re.IGNORECASE) else 0
        for section, pattern in patterns.items()
    }


# Detect which pitch-deck sections the cleaned text mentions and show the flags.
sections = extract_sections(text=cleaned_text)
print(sections)

{'Problem': 0, 'Solution': 1, 'Market': 1, 'Business Model': 1, 'Financials': 1, 'Team': 1}


In [4]:
# Contribution of each section to the pitch score: 'Problem' and 'Solution'
# weigh 0.2 each, the remaining four sections 0.15 each (nominally 1.0 in total).
SECTION_WEIGHTS = dict(
    [
        ('Problem', 0.2),
        ('Solution', 0.2),
        ('Market', 0.15),
        ('Business Model', 0.15),
        ('Financials', 0.15),
        ('Team', 0.15),
    ]
)

def score_pitch(sections, weights=None):
    """Compute a 0-100 pitch score from section presence flags.

    Args:
        sections: Mapping of section name -> 1/0 (present/absent). Sections
            missing from the mapping count as absent.
        weights: Optional mapping of section name -> weight. Defaults to the
            module-level ``SECTION_WEIGHTS``.

    Returns:
        The weighted sum of the flags, scaled to a percentage, as an ``int``.
    """
    if weights is None:
        weights = SECTION_WEIGHTS
    score = sum(sections.get(section, 0) * weight for section, weight in weights.items())
    # Round instead of truncating: binary float sums such as 0.15 + 0.15 + 0.15
    # can land just below the exact value (44.99999999999999), which int()
    # would turn into 44 rather than 45.
    return round(score * 100)


# Turn the section flags into the rule-based score and report it.
score = score_pitch(sections=sections)
print(f"Pitch Score: {score}/100")


Pitch Score: 80/100


# **Scoring Model**

In [5]:
import os

import google.generativeai as genai

# Configure the Gemini API key.
# The key is read from the environment so that it is never stored in the
# notebook or its version history.
# NOTE(review): a literal key was previously committed on this line; treat it
# as compromised and revoke it.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=_gemini_api_key)

def generate_feedback(text, model_name='gemini-1.5-flash'):
    """Ask a Gemini model to score and critique a pitch deck.

    Args:
        text: The pitch-deck text to analyse. It is embedded in the prompt.
        model_name: Name of the generative model to use.

    Returns:
        The text of the model's response.
    """
    # The prompt interpolates the ``text`` argument; the previous version read
    # the global ``cleaned_text`` and silently ignored whatever was passed in.
    prompt = f"""
    Analyze the following pitch deck based on the sections 'Problem','Solution','Market','Business Model','Financials','Team':
    {text}

     and provide only the calulated pitch score (0-100) based on the topics and Identify strengths and weaknesses:
    - score:
    - Strengths:
    - Weaknesses:

    Provide improvement suggestions.
    """

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)

    # Extract the text from the response
    return response.text


# Request the model's assessment of the cleaned deck text and display it.
feedback = generate_feedback(text=cleaned_text)
print(feedback)


**Score:** 65/100

**Strengths:**

* **Market:** The deck effectively communicates the large and growing market opportunity within the SMB e-commerce space.  The use of statistics (e.g., Shopify merchants' GMV, TAM) and the inclusion of well-known brands using the platform are persuasive.
* **Business Model:** The recurring revenue SaaS model is clearly presented as a strength, highlighting predictable income and growth potential.  The partner ecosystem is also showcased as a key differentiator.
* **Financials:**  The inclusion of financial highlights (revenue, MRR, GMV growth) demonstrates strong financial performance.  However, the presentation needs improvement in clarifying non-GAAP vs. GAAP measures.
* **Team:** While not explicitly detailed, the mention of successful partnerships and the implicit strength in the product itself suggests a competent team.  The inclusion of influential names using the platform implicitly endorses the team's abilities.

**Weaknesses:**

* **Problem:*

# **Final Report Generation**

In [6]:
import markdown
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
from bs4 import BeautifulSoup

def markdown_to_html(md_text):
    """Render a Markdown string as HTML via ``markdown.markdown``."""
    rendered_html = markdown.markdown(md_text)
    return rendered_html

def _build_report_styles():
    """Return the paragraph styles used in the report, keyed by role."""
    styles = getSampleStyleSheet()
    return {
        'title': ParagraphStyle(
            'TitleStyle',
            parent=styles['Heading1'],
            fontSize=18,
            spaceAfter=10,
            textColor="#1F4E79"
        ),
        'section': ParagraphStyle(
            'SectionStyle',
            parent=styles['Heading2'],
            fontSize=14,
            spaceAfter=6,
            textColor="#2E75B6"
        ),
        'body': styles['BodyText'],
        'bold': ParagraphStyle(
            'BoldStyle',
            parent=styles['BodyText'],
            fontSize=12,
            textColor="#000000",
            spaceAfter=4
        ),
    }


def _html_to_flowables(html, styles):
    """Convert the top-level nodes of an HTML fragment into ReportLab flowables."""
    # Local import: the entry point's ``html`` parameter shadows the stdlib
    # ``html`` module, so the XML escaper is used instead.
    from xml.sax.saxutils import escape

    flowables = []
    soup = BeautifulSoup(html or "", "html.parser")
    for node in soup.contents:
        # Plain text nodes (e.g. the newlines between blocks) have no tag name.
        tag_name = getattr(node, 'name', None)
        if tag_name == 'h1':
            flowables.append(Paragraph(escape(node.text), styles['title']))
        elif tag_name in ('h2', 'h3', 'h4', 'h5', 'h6'):
            # Lower-level headings were previously dropped; render them as sections.
            flowables.append(Paragraph(escape(node.text), styles['section']))
        elif tag_name == 'p':
            # Paragraph parses its text as markup, so '&' and '<' must be escaped.
            flowables.append(Paragraph(escape(node.text), styles['body']))
        elif tag_name == 'strong':
            flowables.append(Paragraph(f"<b>{escape(node.text)}</b>", styles['bold']))
        elif tag_name in ('ul', 'ol'):  # Handle bullet and numbered lists
            items = [
                ListItem(Paragraph(escape(li.text), styles['body']))
                for li in node.find_all('li')
            ]
            if items:
                bullet_type = 'bullet' if tag_name == 'ul' else '1'
                flowables.append(ListFlowable(items, bulletType=bullet_type, leftIndent=20))
        elif tag_name is not None and node.text.strip():
            # Any other block-level tag: keep its text rather than losing it.
            flowables.append(Paragraph(escape(node.text), styles['body']))
        # A small gap follows every top-level node, as in the original layout.
        flowables.append(Spacer(1, 6))
    return flowables


def html_to_pdf(html,filename="pitch_report.pdf"):
    """Render HTML feedback into a styled PDF report.

    The report starts with a fixed title and a "Feedback:" heading, followed by
    the content of ``html`` converted block by block (headings, paragraphs,
    bullet/numbered lists).

    Args:
        html: HTML fragment to render. ``None`` is treated as empty.
        filename: Path of the PDF file to write.

    Side effects:
        Writes the PDF to ``filename`` and prints a confirmation message.
    """
    doc = SimpleDocTemplate(filename, pagesize=letter)
    styles = _build_report_styles()

    # Title
    content = [
        Paragraph("Pitch Deck Analysis Report", styles['title']),
        Spacer(1, 12),
    ]

    # Feedback Section
    content.append(Paragraph("<b>Feedback:</b>", styles['section']))
    content.append(Spacer(1, 6))

    # Convert HTML to PDF-friendly content
    content.extend(_html_to_flowables(html, styles))

    # Build the PDF
    doc.build(content)
    print(f"Report saved as '{filename}'")


# Convert Markdown to HTML
html_feedback = markdown_to_html(feedback)

# Generate PDF
html_to_pdf(html_feedback)


Report saved as 'pitch_report.pdf'
