In [None]:
import nltk
# Fetch each NLTK resource this notebook asks for, one download call apiece.
for resource_name in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource_name)

In [None]:
!pip install transformers nltk PyPDF2 sumy
import re
import json
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize
from transformers import pipeline
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from collections import defaultdict
from google.colab import files

# Step 1: Upload PDF
# files.upload() returns a mapping keyed by uploaded file name; it is empty
# when the upload dialog is cancelled or nothing is selected.
uploaded = files.upload()
if not uploaded:
    # Fail with an actionable message instead of a bare
    # "IndexError: list index out of range" from indexing an empty list.
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")
# Only the first uploaded file is processed.
file_name = next(iter(uploaded))

# Step 2: Extract and filter text
def extract_and_filter_text(file_name):
    """Extract the text of a PDF and strip URLs, DOIs, citations and years.

    Args:
        file_name: Passed straight to ``PdfReader`` to open the document.

    Returns:
        The text of all pages joined with newlines, with URLs, ``doi.org``
        links, ``(Author, 2020)`` / ``Author, 2020`` style citations and any
        remaining standalone four-digit numbers removed.
    """
    reader = PdfReader(file_name)
    # Join once rather than growing a string with ``+=``. The newline keeps
    # the last word of one page from fusing with the first word of the next.
    # ``or ""`` is defensive: a page that yields no text (None) must not
    # break the join.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # Order matters: whole citations have to be removed while their year is
    # still present. Stripping bare years first would leave "Author, " and
    # "(Author, )" fragments that the citation patterns can no longer match.
    text = re.sub(r'http\S+|doi\.org/\S+|\([A-Za-z]+, \d{4}\)', '', text)
    text = re.sub(r'\b[A-Z][a-z]+, \d{4}\b', '', text)
    # Finally drop any remaining standalone four-digit numbers (e.g. years).
    text = re.sub(r'\b\d{4}\b', '', text)
    return text

# Extract and clean the uploaded PDF's text; this is the input to the
# sentence classification step below.
filtered_text = extract_and_filter_text(file_name)

# Step 3: Use a PUBLIC zero-shot model (no auth required)
# Candidate labels handed to the zero-shot classifier: the nine segments of
# the Business Model Canvas that this notebook fills in.
BMC_SEGMENTS = [
    "Customer Segments", "Value Proposition", "Channels",
    "Customer Relationships", "Revenue Streams", "Key Resources",
    "Key Activities", "Key Partnerships", "Cost Structure",
]

def classify_sentences(text, segments, confidence_threshold=0.3):
    """Assign each sentence of ``text`` to one of ``segments``.

    The text is split into sentences with ``sent_tokenize``; sentences with
    fewer than five whitespace-separated words are ignored. Every remaining
    sentence is scored against ``segments`` by a zero-shot classification
    pipeline (``facebook/bart-large-mnli``, single-label mode), and the
    first label/score pair of the result is taken as the prediction.

    Args:
        text: The text to classify.
        segments: Candidate labels passed to the classifier.
        confidence_threshold: Minimum score the first label must reach for
            the sentence to be kept.

    Returns:
        A ``defaultdict(list)`` mapping a label to the sentences assigned to
        it, in the order they appear in ``text``.
    """
    # Public checkpoint: facebook/bart-large-mnli
    zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    grouped = defaultdict(list)

    # Skip very short fragments before running the classifier on them.
    long_enough = [s for s in sent_tokenize(text) if len(s.split()) >= 5]

    for candidate in long_enough:
        prediction = zero_shot(candidate, segments, multi_label=False)
        best_label, best_score = prediction["labels"][0], prediction["scores"][0]

        if best_score >= confidence_threshold:
            grouped[best_label].append(candidate)

    return grouped

# Group the cleaned text's sentences under the canvas segment each one was
# classified into.
bmc_segments = classify_sentences(filtered_text, BMC_SEGMENTS)

# Step 4: Summarize sentences for each segment
def summarize_segment(sentences, num_sentences=2):
    """Condense ``sentences`` into a short summary string.

    The sentences are joined with single spaces, parsed as plain English
    text, and passed to an LSA summarizer that is asked for
    ``num_sentences`` sentences.

    Args:
        sentences: Iterable of sentence strings belonging to one segment.
        num_sentences: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with single spaces.
    """
    combined = " ".join(sentences)
    parsed = PlaintextParser.from_string(combined, Tokenizer("english"))
    lsa = LsaSummarizer()
    selected = lsa(parsed.document, num_sentences)
    return " ".join(map(str, selected))

# Build one summary per segment that actually received sentences.
bmc_summary = {}
for segment, sentences in bmc_segments.items():
    if not sentences:
        continue
    bmc_summary[segment] = summarize_segment(sentences)

# Step 5: Print the BMC
print("=== Business Model Canvas (Generated) ===")
canvas_json = json.dumps(bmc_summary, indent=2)
print(canvas_json)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


IndexError: list index out of range