In [1]:
# 1️⃣ Install Dependencies
!pip install transformers torch -q
import torch

In [2]:
# 2️⃣ Import Libraries
import re
from transformers import pipeline
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# 3️⃣ Sample Insurance Policy Text
policy_text = """
This insurance policy provides coverage for your automobile against accidents, theft, and natural disasters. 
The policyholder must pay the premium on a monthly or annual basis to keep the coverage active. 
Coverage includes repair costs, medical expenses for passengers, and liability protection. 
Exclusions include intentional damage, use of the vehicle for commercial purposes, and driving under the influence. 
In case of a claim, the policyholder must notify the insurance company within 48 hours and submit all relevant documents. 
The policy is valid for one year and can be renewed annually. Premium amounts are subject to change based on risk assessment.
"""

In [4]:
# 4️⃣ Preprocess Text
def clean_text(text):
    text = re.sub(r'\n+', ' ', text)      # remove newlines
    text = re.sub(r'\s+', ' ', text)      # remove extra spaces
    return text

cleaned_policy = clean_text(policy_text)

In [5]:
# 5️⃣ Extractive Summarization (Simple TF-based)
def extractive_summary(text, num_sentences=3):
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())
    word_freq = Counter(words)
    # Score each sentence
    sentence_scores = {}
    for sent in sentences:
        sent_words = word_tokenize(sent.lower())
        score = sum(word_freq.get(word, 0) for word in sent_words)
        sentence_scores[sent] = score
    # Pick top N sentences
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    return ' '.join(top_sentences)

extractive_sum = extractive_summary(cleaned_policy, num_sentences=3)
print("\n=== Extractive Summary ===\n")
print(extractive_sum)


=== Extractive Summary ===

Exclusions include intentional damage, use of the vehicle for commercial purposes, and driving under the influence. In case of a claim, the policyholder must notify the insurance company within 48 hours and submit all relevant documents. The policyholder must pay the premium on a monthly or annual basis to keep the coverage active.


In [8]:
# 6️⃣ Abstractive Summarization using BART
# Load BART summarizer (downloads model once)
abstractive_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Split long text into chunks if needed
def chunk_text(text, max_words=100):
    sentences = sent_tokenize(text)
    chunks = []
    chunk = ''
    for sent in sentences:
        if len(word_tokenize(chunk)) + len(word_tokenize(sent)) <= max_words:
            chunk += sent + ' '
        else:
            chunks.append(chunk)
            chunk = sent + ' '
    if chunk:
        chunks.append(chunk)
    return chunks

chunks = chunk_text(cleaned_policy, max_words=100)
abstractive_summaries = []

for chunk in chunks:
    summary = abstractive_summarizer(chunk, max_length=80, min_length=30, do_sample=False)
    abstractive_summaries.append(summary[0]['summary_text'])

final_abstractive_summary = ' '.join(abstractive_summaries)
print("\n=== Abstractive Summary ===\n")
print(final_abstractive_summary)




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 80, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)



=== Abstractive Summary ===

The policyholder must pay the premium on a monthly or annual basis to keep the coverage active. Coverage includes repair costs, medical expenses for passengers, and liability protection. Exclusions include intentional damage, use of the vehicle for commercial purposes, and driving under the influence. The policy is valid for one year and can be renewed annually. Premium amounts are subject to change based on risk assessment. The policy is available at the following locations:
