In [1]:
!pip install transformers
!pip install pymupdf

from transformers import pipeline
import fitz  # PyMuPDF


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


2025-07-17 13:19:16.423054: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752758356.787887      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752758356.891360      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load summarization model: a "summarization" pipeline built from the
# facebook/bart-large-cnn checkpoint. process_input() calls it on the
# (truncated) document text.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load question answering model: a "question-answering" pipeline built from the
# deepset/roberta-large-squad2 checkpoint. process_input() calls it with a
# question and a context string and reads the 'answer' field of the result.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-large-squad2")


config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


In [3]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages in document order, joined with newlines.
    """
    # Use the document as a context manager so it is closed (and its file
    # handle released) once the text has been read, even if a page fails.
    with fitz.open(pdf_path) as doc:
        # join() consumes the generator before the `with` block exits, so
        # every page is read while the document is still open.
        return "\n".join(page.get_text("text") for page in doc)


In [4]:
def calculate_similarity(question, paragraph):
    """Score how relevant a paragraph is to a question.

    The score is the Jaccard similarity of the two lowercased,
    whitespace-tokenised word sets, plus a bonus of 3 for every shared word
    that is one of a fixed set of technical terms.

    Args:
        question: The question text.
        paragraph: The candidate paragraph text.

    Returns:
        float: The similarity score; ``0.0`` when neither input contains any
        word (previously this case raised ``ZeroDivisionError``).
    """
    # Shared words from this set add a fixed bonus on top of the Jaccard part.
    tech_terms = {
        "dataset", "preprocessing", "model", "normalize", "resolution",
        "segmentation", "training",
    }

    question_words = set(question.lower().split())
    para_words = set(paragraph.lower().split())
    common_words = question_words & para_words

    union_size = len(question_words | para_words)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: nothing to compare.
        return 0.0

    tech_term_weight = 3 * len(common_words & tech_terms)
    return len(common_words) / union_size + tech_term_weight


In [5]:
def find_relevant_paragraphs(text, question, num_paragraphs=2):
    """Pick the paragraphs of ``text`` that score highest against ``question``.

    Paragraphs are the pieces of ``text`` separated by a blank line
    (``'\\n\\n'``). Pieces whose stripped length is under 10 characters are
    ignored. The rest are scored with ``calculate_similarity`` and returned in
    descending score order; equal scores keep their original document order.

    Args:
        text: The full document text.
        question: The question used to score each paragraph.
        num_paragraphs: Maximum number of paragraphs to return.

    Returns:
        list[str]: Up to ``num_paragraphs`` paragraphs, best first.
    """
    candidates = [chunk for chunk in text.split('\n\n') if len(chunk.strip()) >= 10]
    ranked = sorted(
        ((chunk, calculate_similarity(question, chunk)) for chunk in candidates),
        key=lambda pair: pair[1],
        reverse=True,  # sorted() is stable, so ties stay in document order
    )
    return [chunk for chunk, _ in ranked[:num_paragraphs]]


In [6]:
def extract_comprehensive_steps(context, initial_answer):
    """Collect the sentences of ``context`` that mention a preprocessing step.

    Each line of ``context`` is split on ``'.'``. A fragment is kept when it
    contains one of the keyword stems below (case-insensitive) and is longer
    than 10 characters after stripping.

    Args:
        context: Text to scan, one or more newline-separated lines.
        initial_answer: Value returned unchanged when no fragment qualifies.

    Returns:
        str: ``"The preprocessing steps included: "`` followed by the kept
        fragments joined with ``"; "``, or ``initial_answer`` if none were kept.
    """
    keyword_stems = ("preprocessing", "normalization", "resiz", "organiz", "segment",
                     "convert", "structur", "divid", "standard")

    def mentions_step(fragment):
        # Case-insensitive substring match against any keyword stem.
        lowered = fragment.lower()
        return any(stem in lowered for stem in keyword_stems)

    collected = []
    for line in context.split('\n'):
        if not mentions_step(line):
            continue
        for piece in line.split('.'):
            trimmed = piece.strip()
            if len(trimmed) > 10 and mentions_step(piece):
                collected.append(trimmed)

    if not collected:
        return initial_answer
    return "The preprocessing steps included: " + "; ".join(collected)


In [7]:
def expand_what_answer(answer, context, question):
    """Replace a short answer with fuller sentences taken from ``context``.

    ``context`` is split on ``'.'``. A sentence qualifies when at least two of
    the whitespace-separated, lowercased question words occur in it as
    substrings. The first two qualifying sentences, in context order, are
    joined with ``". "``.

    Args:
        answer: The original answer, used as the fallback.
        context: Text to search for descriptive sentences.
        question: The question whose words drive the matching.

    Returns:
        str: The joined qualifying sentences, or ``answer`` when none qualify.
    """
    query_terms = question.lower().split()

    matches = []
    for candidate in context.split("."):
        lowered = candidate.lower()
        hits = [term for term in query_terms if term in lowered]
        if len(hits) >= 2:  # needs at least two question words
            matches.append(candidate.strip())

    top_two = matches[:2]
    if top_two:
        return ". ".join(top_two)
    return answer


In [8]:
def enhance_answer(answer, question, context):
    """Post-process a raw QA answer using simple question-driven heuristics.

    Rules, applied in order:

    1. If the answer has fewer than 5 words and the question looks compound
       (contains a comma, or the standalone word "and"):
       * a "what" question is expanded via ``expand_what_answer``;
       * a "which ... format" question gets the missing half added when the
         answer mentions only the dataset or only the format.
    2. If the question mentions "preprocessing" or "steps", the answer is
       replaced by ``extract_comprehensive_steps(context, answer)``.
    3. Otherwise the answer is returned unchanged.

    Args:
        answer: Raw answer text from the QA model.
        question: The question that was asked.
        context: The context the answer was extracted from.

    Returns:
        str: The enhanced answer, or ``answer`` itself if no rule applies.
    """
    question_lower = question.lower()
    answer_lower = answer.lower()

    is_short = len(answer.split()) < 5
    # Match "and" as a whole word, case-insensitively. A plain substring test
    # also fires on words such as "standard" or "understand" and would treat
    # single-part questions as compound.
    has_and = any(
        token.strip("\"'()[]{}.,;:!?") == "and" for token in question_lower.split()
    )
    is_compound = has_and or "," in question

    if is_short and is_compound:
        if "what" in question_lower:
            return expand_what_answer(answer, context, question)

        if "which" in question_lower and "format" in question_lower:
            has_dataset = "dataset" in answer_lower
            has_format = "format" in answer_lower
            if has_dataset and not has_format:
                return f"{answer} stored in .nii.gz format"
            if has_format and not has_dataset:
                return f"MMWHS dataset with {answer}"

    if "preprocessing" in question_lower or "steps" in question_lower:
        return extract_comprehensive_steps(context, answer)

    return answer


In [9]:
def process_input(input_data, is_pdf=False, question=None):
    """Summarise a document and optionally answer a question about it.

    Args:
        input_data: Raw text, or a PDF path when ``is_pdf`` is true.
        is_pdf: When true, ``input_data`` is read with ``extract_text_from_pdf``.
        question: Optional question; when falsy only the summary is produced.

    Returns:
        dict: Always has ``"Summary"``. With a question it also has
        ``"QnA Answer"`` (the formatted answer sentence) and ``"Evaluation"``
        (answer length in characters, a short-answer flag, and whether the
        answer contains one of a fixed list of key phrases).
    """
    text = extract_text_from_pdf(input_data) if is_pdf else input_data

    # Only the first 2048 characters are passed to the summarizer.
    summary_result = summarizer(text[:2048], max_length=200, min_length=100, do_sample=False)
    summary = summary_result[0]['summary_text']

    if not question:
        return {"Summary": summary}

    # QA context = summary plus the two paragraphs that best match the question.
    top_paragraphs = find_relevant_paragraphs(text, question, num_paragraphs=2)
    extended_context = summary + "\n\n" + "\n".join(top_paragraphs)

    qa_result = qa_pipeline(question=question, context=extended_context)
    answer = enhance_answer(qa_result['answer'], question, extended_context)

    key_phrases = (
        "dataset", "format", "tool", "model", "architecture",
        "preprocessing", "training", "resolution", "normalization",
    )
    answer_lower = answer.lower()
    evaluation = {
        "Answer Length": len(answer),
        "Is Short Answer": len(answer.split()) < 10,
        "Contains Key Phrase": any(phrase in answer_lower for phrase in key_phrases),
    }

    return {
        "Summary": summary,
        "QnA Answer": f"The answer to the question '{question}' is: {answer}.",
        "Evaluation": evaluation,
    }


In [10]:
# Input PDF for the demo below (an absolute path under /kaggle/input).
pdf_path = "/kaggle/input/contribution/Abhishek_Contribution_Extended.pdf"

# Example test without question: process_input returns only {"Summary": ...}.
print(process_input(pdf_path, is_pdf=True))

# Example test with question: the result also carries "QnA Answer" and
# "Evaluation". This second call extracts and summarizes the PDF again.
question = "Which deep learning model was used in the project and why?"
print(process_input(pdf_path, is_pdf=True, question=question))


{'Summary': 'The project focused on detecting Coronary Artery Calcification (CAC) using deep learning methods. For this purpose, we used CT scan images from the MMWHS dataset. Abhishek Kumar was involved in collecting, organizing, and preprocessing the images. The dataset was divided into two parts: a Training Set and a Testing Set. The Training Set is used to teach the model and the Testing Set to evaluate how well the model performs on unseen data. The Pre-processed Dataset was used for training the model.'}
{'Summary': 'The project focused on detecting Coronary Artery Calcification (CAC) using deep learning methods. For this purpose, we used CT scan images from the MMWHS dataset. Abhishek Kumar was involved in collecting, organizing, and preprocessing the images. The dataset was divided into two parts: a Training Set and a Testing Set. The Training Set is used to teach the model and the Testing Set to evaluate how well the model performs on unseen data. The Pre-processed Dataset was