In [9]:
# Install required libraries
!pip install transformers torch datasets accelerate
!pip install PyPDF2 sentence-transformers
!pip install pandas numpy matplotlib tqdm

import torch
import pandas as pd
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import PyPDF2
import os
import json
from tqdm import tqdm

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Using device: cuda


In [11]:
def extract_and_structure_pdf_content(pdf_folder_path):
    """Extract content from PDFs and create Q&A pairs for testing"""
    pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith('.pdf')]
    
    if len(pdf_files) == 0:
        print("No PDF files found! Please add 2-3 defense PDFs to the folder.")
        return None
    
    print(f"Processing {len(pdf_files)} PDF files for testing...")
    
    all_content = []
    pdf_sources = {}
    
    for pdf_file in tqdm(pdf_files, desc="Extracting PDF content"):
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                pdf_text = ""
                
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    pdf_text += page_text + "\n"
                
                # Store source mapping
                pdf_sources[pdf_file] = pdf_text
                
                # Create chunks for this specific PDF
                chunks = create_pdf_chunks(pdf_text, pdf_file)
                all_content.extend(chunks)
                
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")
    
    return all_content, pdf_sources

def create_pdf_chunks(text, source_file):
    """Create meaningful chunks from PDF content"""
    # Split by sentences and paragraphs
    sentences = text.replace('\n', ' ').split('. ')
    chunks = []
    
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk + sentence) < 400:  # Keep chunks manageable
            current_chunk += sentence + ". "
        else:
            if len(current_chunk.strip()) > 50:  # Only keep substantial chunks
                chunks.append({
                    'text': current_chunk.strip(),
                    'source': source_file
                })
            current_chunk = sentence + ". "
    
    # Add the last chunk
    if len(current_chunk.strip()) > 50:
        chunks.append({
            'text': current_chunk.strip(),
            'source': source_file
        })
    
    return chunks

# Create the PDF folder and process files
pdf_folder = "test_defense_pdfs"
os.makedirs(pdf_folder, exist_ok=True)

print(f"Please add your 2-3 defense PDFs to the '{pdf_folder}' folder")
print("Then run the next cell to process them")


Please add your 2-3 defense PDFs to the 'test_defense_pdfs' folder
Then run the next cell to process them


In [12]:
# Process the PDFs once you've added them
if os.path.exists(pdf_folder) and len([f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]) > 0:
    pdf_content, pdf_sources = extract_and_structure_pdf_content(pdf_folder)
    
    print(f"Successfully processed {len(pdf_sources)} PDFs")
    print(f"Created {len(pdf_content)} content chunks for training")
    
    # Show what we extracted
    for pdf_name in pdf_sources.keys():
        print(f"\n--- Content from {pdf_name} ---")
        print(f"Length: {len(pdf_sources[pdf_name])} characters")
        print(f"Preview: {pdf_sources[pdf_name][:200]}...")
    
    # Save the processed content
    with open('test_pdf_content.json', 'w') as f:
        json.dump(pdf_content, f, indent=2)
    
    print("\nContent saved to 'test_pdf_content.json'")
    
else:
    print("Please add PDF files to the folder first!")
    pdf_content = None


Processing 3 PDF files for testing...


Extracting PDF content: 100%|██████████| 3/3 [00:05<00:00,  1.88s/it]

Successfully processed 3 PDFs
Created 671 content chunks for training

--- Content from briefing-paper-7-critical-emerging-technologies-in-the-defense-ecosystem.pdf ---
Length: 40004 characters
Preview: 1 
 
 
 
Critical & Emerging 
Technologies  in the Defense  Ecosystem  
Briefing  Paper  #7/Nove mber 2023  
 
President Biden and Prime Minister Modi affirm that technology will play the defining rol...

--- Content from Technology_and_the_defense_industry_real_threats_b.pdf ---
Length: 54986 characters
Preview: See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/269896898
Technology and the defense industry: real threats, bad habits, or new
(m...

--- Content from TPCR13.pdf ---
Length: 120335 characters
Preview: HEADQUARTERS INTEGRATED DEFENCE STAFF  
MINISTRY OF DEFENCE  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
TECHNOLOGY PERSPECTIVE AND  
CAPABILITY ROADMAP  
(TPCR ) 
    
    
APRIL 2013 APRIL 2013 APRIL 2013 APRIL ...

Conte




In [13]:
def create_pdf_training_data(pdf_content):
    """Create training data specifically from your PDF content"""
    training_examples = []
    
    for item in pdf_content:
        text = item['text']
        source = item['source']
        
        # Create different training formats
        formats = [
            f"Based on {source}: {text}",
            f"Defense document explains: {text}",
            f"According to defense materials: {text}",
            f"Question: What does the document say about this topic?\nAnswer: {text}",
        ]
        
        training_examples.extend(formats)
    
    return training_examples

# Create training dataset from your PDFs
if pdf_content:
    training_data = create_pdf_training_data(pdf_content)
    print(f"Created {len(training_data)} training examples from your PDFs")
    
    # Create dataset for training
    train_dataset = Dataset.from_dict({"text": training_data})
    print("Training dataset ready!")
else:
    print("No PDF content available for training")

Created 2684 training examples from your PDFs
Training dataset ready!


In [14]:
# Initialize model for PDF testing
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model.to(device)

def tokenize_pdf_data(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=256,  # Shorter for PDF testing
        return_tensors="pt"
    )

if 'train_dataset' in locals():
    # Tokenize the PDF-based dataset
    tokenized_dataset = train_dataset.map(
        tokenize_pdf_data,
        batched=True,
        remove_columns=["text"]
    )
    
    # Quick training setup for PDF testing
    training_args = TrainingArguments(
        output_dir="./pdf_test_model",
        overwrite_output_dir=True,
        num_train_epochs=2,  # Quick training for testing
        per_device_train_batch_size=2,
        learning_rate=5e-5,
        logging_steps=10,
        save_steps=100,
        fp16=True,
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    
    # Start training on your PDF content
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
    )
    
    print("Starting PDF-based training...")
    trainer.train()
    
    # Save the PDF-trained model
    trainer.save_model("./pdf_trained_model")
    tokenizer.save_pretrained("./pdf_trained_model")
    
    print("PDF-based training completed!")

Map:   0%|          | 0/2684 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting PDF-based training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.7962
20,4.4493
30,3.9227
40,3.8003
50,3.663
60,3.9298
70,3.2473
80,3.6009
90,3.591
100,3.4764


PDF-based training completed!


In [19]:
def ask_pdf_questions(question, max_length=200):
    """Ask questions specifically about your PDF content"""
    # Load the PDF-trained model
    if os.path.exists("./pdf_trained_model"):
        trained_model = GPT2LMHeadModel.from_pretrained("./pdf_trained_model")
        trained_tokenizer = GPT2Tokenizer.from_pretrained("./pdf_trained_model")
        trained_model.to(device)
    else:
        trained_model = model
        trained_tokenizer = tokenizer
    
    # Format question for PDF context
    prompt = f"Based on the defense documents: {question}\nAnswer:"
    
    inputs = trained_tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = trained_model.generate(
            inputs,
            max_length=max_length,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=trained_tokenizer.eos_token_id,
            no_repeat_ngram_size=2
        )
    
    response = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.replace(prompt, "").strip()

# Test with questions about your specific PDFs
test_questions = [
    "What are the DEFENSE FIRMS' STRATEGY IN A STALEMATE?"
]

print("Testing PDF-based model:")
print("=" * 50)

for question in test_questions:
    print(f"\nQuestion: {question}")
    answer = ask_pdf_questions(question)
    print(f"Answer: {answer}")
    print("-" * 30)

Testing PDF-based model:

Question: What are the DEFENSE FIRMS' STRATEGY IN A STALEMATE?
Answer: These include defense technologies which are  likely to be incorporated in future aircraft and may  include technologies for  enhanced stealth, electronic warfare, mine clearance, counter  improvised explosive device (IED) detection and identification, fire fighting,  terrain shaping and surveillance.   In addition, defense  technologies such as radar, IR and laser sensors, optical  sights, smoke  proofing and fire control systems are also being  explored. The technology requirements could be varied  depending on  the  country’s needs. These could include  navigation, air defence, ground attack, surveillance, cyber  security, command and control, reconnaissance and  other related  systems. Furthermore, there could also be  inclusion of advanced  electronic  warfare  capabilities like mine detection, ammunit ion, jamming, cluster munition  management and stealth. In the event of
------------

In [17]:
def interactive_pdf_testing():
    """Interactive testing specifically for your PDF content"""
    print("PDF-Based Defense Model Testing")
    print("Ask questions about the content from your PDFs")
    print("Type 'quit' to exit, 'sources' to see PDF sources")
    print("=" * 50)
    
    while True:
        question = input("\nWhat are CET's?").strip()
        
        if question.lower() == 'quit':
            break
        elif question.lower() == 'sources':
            if 'pdf_sources' in locals():
                print("\nYour PDF sources:")
                for pdf_name in pdf_sources.keys():
                    print(f"- {pdf_name}")
            continue
        
        if question:
            try:
                answer = ask_pdf_questions(question, max_length=300)
                print(f"\nBased on your PDFs: {answer}")
            except Exception as e:
                print(f"Error: {e}")

# Start interactive PDF testing
print("Run this to test your PDF-trained model interactively:")
print("interactive_pdf_testing()")

Run this to test your PDF-trained model interactively:
interactive_pdf_testing()
