In [1]:
# Install all required libraries including PDF processing
!pip install transformers torch datasets accelerate
!pip install PyPDF2 pandas numpy matplotlib tqdm
!pip install scikit-learn

import torch
import pandas as pd
import numpy as np
from transformers import (
    GPT2LMHeadModel, 
    GPT2Tokenizer, 
    GPT2Config,
    Trainer, 
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import PyPDF2
import os
import json
from tqdm import tqdm
import matplotlib.pyplot as plt

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable

Using device: cuda
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
GPU Memory: 6.0 GB


In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF file"""
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    return text

def create_defense_pdf_chunks(text, source_file):
    """Create defense-optimized text chunks from PDF content"""
    chunks = []
    
    # Clean the text
    text = text.replace('\n', ' ').replace('\r', ' ')
    
    # Split by sentences for better chunking
    sentences = text.split('. ')
    current_chunk = ""
    
    for sentence in sentences:
        sentence = sentence.strip()
        if len(current_chunk + sentence) < 400:  # Optimal chunk size for defense content
            current_chunk += sentence + ". "
        else:
            if len(current_chunk.strip()) > 100:  # Only keep substantial chunks
                formatted_chunk = f"Defense Document ({source_file}): {current_chunk.strip()}"
                chunks.append(formatted_chunk)
            current_chunk = sentence + ". "
    
    # Add the final chunk
    if len(current_chunk.strip()) > 100:
        formatted_chunk = f"Defense Document ({source_file}): {current_chunk.strip()}"
        chunks.append(formatted_chunk)
    
    return chunks

def process_complete_defense_dataset(dataset_path):
    """Process ALL formats: PDF, CSV, TXT, JSON"""
    all_training_data = []
    supported_formats = {'.pdf', '.csv', '.txt', '.json', '.jsonl'}
    
    print("Processing complete defense dataset (including PDFs)...")
    
    stats = {
        'total_files': 0,
        'pdf_files': 0,
        'csv_files': 0,
        'txt_files': 0,
        'json_files': 0,
        'total_examples': 0
    }
    
    for root, dirs, files in os.walk(dataset_path):
        for file in tqdm(files, desc="Processing all dataset files"):
            file_path = os.path.join(root, file)
            file_ext = os.path.splitext(file)[1].lower()
            
            if file_ext not in supported_formats:
                continue
                
            stats['total_files'] += 1
            
            try:
                if file_ext == '.pdf':
                    stats['pdf_files'] += 1
                    text = extract_text_from_pdf(file_path)
                    if text.strip():
                        pdf_chunks = create_defense_pdf_chunks(text, file)
                        all_training_data.extend(pdf_chunks)
                        print(f"✓ Processed PDF: {file} ({len(pdf_chunks)} chunks)")
                    else:
                        print(f"⚠ Empty PDF: {file}")
                
                elif file_ext == '.csv':
                    stats['csv_files'] += 1
                    df = pd.read_csv(file_path)
                    csv_data = process_defense_csv(df, file)
                    all_training_data.extend(csv_data)
                    print(f"✓ Processed CSV: {file} ({len(csv_data)} examples)")
                
                elif file_ext == '.txt':
                    stats['txt_files'] += 1
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                        txt_data = process_defense_txt(content, file)
                        all_training_data.extend(txt_data)
                        print(f"✓ Processed TXT: {file} ({len(txt_data)} examples)")
                
                elif file_ext in ['.json', '.jsonl']:
                    stats['json_files'] += 1
                    json_data = process_defense_json(file_path, file)
                    all_training_data.extend(json_data)
                    print(f"✓ Processed JSON: {file} ({len(json_data)} examples)")
                    
            except Exception as e:
                print(f"✗ Error processing {file}: {e}")
    
    stats['total_examples'] = len(all_training_data)
    
    print(f"\n📊 Complete Dataset Processing Summary:")
    print(f"   Total files: {stats['total_files']}")
    print(f"   PDF files: {stats['pdf_files']}")
    print(f"   CSV files: {stats['csv_files']}")
    print(f"   TXT files: {stats['txt_files']}")
    print(f"   JSON files: {stats['json_files']}")
    print(f"   Training examples: {stats['total_examples']:,}")
    
    return all_training_data, stats

def process_defense_csv(df, source_file):
    """Process CSV files for defense training data"""
    training_examples = []
    
    # Common column names for defense data
    text_columns = ['text', 'content', 'description', 'definition', 'explanation', 'data', 'term', 'meaning']
    
    # Find the best text column
    text_col = None
    for col in text_columns:
        if col in df.columns:
            text_col = col
            break
    
    if text_col is None:
        # Use first text column if no standard column found
        text_cols = df.select_dtypes(include=['object']).columns
        if len(text_cols) > 0:
            text_col = text_cols[0]
    
    if text_col:
        # Process each row
        for idx, row in df.iterrows():
            if pd.notna(row[text_col]) and len(str(row[text_col]).strip()) > 20:
                text = str(row[text_col]).strip()
                
                # Create multiple training formats
                formats = [
                    f"Defense Encyclopedia: {text}",
                    f"Military Knowledge: {text}",
                    f"Defense Information: {text}",
                ]
                
                # Add term-definition format if applicable
                if 'term' in df.columns and pd.notna(row.get('term')):
                    term = str(row['term']).strip()
                    formats.append(f"What is {term}? {text}")
                
                training_examples.extend(formats)
    
    return training_examples

def process_defense_txt(content, source_file):
    """Process TXT files for defense training data"""
    training_examples = []
    
    # Split by paragraphs or sections
    sections = content.split('\n\n')
    
    for section in sections:
        section = section.strip()
        if len(section) > 100:  # Only keep substantial content
            # Create defense-formatted training examples
            formats = [
                f"Defense Document: {section}",
                f"Military Reference: {section}",
                f"Defense Knowledge Base: {section}",
            ]
            training_examples.extend(formats)
    
    return training_examples

def process_defense_json(file_path, source_file):
    """Process JSON/JSONL files for defense training data"""
    training_examples = []
    
    try:
        if file_path.endswith('.jsonl'):
            # Line-delimited JSON
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line.strip())
                        examples = extract_json_examples(data)
                        training_examples.extend(examples)
        else:
            # Regular JSON
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        examples = extract_json_examples(item)
                        training_examples.extend(examples)
                else:
                    examples = extract_json_examples(data)
                    training_examples.extend(examples)
    
    except Exception as e:
        print(f"Error processing JSON {source_file}: {e}")
    
    return training_examples

def extract_json_examples(data):
    """Extract training examples from JSON data"""
    examples = []
    
    if isinstance(data, dict):
        # Look for common text fields
        text_fields = ['text', 'content', 'description', 'definition', 'explanation', 'question', 'answer']
        
        for field in text_fields:
            if field in data and isinstance(data[field], str) and len(data[field].strip()) > 20:
                text = data[field].strip()
                examples.append(f"Defense Data: {text}")
                
                # Create Q&A format if applicable
                if field in ['definition', 'explanation', 'answer']:
                    examples.append(f"Question: What is this about?\nAnswer: {text}")
    
    return examples

# Create defense dataset folder
os.makedirs("defense_dataset", exist_ok=True)
print("Created 'defense_dataset' folder - add your PDF, CSV, TXT, JSON files here")

Created 'defense_dataset' folder - add your PDF, CSV, TXT, JSON files here


In [3]:
class DefenseGPT2:
    def __init__(self, model_name="gpt2"):
        """Initialize the defense-focused GPT-2 model"""
        self.model_name = model_name
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        
        # Add padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Move to GPU if available
        self.model.to(device)
        
        print(f"Initialized {model_name} with {self.model.num_parameters():,} parameters")
    
    def generate_defense_response(self, prompt, max_length=200, temperature=0.7):
        """Generate defense-related text"""
        # Format prompt for defense context
        formatted_prompt = f"Defense Encyclopedia: {prompt}\nAnswer:"
        
        # Tokenize input
        inputs = self.tokenizer.encode(formatted_prompt, return_tensors="pt").to(device)
        
        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                pad_token_id=self.tokenizer.eos_token_id,
                no_repeat_ngram_size=2
            )
        
        # Decode and clean response
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.replace(formatted_prompt, "").strip()
        
        return response

# Initialize the model
defense_gpt = DefenseGPT2()

Initialized gpt2 with 124,439,808 parameters


In [4]:
# Process your complete 96MB defense dataset (PDF, CSV, TXT, JSON)
if os.path.exists("defense_dataset"):
    dataset_content, processing_stats = process_complete_defense_dataset("defense_dataset")
    
    if dataset_content:
        print(f"\n🎯 Complete dataset ready for training!")
        print(f"   Training examples: {len(dataset_content):,}")
        
        # Save processed data
        with open('complete_defense_training_data.json', 'w') as f:
            json.dump(dataset_content, f, indent=2)
        
        print("   Processed data saved to 'complete_defense_training_data.json'")
        
        # Show samples from different sources
        print(f"\n📝 Sample training examples:")
        for i, example in enumerate(dataset_content[:3]):
            print(f"   Example {i+1}: {example[:200]}...")
    else:
        print("⚠️ No training data generated. Please check your files.")
else:
    print("❌ Please create 'defense_dataset' folder and add your files!")
    dataset_content = []

Processing complete defense dataset (including PDFs)...


Processing all dataset files:   6%|▌         | 1/18 [00:25<07:06, 25.11s/it]

✓ Processed PDF: 20140808_MOD_Acronyms_and_Abbreviations.pdf (31 chunks)


Processing all dataset files:  11%|█         | 2/18 [00:48<06:21, 23.87s/it]

✓ Processed PDF: ADB280764.pdf (4513 chunks)


Processing all dataset files:  17%|█▋        | 3/18 [01:08<05:36, 22.46s/it]

✓ Processed PDF: arms_technology_data.pdf (259 chunks)
✓ Processed CSV: army-total-world.csv (9 examples)
✓ Processed CSV: atomic-weapons.csv (0 examples)


Processing all dataset files:  39%|███▉      | 7/18 [01:09<01:05,  5.96s/it]

✓ Processed PDF: CNAS-Defense-Technology-Strategy-2.pdf (142 chunks)
✓ Processed PDF: Codebook1.0.pdf (64 chunks)


Processing all dataset files:  44%|████▍     | 8/18 [01:12<00:50,  5.00s/it]

✓ Processed PDF: CowWarList.pdf (2 chunks)
✗ Error processing cow_arms_tech_long.csv: 'utf-8' codec can't decode byte 0xdf in position 256: invalid continuation byte
✗ Error processing cow_arms_tech_wide.csv: 'utf-8' codec can't decode byte 0xd1 in position 37: invalid continuation byte


Processing all dataset files:  61%|██████    | 11/18 [01:16<00:21,  3.06s/it]

✓ Processed PDF: CoW_codebook.pdf (364 chunks)


Processing all dataset files:  67%|██████▋   | 12/18 [02:33<01:47, 17.87s/it]

✓ Processed CSV: DCAD-v1.0-dyadic.csv (0 examples)


Processing all dataset files:  72%|███████▏  | 13/18 [02:33<01:09, 13.96s/it]

✓ Processed CSV: DCAD-v1.0-main.csv (0 examples)


Processing all dataset files:  78%|███████▊  | 14/18 [02:40<00:49, 12.30s/it]

✓ Processed PDF: dictionary.pdf (1874 chunks)


Processing all dataset files:  89%|████████▉ | 16/18 [02:56<00:19,  9.64s/it]

✓ Processed PDF: Farrow-artillery.pdf (713 chunks)
✓ Processed PDF: JDET-Intelligence-Centre.pdf (33 chunks)
✓ Processed CSV: List_of_countries_by_number_of_military_and_paramilitary_personnel_2023.csv (18 examples)


Processing all dataset files: 100%|██████████| 18/18 [02:57<00:00,  9.86s/it]

✓ Processed PDF: RPsfDIACoEs06032025.pdf (37 chunks)

📊 Complete Dataset Processing Summary:
   Total files: 18
   PDF files: 11
   CSV files: 7
   TXT files: 0
   JSON files: 0
   Training examples: 8,059

🎯 Complete dataset ready for training!
   Training examples: 8,059
   Processed data saved to 'complete_defense_training_data.json'

📝 Sample training examples:
   Example 1: Defense Document (20140808_MOD_Acronyms_and_Abbreviations.pdf): Acronym  Long Title  1ACC   No. 1 Air Control Centre  1SL  First Sea Lord  200D  Second OOD  200W  Second 00W  2C  Second Customer   2C ...
   Example 2: Defense Document (20140808_MOD_Acronyms_and_Abbreviations.pdf): AC & CS  Air Command  & Control Systems  Ac Comd  Aircraft Commander   Ac Tech  Aircraft Technician   AC(H)  Aircraft Controller  (Helic...
   Example 3: Defense Document (20140808_MOD_Acronyms_and_Abbreviations.pdf): AFMM (M)  Assistant Functional  Machinery  Manager (Maintenance)   AFMM (P)  Assistant Functional  Machinery  Manager 




In [5]:
def tokenize_defense_data(examples, tokenizer, max_length=512):
    """Tokenize the defense training data"""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Create training dataset from processed content
if dataset_content:
    # Create Hugging Face dataset
    train_dataset = Dataset.from_dict({"text": dataset_content})
    
    # Tokenize dataset
    tokenized_dataset = train_dataset.map(
        lambda x: tokenize_defense_data(x, defense_gpt.tokenizer),
        batched=True,
        remove_columns=["text"]
    )
    
    print(f"✅ Training dataset created:")
    print(f"   Original examples: {len(dataset_content):,}")
    print(f"   Tokenized examples: {len(tokenized_dataset):,}")
    
else:
    tokenized_dataset = None
    print("❌ No dataset to tokenize. Please add files to defense_dataset folder.")

Map:   0%|          | 0/8059 [00:00<?, ? examples/s]

✅ Training dataset created:
   Original examples: 8,059
   Tokenized examples: 8,059


In [6]:
def setup_complete_training_configuration():
    """Optimized training configuration for complete 96MB defense dataset"""
    
    return TrainingArguments(
        output_dir="./defense_complete_model",
        overwrite_output_dir=True,
        
        # Epochs - optimal for 96MB dataset with PDFs
        num_train_epochs=4,
        
        # Batch size optimized for RTX 4050
        per_device_train_batch_size=3,  # Slightly smaller for PDF processing
        gradient_accumulation_steps=5,  # Effective batch size of 15
        
        # Learning rate for domain-specific fine-tuning
        learning_rate=3e-5,
        
        # Optimization settings
        warmup_steps=250,
        weight_decay=0.01,
        
        # Memory and performance
        fp16=True,  # Essential for RTX 4050
        dataloader_pin_memory=True,
        
        # Monitoring and saving
        logging_steps=50,
        save_steps=500,
        
        # Regularization
        max_grad_norm=1.0,
        
        # Advanced settings
        remove_unused_columns=False,
        save_total_limit=2,
    )

# Setup training configuration
training_args = setup_complete_training_configuration()
data_collator = DataCollatorForLanguageModeling(
    tokenizer=defense_gpt.tokenizer,
    mlm=False,  # GPT-2 uses causal LM
)

print("🔧 Training configuration ready for complete 96MB dataset with PDFs")

🔧 Training configuration ready for complete 96MB dataset with PDFs


In [8]:
def train_complete_defense_encyclopedia():
    """Train the complete defense encyclopedia model"""
    if tokenized_dataset is None:
        print("❌ No training dataset available!")
        return
    
    # Initialize trainer
    trainer = Trainer(
        model=defense_gpt.model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset,
        tokenizer=defense_gpt.tokenizer,
    )
    
    print("🚀 Starting complete defense encyclopedia training...")
    print(f"   Training examples: {len(tokenized_dataset):,}")
    print(f"   Data sources: PDFs, CSVs, TXTs, JSONs")
    print(f"   Expected time: 3-5 hours on RTX 4050")
    
    # Start training
    trainer.train()
    
    # Save the trained model
    trainer.save_model("./defense_encyclopedia_complete")
    defense_gpt.tokenizer.save_pretrained("./defense_encyclopedia_complete")
    
    print("✅ Complete training finished and model saved!")
    return trainer

# Train the model if dataset is ready
if tokenized_dataset:
    trained_model = train_complete_defense_encyclopedia()
else:
    print("⚠️ Skipping training - no dataset available")

  trainer = Trainer(


🚀 Starting complete defense encyclopedia training...
   Training examples: 8,059
   Data sources: PDFs, CSVs, TXTs, JSONs
   Expected time: 3-5 hours on RTX 4050


Step,Training Loss
50,3.16
100,3.2611
150,3.2111
200,3.2107
250,3.2251
300,3.245
350,3.1996
400,3.1433
450,3.1641
500,3.1658


✅ Complete training finished and model saved!


In [9]:
def test_complete_defense_encyclopedia():
    """Test the trained complete defense encyclopedia"""
    # Load the trained model if it exists
    if os.path.exists("./defense_encyclopedia_complete"):
        print("Loading complete trained defense encyclopedia...")
        defense_gpt.model = GPT2LMHeadModel.from_pretrained("./defense_encyclopedia_complete")
        defense_gpt.tokenizer = GPT2Tokenizer.from_pretrained("./defense_encyclopedia_complete")
        defense_gpt.model.to(device)
    
    # Test queries for defense topics
    test_queries = [
        "What is a ballistic missile?",
        "Explain radar technology",
        "Define electronic warfare",
        "What is military strategy?",
        "Describe air defense systems",
        "What is cyber warfare?",
        "Explain stealth technology",
        "What are guided munitions?"
    ]
    
    print("🎯 Testing Complete Defense Encyclopedia:")
    print("=" * 60)
    
    for query in test_queries:
        print(f"\n❓ Query: {query}")
        response = defense_gpt.generate_defense_response(query, max_length=250)
        print(f"🤖 Response: {response}")
        print("-" * 40)

# Test the trained model
test_complete_defense_encyclopedia()

Loading complete trained defense encyclopedia...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


🎯 Testing Complete Defense Encyclopedia:

❓ Query: What is a ballistic missile?
🤖 Response: A ballistic rocket is primarily a defense against enemy missile threats. The main use ballistic missiles is to attack enemy defense forces and the main component of this defense is defense of the strategic area. A defense that is defended against an attack by ballistic rockets will produce a significant boost to national defense. See also ballistic bombing. defense; ballistic defense, also defense strategy. (JP 4-01.7) ballistic cruise missile -A ballistic ballistic weapon with a payload of up to six tons. Also called BSLM. called BM. ballistic trajectory -(*) 1. In naval mine warfare, the path the projectile travels through an area to provide a target for a missile attack. 2. An angle of attack on the weapon. 3. 1.-A line of departure for the ballistic launcher. 4. For surface-to-surface ballistic projectiles, a line connecting the launching of a rocket and its launch system. 5. To launch an ai

In [10]:
def interactive_complete_defense_encyclopedia():
    """Interactive interface for your complete defense encyclopedia"""
    print("🛡️ Complete Defense Encyclopedia - Interactive Mode")
    print("Ask questions about defense, military, and security topics")
    print("Trained on PDFs, CSVs, TXTs, and JSON files")
    print("Type 'quit' to exit, 'stats' to see dataset statistics")
    print("=" * 60)
    
    while True:
        query = input("\n🔍 Ask about defense topics: ").strip()
        
        if query.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        elif query.lower() == 'stats':
            if 'processing_stats' in locals():
                print(f"\n📊 Complete Dataset Statistics:")
                for key, value in processing_stats.items():
                    print(f"   {key}: {value}")
            continue
        
        if query:
            try:
                response = defense_gpt.generate_defense_response(query, max_length=300)
                print(f"\n🛡️ Defense Encyclopedia:")
                print(response)
            except Exception as e:
                print(f"❌ Error: {e}")
        else:
            print("Please enter a valid query.")

# Start interactive mode
print("📝 Run this to start interactive testing:")
print("interactive_complete_defense_encyclopedia()")


📝 Run this to start interactive testing:
interactive_complete_defense_encyclopedia()


In [11]:
def validate_complete_96mb_dataset():
    """Validate your complete 96MB dataset with PDFs"""
    
    if not os.path.exists("defense_dataset"):
        print("❌ Defense dataset folder not found!")
        return False
    
    total_size = 0
    file_count = 0
    format_count = {}
    
    for root, dirs, files in os.walk("defense_dataset"):
        for file in files:
            if file.endswith(('.pdf', '.csv', '.txt', '.json', '.jsonl')):
                file_path = os.path.join(root, file)
                size = os.path.getsize(file_path)
                total_size += size
                file_count += 1
                
                ext = os.path.splitext(file)[1].lower()
                format_count[ext] = format_count.get(ext, 0) + 1
    
    size_mb = total_size / (1024 * 1024)
    
    print(f"📊 Complete Dataset Validation Results:")
    print(f"   📁 Files found: {file_count}")
    print(f"   💾 Total size: {size_mb:.1f} MB")
    print(f"   📝 File formats:")
    for ext, count in format_count.items():
        print(f"      {ext}: {count} files")
    
    status = "✅ Perfect for training!" if 80 <= size_mb <= 120 else "⚠️ Size outside optimal range"
    print(f"   🎯 Status: {status}")
    
    return 80 <= size_mb <= 120

# Validate your complete dataset
validate_complete_96mb_dataset()

📊 Complete Dataset Validation Results:
   📁 Files found: 18
   💾 Total size: 96.0 MB
   📝 File formats:
      .pdf: 11 files
      .csv: 7 files
   🎯 Status: ✅ Perfect for training!


True