In [3]:
#!pip install -q transformers torch scikit-learn matplotlib seaborn pandas numpy scipy pdfplumber PyPDF2

In [4]:
#!pip install -q pdfplumber --no-deps

In [5]:
# Import necessary libraries
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import os
import pdfplumber
from pathlib import Path

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

Using device: cuda
PyTorch version: 2.6.0+cu124


In [6]:
# Load Academic Documents from PDF Subfolders
def extract_text_from_pdf(pdf_path, max_pages=5):
    """
    Extract text from the leading pages of a PDF file.

    Args:
        pdf_path: Path to PDF file
        max_pages: Maximum number of leading pages to extract (for
            performance). Pass None to read every page.

    Returns:
        text: Per-page text joined with single spaces and stripped.
            An empty string is returned (and the error printed) if the
            file cannot be opened or parsed.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            # Limit to max_pages for performance (None means "all pages")
            num_pages = len(pages) if max_pages is None else min(len(pages), max_pages)

            # extract_text() may yield None for a page without extractable
            # text (e.g. a scanned image page in some pdfplumber versions).
            # Substitute "" so one such page does not raise a TypeError and
            # cause the whole document's text to be discarded below.
            page_texts = [
                pages[page_num].extract_text() or ""
                for page_num in range(num_pages)
            ]

        return " ".join(page_texts).strip()

    except Exception as e:
        # Best-effort loader: report the failure and let the caller decide
        # what to do with an empty document.
        print(f"Error reading {pdf_path}: {e}")
        return ""

In [9]:
def load_documents_from_folders(base_path='/kaggle/input/academic-documents/academic_documents',
                                max_pages=5, subject_folders=None):
    """
    Load documents from subject subfolders.

    Expected folder structure (default subjects):
    documents/
    ├── Physics/
    ├── Chemistry/
    ├── Mathematics/
    ├── Computer_Science/
    └── Civil_Engineering/

    Args:
        base_path: Path to the documents folder
        max_pages: Maximum pages to extract per PDF; forwarded to
            extract_text_from_pdf
        subject_folders: Optional mapping of domain label -> folder name.
            Defaults to the five subjects listed above.

    Returns:
        df: DataFrame with document_id, text, domain, source_file,
            file_path, text_length. The columns are present even when no
            document was found, so downstream column access does not fail
            with a KeyError on an empty result.
    """
    columns = ['document_id', 'text', 'domain', 'source_file', 'file_path', 'text_length']

    # Define subjects and expected folder names
    if subject_folders is None:
        subject_folders = {
            'Physics': 'Physics',
            'Chemistry': 'Chemistry',
            'Mathematics': 'Mathematics',
            'Computer Science': 'Computer_Science',
            'Civil Engineering': 'Civil_Engineering'
        }
    expected_names = list(subject_folders.values())

    documents_list = []

    print("=" * 80)
    print("LOADING DOCUMENTS FROM PDF SUBFOLDERS")
    print("=" * 80)

    # Iterate through each subject folder
    for domain_name, folder_name in subject_folders.items():
        folder_path = os.path.join(base_path, folder_name)

        # Skip (with a hint) when the subject folder is missing or is not a directory
        if not os.path.isdir(folder_path):
            print(f"\n⚠ Warning: Folder not found: {folder_path}")
            print(f"  Expected folder structure:")
            print(f"  {base_path}/")
            for position, name in enumerate(expected_names):
                connector = "└──" if position == len(expected_names) - 1 else "├──"
                print(f"  {connector} {name}/")
            continue

        # Get all PDF files in folder; match the extension case-insensitively
        # so files named e.g. "NOTES.PDF" are not silently skipped
        pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))

        print(f"\n{domain_name}: Found {len(pdf_files)} PDF files")

        # Process each PDF file
        for pdf_file in pdf_files:
            pdf_path = os.path.join(folder_path, pdf_file)

            # Extract text from PDF
            print(f"  Processing: {pdf_file}...", end=" ")
            text = extract_text_from_pdf(pdf_path, max_pages=max_pages)

            # Flag extractions that look like they failed
            if len(text) < 50:
                print(f"⚠ (very short text - {len(text)} chars)")
            else:
                print(f"✓ ({len(text)} chars)")

            # document_id is the running position in the output list
            documents_list.append({
                'document_id': len(documents_list),
                'text': text,
                'domain': domain_name,
                'source_file': pdf_file,
                'file_path': pdf_path,
                'text_length': len(text)
            })

    # Create DataFrame with a fixed schema, even if nothing was loaded
    df = pd.DataFrame(documents_list, columns=columns)

    return df

In [11]:
# Build the corpus DataFrame from the per-subject PDF folders
df = load_documents_from_folders(base_path='/kaggle/input/academic-documents/academic_documents')

banner = "=" * 80
print("\n" + banner)
print("DOCUMENT LOADING SUMMARY")
print(banner)

# Corpus size and per-domain counts
print(f"\nTotal documents loaded: {len(df)}")
print(f"\nDomain distribution:")
print(df['domain'].value_counts())

# Character-length statistics of the raw extracted text
text_lengths = df['text_length']
print(f"\n--- Document Statistics ---")
print(f"Total text characters: {text_lengths.sum():,}")
print(f"Average document length: {text_lengths.mean():.0f} characters")
print(f"Min document length: {text_lengths.min()} characters")
print(f"Max document length: {text_lengths.max()} characters")

# Show the first document of every domain, in order of first appearance
print("\n--- Sample Documents ---")
for domain in df['domain'].unique():
    sample = df.loc[df['domain'] == domain].iloc[0]
    preview = sample['text'][:200]

    print(f"\n{domain} - {sample['source_file']}:")
    print(f"  Text preview: {preview}...")
    print(f"  Length: {sample['text_length']} characters")

LOADING DOCUMENTS FROM PDF SUBFOLDERS

Physics: Found 11 PDF files
  Processing: Angular_Momentum.pdf... ✓ (5641 chars)
  Processing: Electron_in_periodic_solid.pdf... ✓ (9332 chars)
  Processing: Equilibrium_Thermodynamic_Parameters.pdf... ✓ (8357 chars)
  Processing: Gibbs_Paradox_and_Indistinguishability_of_Particle.pdf... ✓ (4415 chars)
  Processing: Group_Theory_Axioms_Properties.pdf... ✓ (4998 chars)
  Processing: Heisenberg’s_Uncertainty_Principle.pdf... ✓ (8199 chars)
  Processing: Linear_Vector_Spaces_and_Operators.pdf... ✓ (4767 chars)
  Processing: Maxwell’s_Equations_1.pdf... ✓ (3513 chars)
  Processing: Quantum_Mechanical_Concepts.pdf... ✓ (6128 chars)
  Processing: Second_Quantization.pdf... ✓ (5117 chars)
  Processing: Theory_of_Relativity.pdf... ✓ (6282 chars)

Chemistry: Found 10 PDF files
  Processing: Biomineralization.pdf... ✓ (6132 chars)
  Processing: Determination_o_ Stability_Constant_by_Polarographic.pdf... ✓ (6342 chars)
  Processing: Greenhouse_effect.pdf... 

Cannot set gray non-stroke color because /'p93' is an invalid float value


✓ (4481 chars)
  Processing: QUANTUM_CHEMISTRY.pdf... ✓ (4580 chars)
  Processing: Thermal_Power_Plant.pdf... ✓ (7955 chars)
  Processing: Waste_Water_Purification.pdf... ✓ (6519 chars)

Mathematics: Found 12 PDF files
  Processing: Cauchy’s_theorem_and_it’s_consequences.pdf... ✓ (7490 chars)
  Processing: Finite_and_Infinite_sets.pdf... ✓ (8740 chars)
  Processing: Introduction_to_Continuity.pdf... ✓ (6363 chars)
  Processing: LINEAR_TRANSFORMATIONS.pdf... ✓ (5799 chars)
  Processing: Lagrange’s_Interpolation.pdf... ✓ (5078 chars)
  Processing: Least_Squares_Method.pdf... ✓ (4948 chars)
  Processing: MATRIX_REPRESENTATIONS.pdf... ✓ (5114 chars)
  Processing: Nilpotent_groups.pdf... ✓ (7631 chars)
  Processing: Quotient_Space.pdf... ✓ (6416 chars)
  Processing: RANK_OF_A_MATRIX.pdf... ✓ (6876 chars)
  Processing: Singular_Homology_Groups.pdf... ✓ (7054 chars)
  Processing: VECTOR_SPACES.pdf... ✓ (4120 chars)

Computer Science: Found 12 PDF files
  Processing: Analysis_of_Algorithms.pdf

Cannot set gray non-stroke color because /'p11' is an invalid float value
Cannot set gray non-stroke color because /'p41' is an invalid float value
Cannot set gray non-stroke color because /'p70' is an invalid float value
Cannot set gray non-stroke color because /'p98' is an invalid float value
Cannot set gray non-stroke color because /'p126' is an invalid float value


✓ (6133 chars)
  Processing: DATABASE_RECOVERY.pdf... ✓ (1660 chars)
  Processing: Introduction_to_Operating_Systems.pdf... ✓ (11787 chars)
  Processing: Linked_List_Implementation_of_List_ADT.pdf... ✓ (9523 chars)
  Processing: NoSQL.pdf... ✓ (8919 chars)
  Processing: Normalisation.pdf... ✓ (4356 chars)
  Processing: Perception_and_Colours.pdf... ✓ (12438 chars)
  Processing: Relational_Algebra.pdf... ✓ (3865 chars)
  Processing: Virtual_Memory.pdf... ✓ (9876 chars)

Civil Engineering: Found 5 PDF files
  Processing: CONCRETE_TECHNOLOGY.pdf... ✓ (12512 chars)
  Processing: DESIGN_OF_STEEL_STRUCTURE.pdf... ✓ (10237 chars)
  Processing: FOUNDATION_ENGINEERING.pdf... ✓ (6018 chars)
  Processing: HYDROLOGY_AND_IRRIGATION_ENGINEERING.pdf... ✓ (8179 chars)
  Processing: SURVEYING.pdf... ✓ (10061 chars)

DOCUMENT LOADING SUMMARY

Total documents loaded: 50

Domain distribution:
domain
Mathematics          12
Computer Science     12
Physics              11
Chemistry            10
Civil Engin

In [12]:
# Data Cleaning for Extracted PDF Text
def clean_pdf_text(text):
    """
    Normalise whitespace in text extracted from a PDF.

    NUL bytes are deleted, every run of whitespace (spaces, tabs,
    newlines, carriage returns) becomes a single space, and leading or
    trailing whitespace is dropped.

    Args:
        text: Raw text from PDF

    Returns:
        cleaned_text: Cleaned text
    """
    # Delete NUL bytes first so any whitespace left around them is merged
    # by the split/join below.
    without_nulls = text.replace('\x00', '')

    # str.split() with no separator splits on any whitespace run and never
    # yields empty tokens, so re-joining gives single-spaced, trimmed text.
    words = without_nulls.split()
    return ' '.join(words)

# Apply cleaning to all documents
print("\nCleaning extracted PDF text...")

# Documents shorter than this (after cleaning) are treated as failed extractions
min_text_length = 100

# Capture the pre-cleaning average before 'text_length' is refreshed below
avg_length_before = df['text_length'].mean()

df['text'] = df['text'].apply(clean_pdf_text)
# Keep the cached length column in sync with the cleaned text; otherwise
# 'text_length' would keep describing the raw, uncleaned extraction.
df['text_length'] = df['text'].str.len()

print(f"\n--- After Cleaning ---")
print(f"Average text length: {avg_length_before:.0f} → {df['text_length'].mean():.0f} characters")
print(f"Documents with very short text (<{min_text_length} chars): {(df['text_length'] < min_text_length).sum()}")

# Filter out documents with very short text (likely failed extraction)
df_filtered = df[df['text_length'] >= min_text_length].copy()

if len(df_filtered) < len(df):
    print(f"\n⚠ Warning: Removed {len(df) - len(df_filtered)} documents with insufficient text")
    print("  These documents may have failed to extract properly")

# Re-index so row positions are contiguous after filtering
df = df_filtered.reset_index(drop=True)

print(f"\n✓ Final document count: {len(df)}")


Cleaning extracted PDF text...

--- After Cleaning ---
Average text length: 7003 → 7003 characters
Documents with very short text (<100 chars): 0

✓ Final document count: 50


In [13]:
# Document Validation and Quality Check
def validate_documents(df, expected_domains=None):
    """
    Validate that documents are suitable for embedding.

    Args:
        df: DataFrame with documents; must provide 'text' and 'domain'
            columns
        expected_domains: Optional iterable of domain labels that should
            each have at least one document. When omitted, no
            missing-domain check is performed (a domain with zero
            documents cannot be discovered from the frame itself).

    Returns:
        validation_report: Dictionary with validation results:
            total_documents, documents_by_domain, min/max/avg_text_length,
            documents_with_content (texts longer than 50 characters) and
            issues (list of human-readable problem descriptions).
    """
    # Compute the per-document character counts once and reuse them
    text_lengths = df['text'].str.len()

    report = {
        'total_documents': len(df),
        'documents_by_domain': df['domain'].value_counts().to_dict(),
        'min_text_length': text_lengths.min(),
        'max_text_length': text_lengths.max(),
        'avg_text_length': text_lengths.mean(),
        'documents_with_content': (text_lengths > 50).sum(),
        'issues': []
    }

    # Check for missing values
    missing_count = df['text'].isnull().sum()
    if missing_count > 0:
        report['issues'].append(f"Found {missing_count} documents with missing text")

    # Check for empty documents
    empty_docs = (text_lengths == 0).sum()
    if empty_docs > 0:
        report['issues'].append(f"Found {empty_docs} empty documents")

    # Check domain distribution against the caller's expectations.
    # Iterating over the domains present in df could never find a missing
    # one, so the comparison needs an explicit expected list.
    if expected_domains is not None:
        present_domains = set(df['domain'].unique())
        for domain in expected_domains:
            if domain not in present_domains:
                report['issues'].append(f"No documents found for domain: {domain}")

    # Check for duplicate documents (identical text)
    duplicates = df['text'].duplicated().sum()
    if duplicates > 0:
        report['issues'].append(f"Found {duplicates} duplicate documents")

    return report

# Run the quality checks on the cleaned corpus
validation = validate_documents(df)

banner = "=" * 80
print("\n" + banner)
print("DOCUMENT VALIDATION REPORT")
print(banner)

print(f"\nTotal Documents: {validation['total_documents']}")

# Per-domain document counts
print(f"\nDocuments by Domain:")
for domain, count in validation['documents_by_domain'].items():
    print(f"  {domain}: {count}")

# Character-length statistics of the cleaned text
print(f"\nText Length Statistics:")
print(f"  Minimum: {validation['min_text_length']} characters")
print(f"  Maximum: {validation['max_text_length']} characters")
print(f"  Average: {validation['avg_text_length']:.0f} characters")
print(f"  Documents with content: {validation['documents_with_content']}")

# Either list every detected problem or confirm the corpus is clean
if not validation['issues']:
    print(f"\n✓ No issues found - documents are ready for embedding!")
else:
    print(f"\n⚠ Issues Found:")
    for issue in validation['issues']:
        print(f"  - {issue}")


DOCUMENT VALIDATION REPORT

Total Documents: 50

Documents by Domain:
  Mathematics: 12
  Computer Science: 12
  Physics: 11
  Chemistry: 10
  Civil Engineering: 5

Text Length Statistics:
  Minimum: 1660 characters
  Maximum: 12671 characters
  Average: 7003 characters
  Documents with content: 50

✓ No issues found - documents are ready for embedding!


In [17]:
# # --- FIX: Resolve Protobuf Compatibility Error on Kaggle ---
# # Install a protobuf version that still has MessageFactory.GetPrototype
# !pip install -q "protobuf==3.20.3" "googleapis-common-protos<2"

# import os
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# print("Protobuf version fix applied. NOW RESTART THE KERNEL and rerun all cells.")

In [16]:
# Next step: Load BERT Model and Tokenizer
from transformers import AutoTokenizer, AutoModel

banner = "=" * 80
print("\n" + banner)
print("LOADING BERT MODEL FOR SENTENCE EMBEDDINGS")
print(banner)

model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"\nLoading model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder, move it to the selected device and switch it to
# inference mode (.to() and .eval() each return the module itself)
model = AutoModel.from_pretrained(model_name).to(device).eval()

num_parameters = sum(param.numel() for param in model.parameters())

print("✓ Model loaded successfully")
print(f"Model moved to: {device}")
print(f"Model size: {num_parameters:,} parameters")
print(f"\nTokenizer vocabulary size: {tokenizer.vocab_size}")
print(f"Maximum sequence length: {tokenizer.model_max_length}")

# Show how the tokenizer splits a short example sentence
sample_text = "Natural language processing is fascinating!"
tokens = tokenizer.tokenize(sample_text)
print("\n--- Tokenization Example ---")
print("Text:", sample_text)
print("Tokens:", tokens)


LOADING BERT MODEL FOR SENTENCE EMBEDDINGS

Loading model: sentence-transformers/all-MiniLM-L6-v2
✓ Model loaded successfully
Model moved to: cuda
Model size: 22,713,216 parameters

Tokenizer vocabulary size: 30522
Maximum sequence length: 512

--- Tokenization Example ---
Text: Natural language processing is fascinating!
Tokens: ['natural', 'language', 'processing', 'is', 'fascinating', '!']


In [18]:
# Sentence Embedding Function (Mean Pooling)
import numpy as np

def get_sentence_embedding(text, tokenizer, model, device, max_length=256):
    """
    Generate BERT sentence embedding for a text using mean pooling.

    Args:
        text: Input text to embed
        tokenizer: Tokenizer callable; invoked with max_length, padding,
            truncation and return_tensors="pt", and expected to return
            "input_ids" and "attention_mask" tensors
        model: Encoder callable whose output exposes `last_hidden_state`
        device: Device the input tensors are moved to before the forward pass
        max_length: Token budget passed to the tokenizer. The input is
            padded and truncated to this length, so for longer texts only
            the first `max_length` tokens contribute to the embedding.

    Returns:
        embedding: NumPy array holding the attention-mask-weighted mean of
            the token vectors, with size-1 dimensions squeezed out.
    """
    inputs = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

    last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden) for a single text

    # Mean pooling with attention mask: padded positions get weight 0
    mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    sum_embeddings = (last_hidden_state * mask_expanded).sum(1)
    # Clamp the token count so an all-zero mask yields zeros instead of NaN (0/0)
    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
    mean_embeddings = sum_embeddings / sum_mask

    embedding = mean_embeddings.cpu().numpy().squeeze()
    return embedding

banner = "=" * 80
print("\n" + banner)
print("TESTING EMBEDDING FUNCTION")
print(banner)

# Embed one short sentence and inspect the resulting vector
test_text = "Machine learning is a subset of artificial intelligence."
emb = get_sentence_embedding(test_text, tokenizer, model, device)

print("\nText:", test_text)
print("Embedding shape:", emb.shape)   # should be (384,) for all-MiniLM-L6-v2
print("First 10 values:", emb[:10])
print("Norm:", np.linalg.norm(emb))


TESTING EMBEDDING FUNCTION

Text: Machine learning is a subset of artificial intelligence.
Embedding shape: (384,)
First 10 values: [-0.12901162 -0.05818983  0.39556104  0.16732194  0.15705997 -0.21351758
 -0.1550703  -0.0893684  -0.31694397 -0.01851244]
Norm: 5.499391


In [19]:
# Generate Embeddings for All Documents
from tqdm import tqdm

banner = "=" * 80
print("\n" + banner)
print("GENERATING EMBEDDINGS FOR ALL DOCUMENTS")
print(banner)

# Embed each document in row order, with a progress bar
embeddings_list = [
    get_sentence_embedding(text, tokenizer, model, device)
    for text in tqdm(df["text"], desc="Generating embeddings")
]

# Stack into one (n_docs, hidden) matrix and also keep a per-row copy on df
embeddings = np.array(embeddings_list)
df["embedding"] = list(embeddings)

print("\n✓ Embeddings generated")
print("Embeddings shape:", embeddings.shape)


GENERATING EMBEDDINGS FOR ALL DOCUMENTS


Generating embeddings: 100%|██████████| 50/50 [00:00<00:00, 111.96it/s]


✓ Embeddings generated
Embeddings shape: (50, 384)





In [20]:
# Semantic Search with Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity  # built-in cosine similarity[web:53]

def semantic_search(query, embeddings, df, tokenizer, model, device, top_k=5):
    """
    Rank documents by cosine similarity to a free-text query.

    Args:
        query: Query text; embedded with get_sentence_embedding
        embeddings: 2-D array of document embeddings, one row per document.
            Row i must correspond to row position i of `df`.
        df: DataFrame providing 'document_id', 'domain', 'text' and
            optionally 'source_file' for each document
        tokenizer, model, device: Forwarded to get_sentence_embedding
        top_k: Maximum number of results. Values <= 0 return an empty
            list; values larger than the corpus return every document.

    Returns:
        results: List of dicts (document_id, domain, source_file, text,
            similarity), ordered from most to least similar.
    """
    query_emb = get_sentence_embedding(query, tokenizer, model, device)
    sims = cosine_similarity([query_emb], embeddings)[0]  # (n_docs,)

    # Reverse the ascending order first and then take the head. The former
    # `[-top_k:]` slice degenerates to `[-0:]` (= everything) for top_k == 0.
    limit = max(int(top_k), 0)
    top_idx = np.argsort(sims)[::-1][:limit]

    results = []
    for idx in top_idx:
        row = df.iloc[idx]  # look the row up once instead of once per field
        results.append({
            "document_id": int(row["document_id"]),
            "domain": row["domain"],
            "source_file": row.get("source_file", ""),
            "text": row["text"],
            "similarity": float(sims[idx])
        })
    return results

def print_search_results(query, top_k=5):
    """
    Run a semantic search for `query` and pretty-print the ranked hits.

    Uses the notebook-level `embeddings`, `df`, `tokenizer`, `model` and
    `device` objects. For each hit the rank, domain, document id,
    similarity (4 decimals), source file (when non-empty) and the first
    200 characters of the text are printed.

    Args:
        query: Query text
        top_k: Number of results requested from semantic_search

    Returns:
        The list of result dicts produced by semantic_search.
    """
    heavy_rule = "=" * 80
    print("\n" + heavy_rule)
    print("SEMANTIC SEARCH")
    print(heavy_rule)
    print("Query:", query)
    print("-" * 80)

    hits = semantic_search(query, embeddings, df, tokenizer, model, device, top_k)

    for rank, hit in enumerate(hits, start=1):
        headline = f"{rank}. [{hit['domain']}] (doc_id={hit['document_id']}, sim={hit['similarity']:.4f})"
        print(headline)

        # The file line is omitted when no source file is recorded
        if hit["source_file"]:
            print(f"   File: {hit['source_file']}")

        # Single-line preview of the first 200 characters
        preview = hit["text"][:200].replace("\n", " ")
        print("   Text preview:", preview, "...")
        print()

    return hits

# Quick smoke test: one query with no closely matching document in the corpus
# and one that should hit the Civil Engineering material.
# Each call prints its ranked results. As the last expression of the cell,
# the second call's return value (the raw list of result dicts, including the
# full document text) is also echoed as the cell output.
print_search_results("neural networks and deep learning", top_k=5)
print_search_results("bridge design and concrete", top_k=5)


SEMANTIC SEARCH
Query: neural networks and deep learning
--------------------------------------------------------------------------------
1. [Computer Science] (doc_id=33, sim=0.1902)
   File: Analysis_of_Algorithms.pdf
   Text preview: e-PGPathshala Subject : Computer Science Paper: Data Structures Module 5: Analysis of Algorithms Quadrant 1- e-text Welcome to the e-PG Pathshala Lecture Series on Data Structures. This time we are go ...

2. [Physics] (doc_id=10, sim=0.1434)
   File: Theory_of_Relativity.pdf
   Text preview: Discipline: Physics Subject: Electromagnetic Theory Unit 14: Lesson/ Module: Theory of Relativity - I Author (CW): Prof. V. K. Gupta Department/ University: Department of Physics and Astrophysics, Uni ...

3. [Mathematics] (doc_id=32, sim=0.1255)
   File: VECTOR_SPACES.pdf
   Text preview: CHAPTER 2. VECTOR SPACES MODULE 2. SUBSPACES INDRANATH SENGUPTA Contents 1. Subspaces of R n and C n 1 2. Linear combination of vectors 3 Let V be a vector space over a eld F. 

[{'document_id': 45,
  'domain': 'Civil Engineering',
  'source_file': 'CONCRETE_TECHNOLOGY.pdf',
  'text': "MODULE I CEMENT Cement is a binder, a substance that sets and hardens and can bind other materials together. Cements used in construction can be characterized as being either hydraulic or non-hydraulic, depending upon the ability of the cement to be used in the presence of water. Non-hydraulic cement will not set in wet conditions or underwater, rather it sets as it dries and reacts with carbon dioxide in the air. It can be attacked by some aggressive chemicals after setting. Hydraulic cement is made by replacing some of the cement in a mix with activated aluminum silicates, pozzolana, such as fly ash. The chemical reaction results in hydrates that are not very water-soluble and so are quite durable in water and safe from chemical attack. This allows setting in wet condition or underwater and further protects the hardened material from chemical attack (e.g., Portland cement). Us

In [26]:
# Simple Command-Line Search Loop
def interactive_search_loop():
    """
    Read queries from standard input and print search results until stopped.

    The loop ends when the user types 'exit' or 'quit' (case-insensitive),
    or when input() raises EOFError / KeyboardInterrupt (input stream
    closed, or the user interrupts the prompt). Queries shorter than
    3 characters after stripping are rejected with a hint. Every other
    query is passed to print_search_results with top_k=5.
    """
    print("\n" + "=" * 80)
    print("BERT SEMANTIC SEARCH ENGINE - INTERACTIVE MODE")
    print("=" * 80)
    print("Type a query and press Enter.")
    print("Type 'exit' or 'quit' to stop.\n")

    while True:
        try:
            q = input("Query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input or an interrupt at the prompt: leave the loop
            # cleanly instead of surfacing a traceback.
            print("\nExiting search.")
            break

        if q.lower() in ("exit", "quit"):
            print("Exiting search.")
            break
        if len(q) < 3:
            print("Please type a longer query.\n")
            continue
        print_search_results(q, top_k=5)

# Start the interactive search loop. This call is live: the cell blocks on
# input() until 'exit' or 'quit' is entered, so comment it out before an
# unattended "Run All".
interactive_search_loop()


BERT SEMANTIC SEARCH ENGINE - INTERACTIVE MODE
Type a query and press Enter.
Type 'exit' or 'quit' to stop.



Query:  cement is a binding material



SEMANTIC SEARCH
Query: cement is a binding material
--------------------------------------------------------------------------------
1. [Civil Engineering] (doc_id=45, sim=0.6400)
   File: CONCRETE_TECHNOLOGY.pdf
   Text preview: MODULE I CEMENT Cement is a binder, a substance that sets and hardens and can bind other materials together. Cements used in construction can be characterized as being either hydraulic or non-hydrauli ...

2. [Chemistry] (doc_id=13, sim=0.2533)
   File: Greenhouse_effect.pdf
   Text preview: ____________________________________________________________________________________________________ Subject Chemistry Paper No and Title Paper 4: Environmental Chemistry Module No and Title 22: Green ...

3. [Chemistry] (doc_id=19, sim=0.2373)
   File: Thermal_Power_Plant.pdf
   Text preview: ____________________________________________________________________________________________________ Subject Chemistry Paper No and Title Paper 4: Environmental Chemistry Module No 

Query:  exit


Exiting search.
