# Industrial Document Analyzer (AI-powered)
This notebook extracts text from PDF/TXT documents, generates summaries using a Hugging Face model, and extracts keywords.
Technologies: Python, Hugging Face Transformers, pdfplumber, YAKE.

---


In [None]:
# Install required libraries
!pip install -q transformers pdfplumber yake sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Imports and helper functions
import io, re
import pdfplumber
from transformers import pipeline
import yake

def extract_text_from_pdf_bytes(pdf_bytes):
    """Return the text of every page of an in-memory PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        One string in which each page that produced text is followed by a
        newline. Pages for which ``extract_text()`` returns nothing are
        skipped, so the result is ``""`` when no page yields text.
    """
    stream = io.BytesIO(pdf_bytes)
    with pdfplumber.open(stream) as document:
        # Consume the generator inside the `with` so pages are read while
        # the document is still open.
        extracted = (page.extract_text() for page in document.pages)
        return "".join(content + "\n" for content in extracted if content)

def extract_text_from_txt_bytes(txt_bytes, encoding='utf-8', errors='strict'):
    """Decode the raw bytes of a text file into a string.

    Args:
        txt_bytes: Raw file content as ``bytes``.
        encoding: Codec used to decode the bytes (default ``'utf-8'``).
        errors: Error handler passed to ``bytes.decode``. The default
            ``'strict'`` keeps the previous behaviour of raising on invalid
            bytes; pass ``'replace'`` or ``'ignore'`` to tolerate files that
            contain a few undecodable bytes.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are invalid for ``encoding`` and
            ``errors`` is ``'strict'``.
    """
    return txt_bytes.decode(encoding, errors)

def _split_oversized_sentence(sentence, max_chars):
    """Cut one over-long sentence into pieces of at most ``max_chars`` characters."""
    pieces = []
    rest = sentence
    while len(rest) > max_chars:
        window = rest[:max_chars + 1]
        # Prefer the last whitespace inside the window so words stay intact;
        # fall back to a hard cut when the window holds one unbroken token.
        cut = max(window.rfind(" "), window.rfind("\n"), window.rfind("\t"))
        if cut <= 0:
            cut = max_chars
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def chunk_text_by_sentences(text, max_chars=1000, overlap_chars=200):
    """Split text into sentence-aligned chunks, optionally overlapping.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, and
    sentences are greedily packed (joined with a single space) into chunks of
    at most ``max_chars`` characters. A single sentence longer than
    ``max_chars`` is itself cut into pieces (at whitespace when possible) so
    that text without sentence punctuation cannot produce an arbitrarily large
    chunk.

    Args:
        text: Text to split.
        max_chars: Maximum size of a chunk before overlap is added.
        overlap_chars: Number of trailing characters of the previous chunk
            that are prepended (plus a space) to each following chunk.
            ``0`` or less disables overlap.

    Returns:
        List of chunk strings; empty when ``text`` has no non-whitespace
        content. With overlap enabled, every chunk after the first can be up
        to ``overlap_chars + 1`` characters longer than ``max_chars``.
    """
    pieces = []
    for sent in re.split(r'(?<=[\.!?])\s+', text):
        sent = sent.strip()
        if not sent:
            continue
        if max_chars > 0 and len(sent) > max_chars:
            pieces.extend(_split_oversized_sentence(sent, max_chars))
        else:
            pieces.append(sent)

    chunks, current = [], ""
    for piece in pieces:
        if not current:
            current = piece
        elif len(current) + 1 + len(piece) <= max_chars:
            current = current + " " + piece
        else:
            chunks.append(current)
            current = piece
    if current:
        chunks.append(current)

    if overlap_chars > 0 and len(chunks) > 1:
        overlapped = [chunks[0]]
        # Take the overlap from the previous *original* chunk: using the
        # already-overlapped one would carry older text forward repeatedly
        # whenever chunks are shorter than overlap_chars.
        for prev, ch in zip(chunks, chunks[1:]):
            overlapped.append(prev[-overlap_chars:] + " " + ch)
        chunks = overlapped
    return chunks

_summarizer = None
_summarizer_model_name = None  # model the cached pipeline was built for

def get_summarizer(model_name="sshleifer/distilbart-cnn-12-6", device=-1):
    """Return a cached summarization pipeline, building it on first use.

    The pipeline is cached in the module-level ``_summarizer`` and reused as
    long as the same ``model_name`` is requested. Asking for a different model
    rebuilds the pipeline instead of silently returning the previous one.

    Args:
        model_name: Model identifier passed to ``pipeline`` as ``model``.
        device: Passed to ``pipeline`` as ``device``. It is only applied when
            the pipeline is (re)built; a cached pipeline for the same model is
            returned as-is regardless of this value.

    Returns:
        The summarization pipeline object.
    """
    global _summarizer, _summarizer_model_name
    if _summarizer is None or _summarizer_model_name != model_name:
        print("Loading summarization model...")
        _summarizer = pipeline("summarization", model=model_name, device=device)
        _summarizer_model_name = model_name
    return _summarizer

def summarize_long_text(text, model_name="sshleifer/distilbart-cnn-12-6"):
    """Summarize text of arbitrary length with the cached summarization model.

    Text shorter than 1200 characters is summarized in a single call. Longer
    text is split with ``chunk_text_by_sentences``, each chunk is summarized,
    and the partial summaries are joined. If the joined text is still too
    large to hand to the model in one piece, it is chunked and summarized
    again until it is small enough (or stops shrinking); only then is the
    final summary produced. Previously the joined partial summaries were
    passed to the model unconditionally, which overflowed the model's maximum
    input length on long documents.

    Args:
        text: Text to summarize.
        model_name: Model identifier forwarded to ``get_summarizer``.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or whitespace.
    """
    if not text or not text.strip():
        return ""

    summarizer = get_summarizer(model_name)

    def run(piece, max_length, min_length):
        # truncation=True is a safety net: an input that is still too long is
        # cut to the model's limit instead of raising an indexing error.
        out = summarizer(piece, max_length=max_length, min_length=min_length,
                         do_sample=False, truncation=True)
        return out[0]['summary_text']

    if len(text) < 1200:
        return run(text, 150, 30)

    # Character budget for the final single-pass summary; chosen conservatively
    # so the merged text stays below the model's token limit.
    final_input_limit = 3000

    merged = text
    while True:
        chunks = chunk_text_by_sentences(merged, max_chars=1000, overlap_chars=200)
        condensed = " ".join(run(chunk, 150, 30) for chunk in chunks)
        shrunk = len(condensed) < len(merged)
        merged = condensed
        # Stop once the text fits, or if a round failed to shrink it
        # (guards against looping forever).
        if len(merged) <= final_input_limit or not shrunk:
            break
    return run(merged, 180, 50)

def extract_keywords(text, max_keywords=10):
    """Extract keywords from ``text`` with YAKE.

    Args:
        text: Text to analyse.
        max_keywords: Value passed to the extractor as ``top``.

    Returns:
        Whatever the YAKE extractor's ``extract_keywords`` returns for
        ``text``.
    """
    settings = {
        "lan": "en",
        "n": 3,
        "dedupLim": 0.9,
        "top": max_keywords,
        "features": None,
    }
    extractor = yake.KeywordExtractor(**settings)
    keywords = extractor.extract_keywords(text)
    return keywords


In [None]:
# Upload a file and analyze it
# (google.colab is only available inside Google Colab, hence the local import.)
from google.colab import files

uploaded = files.upload()
for filename, filebytes in uploaded.items():
    print("\n--- Processing:", filename)
    lowered = filename.lower()
    if lowered.endswith('.pdf'):
        text = extract_text_from_pdf_bytes(filebytes)
    elif lowered.endswith('.txt'):
        text = extract_text_from_txt_bytes(filebytes)
    else:
        print("Unsupported file type:", filename)
        continue

    print("Extracted characters:", len(text))
    # Image-only (scanned) PDFs and empty files yield no text; there is
    # nothing to summarize, so skip them instead of calling the model.
    if not text.strip():
        print("No extractable text found; skipping:", filename)
        continue

    print("\n--- Preview (first 800 chars) ---\n")
    print(text[:800])

    summary = summarize_long_text(text)
    print("\n# SUMMARY ####################\n")
    print(summary)

    keywords = extract_keywords(text, max_keywords=12)
    print("\n# KEYWORDS ###################\n")
    for kw, score in keywords:
        print(f"{kw} ({score:.4f})")


Saving AI-Language-Conservation-Neural-Networks-Endangered-Language-Revitalization.pdf to AI-Language-Conservation-Neural-Networks-Endangered-Language-Revitalization.pdf

--- Processing: AI-Language-Conservation-Neural-Networks-Endangered-Language-Revitalization.pdf
Extracted characters: 67323

--- Preview (first 800 chars) ---

Artificial Intelligence in Language Conservation: Exploring Neural
Networks for Endangered Language Revitalization
Abstract
The accelerating global phenomenon of language endangerment threatens to erase nearly half of the world's approximately 7,000
languages by the end of this century, resulting in irreparable losses to cultural heritage, traditional knowledge systems, and linguistic
diversity. This research presents a comprehensive investigation into the transformative potential of artificial intelligence (AI) and
neural network architectures for endangered language revitalization. We propose a novel hybrid framework that integrates meta-
learning algorithms 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 150, but your input_length is only 144. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)
Your max_length is set to 150, but your input_length is only 133. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)
Token indices sequence length is longer than the specified maximum sequence length for this model (4364 > 1024). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

---
## Next steps / Improvements
- Add OCR (Tesseract) for scanned PDFs.
- Integrate Streamlit for a simple web app interface.
- Add Named Entity Recognition (NER) for extracting equipment IDs, dates, etc.

This notebook is ready for testing and demonstration.
