# Document Classifier V4

### This script iterates through files inside a specified folder and outputs the filename, 5 tags to describe the file, and an advanced summary of the file contents using the Huggingface Transformers library

In [None]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [1]:
import os
import pdfplumber
import docx
import pandas as pd
import json
import re
from transformers import pipeline, BartTokenizer
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models


def clean_text(text):
    """Additional cleaning to handle special cases in document text extraction."""
    text = re.sub(r'([a-zA-Z])-([a-zA-Z])', r'\1\2', text)
    text = re.sub(r'([a-zA-Z])([,.:;!?\)])', r'\1 \2', text)
    text = re.sub(r'([,.:;!?\(])([a-zA-Z])', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def extract_text(file_path):
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension.lower() in ['.doc', '.docx']:
        return extract_text_from_docx(file_path)
    elif file_extension.lower() == '.txt':
        return extract_text_from_txt(file_path)
    elif file_extension.lower() == '.csv':
        return extract_text_from_csv(file_path)
    elif file_extension.lower() == '.json':
        return extract_text_from_json(file_path)
    return ""

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = " ".join(page.extract_text() or '' for page in pdf.pages)
    return clean_text(full_text)

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    full_text = " ".join(para.text for para in doc.paragraphs if para.text.strip())
    return clean_text(full_text)

def extract_text_from_txt(txt_path):
    with open(txt_path, 'r', encoding='utf-8') as file:
        full_text = file.read()
    return clean_text(full_text)

def extract_text_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    full_text = " ".join(df.astype(str).sum(axis=1))
    return clean_text(full_text)

def extract_text_from_json(json_path):
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    full_text = ' '.join([str(value) for value in data.values()])
    return clean_text(full_text)

def preprocess_text(text):
    if not text:
        return []
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(re.sub(r'\W', ' ', text.lower()))
    words = [word for word in words if word not in stop_words and len(word) > 1]
    return words

def perform_topic_modeling(words):
    if not words:
        return []
    num_topics = 1
    num_words = 5
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    if not corpus:
        return []
    lda_model = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    topics = lda_model.print_topics(num_words=num_words)
    return topics

def summarize_text(text, summarizer, tokenizer, max_length=1024, min_length=50):
    """Generates a summary of the text, handles text longer than the model's maximum length limit."""
    cleaned_text = clean_text(text)
    tokens = tokenizer.encode(cleaned_text, return_tensors='pt', add_special_tokens=True)
    if tokens.size(1) > max_length:
        part_length = int(len(cleaned_text) / (tokens.size(1) / max_length + 1))
        parts = [cleaned_text[i:i + part_length] for i in range(0, len(cleaned_text), part_length)]
        summaries = [summarizer(part, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text'] for part in parts if len(part) > min_length]
        return ' '.join(summaries)
    else:
        summary = summarizer(cleaned_text, max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']

def correct_spelling(text):
    """Corrects spelling in the summarized text."""
    spell = SpellChecker()
    words = text.split()
    corrected_words = [spell.correction(word) if spell.unknown([word]) else word for word in words]
    return ' '.join(corrected_words)

def process_directory(directory_path, summarizer, tokenizer):
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            extracted_text = extract_text(file_path)
            words = preprocess_text(extracted_text)
            topics = perform_topic_modeling(words)
            tags = [word for _, topic in topics for word in re.findall(r'\"(.*?)\"', topic) if topics]
            summary = summarize_text(extracted_text, summarizer, tokenizer)
            print(f"File: {filename}")
            if tags:
                print(f"Tags: {tags}")
            print(f"Summary: {summary}\n")
            
directory_path = "testdocuments"
process_directory(directory_path, summarizer, tokenizer)


  "class": algorithms.Blowfish,
Token indices sequence length is longer than the specified maximum sequence length for this model (5101 > 1024). Running this sequence through the model will result in indexing errors
Your max_length is set to 1024, but you input_length is only 871. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=435)
Your max_length is set to 1024, but you input_length is only 908. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=454)
Your max_length is set to 1024, but you input_length is only 899. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=449)
Your max_length is set to 1024, but you input_length is only 793. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=396)
Your max_length is set to 1024, but you input_length is only 840. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=420)
Your

File: Financial-Analyst-Course-Curriculum.pdf
Tags: ['financial', 'excel', 'analysis', 'accounting', 'lecture']
Summary: Excel is the world's #1 software for the Office. Learn about the Excel interface – How to perform data entry in Excel. Use Excel's Freeze Panes to Handle Large Datasets. Use the “Tell me what you want to do’ function to find Excel functionalities. Learn how to scroll fast and be much quicker in Excel. Using Named Ranges to Make Formulas More Readable. Using Customsort to Sort Multiple Columns Within a Table. Using Lookup Functions ( Vlookup ) to Fill the Database Sheet. Excel Charts - The Easy Way to Do It – Learn how to create charts in Excel 2016 65. Make Your Excel Charts Look Sexier -Proven Tips --> Intermediate Excel 66. Creating a Bridge Chart in Excel2016 -As Easy as It Gets 67. New Ways to Visualize Your Data -TreemapCharts 68. How to Represent Trends with Sparklines 10 Financial Statement Excel Financial Modeling Accounting Financial Analysis Finance 101 Cap

Your max_length is set to 1024, but you input_length is only 815. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=407)
Your max_length is set to 1024, but you input_length is only 824. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=412)
Your max_length is set to 1024, but you input_length is only 829. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=414)
Your max_length is set to 1024, but you input_length is only 826. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=413)
Your max_length is set to 1024, but you input_length is only 722. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=361)


File: test.pdf
Tags: ['data', 'project', 'nasa', 'stock', 'consumable']
Summary: The primary objective of this data science project is to overalloperationaleffectivenessofthesecriticaltestingfacili- conduct a comprehensive analysis of NASA’s consumable ties. Through datadriven in- sights and predictive analytics , we aim to contribute to the optimization of resource allocation. ConsumablesDataScienceProjectProposal—2/4 science to optimize consumable management. oxygen, helium , and other relevant gases will be a central sourceofinformationforthisproject. Historical pricing data for consumables will be sourced withinNASA’swindtunnelfacilities. Financial reports may requireparsingtoextractrelevantfinan- couldimpacttheavailabilityandpricingofconsumables. onsumablesuppliers. Byassessing marketdatawillundergorigorousdatapreprocessing, encom- thefinancialhealthandmarketperformanceofsuppliers, we passing tasks such as cleaning , validation , and structuring. Time series markettrends will be u