# Document Classifier V2

### This script iterates through files inside a specified folder and outputs the filename, 5 tags to describe the file, and a simple summary of the file contents

In [None]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [1]:
import os
import pdfplumber
import docx
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim import corpora, models
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
import nltk

def clean_text(text):
    """Improve the cleaning process to enhance summary quality."""
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def extract_text(file_path):
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension.lower() in ['.doc', '.docx']:
        return extract_text_from_docx(file_path)
    elif file_extension.lower() == '.txt':
        return extract_text_from_txt(file_path)
    elif file_extension.lower() == '.csv':
        return extract_text_from_csv(file_path)
    return ""

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = " ".join(page.extract_text() or '' for page in pdf.pages)
    return clean_text(full_text)

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    full_text = " ".join(para.text for para in doc.paragraphs if para.text.strip())
    return clean_text(full_text)

def extract_text_from_txt(txt_path):
    with open(txt_path, 'r', encoding='utf-8') as file:
        full_text = file.read()
    return clean_text(full_text)

def extract_text_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    full_text = " ".join(df.astype(str).sum(axis=1))
    return clean_text(full_text)

def preprocess_text(text):
    if not text:
        return []
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(re.sub(r'\W', ' ', text.lower()))
    words = [word for word in words if word not in stop_words and len(word) > 1]
    return words

def perform_topic_modeling(words):
    if not words:
        return []
    num_topics = 2
    num_words = 5
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    if not corpus:
        return []
    lda_model = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    topics = lda_model.print_topics(num_words=num_words)
    return topics

def summarize_text(text):
    """Generates a summary of the text using advanced techniques."""
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = LuhnSummarizer()
    summary_sentences = summarizer(parser.document, 3)
    summary = ' '.join(sentence._text for sentence in summary_sentences)
    return summary

def process_directory(directory_path):
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            extracted_text = extract_text(file_path)
            words = preprocess_text(extracted_text)
            topics = perform_topic_modeling(words)
            tags = [word for _, topic in topics for word in re.findall(r'\"(.*?)\"', topic) if topics]
            summary = summarize_text(extracted_text)
            print(f"File: {filename}")
            if tags:
                print(f"Tags: {tags}")
            else:
                print("No sufficient data to generate tags.")
            print(f"Summary: {summary}\n")

directory_path = "testdocuments"
process_directory(directory_path)


  "class": algorithms.Blowfish,


File: Financial-Analyst-Course-Curriculum.pdf
Tags: ['financial', 'excel', 'analysis', 'accounting', 'lecture', 'financial', 'excel', 'analysis', 'accounting', 'powerpoint']
Summary: Course Introduction 12 Modeling - 3-statement Model (2/3) 23 FSA Analyzing P&G s Financials 2 Excel Quick Introduction 13 Modeling - 3-statement Model (3/3) 24 Fundamentals of Financial Analysis 3 Excel Useful Tools 14 Accounting - Introduction 25 Finance 101 The Time Value of Money 4 Excel Best Practices 15 Accounting The Three Statements 26 Finance 101 Discounting Cash Flows 5 Excel - Functions 16 Accounting Core Principles 27 Finance 101 Loan Calculations 6 Excel Financial Functions Accounting What, When, and How much 28 Capital Budgeting The Theory 17 7 Excel Build a P&L from scratch 18 Accounting Practical Exercise 29 Capital Budgeting Case Study 8 Excel Building charts Accounting Cash Flow Statements 30 PowerPoint Quick Introduction 19 9 Excel Intro to Pivot Tables 20 Accounting The Importance of Tim