# Document Classifier V1

### This script iterates over the files in a specified folder and, for each file, prints the filename along with five tags describing the file's contents.

In [1]:
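# One-time setup: uncomment and run these lines to download the NLTK data
# (English stopword list and 'punkt' tokenizer models) used by preprocess_text.
# NOTE: recent NLTK releases may require 'punkt_tab' instead of 'punkt' -
# verify against the installed version.
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')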

In [2]:
import os
import pdfplumber
import docx
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
import nltk

def extract_text(file_path):
    """Return the text content of *file_path*, chosen by file extension.

    The extension is compared case-insensitively. PDF, DOC/DOCX, TXT and CSV
    files are handed to the matching ``extract_text_from_*`` helper; any
    other extension (or none) yields an empty string.

    NOTE(review): '.doc' is routed to the same python-docx based helper as
    '.docx'; legacy binary .doc files are presumably not readable by that
    library - confirm before relying on .doc support.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)
    if suffix in ('.doc', '.docx'):
        return extract_text_from_docx(file_path)
    if suffix == '.txt':
        return extract_text_from_txt(file_path)
    if suffix == '.csv':
        return extract_text_from_csv(file_path)

    # Unsupported file type: nothing to extract.
    return ""

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, space-joined.

    Pages whose extraction yields a falsy result (e.g. ``None``) contribute
    an empty string. Leading and trailing whitespace is stripped from the
    combined text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or '')
    return " ".join(page_texts).strip()

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraph texts of a Word document, space-joined.

    Only ``doc.paragraphs`` is read; paragraphs that are empty or contain
    only whitespace are skipped. The combined text is stripped of leading
    and trailing whitespace.
    """
    document = docx.Document(docx_path)
    non_blank = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            non_blank.append(content)
    return " ".join(non_blank).strip()

def extract_text_from_txt(txt_path):
    """Return the contents of the text file at *txt_path*, stripped.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are
    replaced with U+FFFD instead of raising ``UnicodeDecodeError``, so a
    single file in another encoding does not abort a whole directory run.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # errors='replace' keeps best-effort text for files that are not UTF-8.
    with open(txt_path, 'r', encoding='utf-8', errors='replace') as file:
        return file.read().strip()

def extract_text_from_csv(csv_path):
    """Return every data cell of the CSV at *csv_path* as one string.

    Cells are converted to ``str`` and joined with single spaces in
    row-major order (all cells of the first row, then the second, ...).
    Column names are not included; only the DataFrame's values are used.

    A completely empty file yields an empty string instead of raising
    ``pandas.errors.EmptyDataError``.
    """
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # No header and no rows: there is no text to extract.
        return ""
    # Join cell-by-cell with a separator. Summing the string columns would
    # glue neighbouring cells together ("foo" + "bar" -> "foobar") and
    # produce tokens that never appeared in the file.
    cells = df.astype(str).to_numpy().ravel()
    return " ".join(cells).strip()

def preprocess_text(text):
    """Turn raw *text* into a list of lowercase content tokens.

    The text is lowercased, every non-word character is replaced by a
    space, and the result is tokenized with NLTK. English stopwords and
    single-character tokens are discarded. Falsy input (``""`` or ``None``)
    returns an empty list.
    """
    if not text:
        return []

    english_stopwords = set(stopwords.words('english'))
    normalized = re.sub(r'\W', ' ', text.lower())

    kept = []
    for token in word_tokenize(normalized):
        if len(token) <= 1 or token in english_stopwords:
            continue
        kept.append(token)
    return kept

def perform_topic_modeling(words, num_topics=1, num_words=5, random_state=None):
    """Fit an LDA model on a single tokenized document and return its topics.

    Args:
        words: list of tokens for one document. Falsy input returns ``[]``.
        num_topics: number of LDA topics to fit (default 1).
        num_words: number of top words reported per topic (default 5).
        random_state: optional seed forwarded to the LDA model; pass an int
            to make results repeatable. ``None`` keeps gensim's default.

    Returns:
        The list produced by ``LdaModel.print_topics``: one
        ``(topic_id, description)`` pair per topic, where the description
        is a string with each top word wrapped in double quotes.
    """
    if not words:
        return []

    # The whole document is the only entry in the corpus, so a non-empty
    # `words` always gives a non-empty dictionary and bag-of-words; no
    # further emptiness check is needed.
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]

    lda_model = models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )
    return lda_model.print_topics(num_topics=num_topics, num_words=num_words)

def process_directory(directory_path):
    """Print a line of tags for every regular file in *directory_path*.

    Files are visited in sorted name order so the output is stable between
    runs. Subdirectories are skipped (no recursion). For each file the text
    is extracted, tokenized and topic-modelled; the quoted words of the
    resulting topic descriptions become the file's tags.

    A file that fails to process is reported on its own line and the loop
    continues, so one corrupt or unreadable document does not abort the
    whole batch.

    Raises:
        OSError: if *directory_path* itself cannot be listed.
    """
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue

        try:
            extracted_text = extract_text(file_path)
            words = preprocess_text(extracted_text)
            topics = perform_topic_modeling(words)
        except Exception as exc:
            # Per-file boundary: report the failure and move on to the next file.
            print(f"File: {filename}, Failed to process: {exc}")
            continue

        # Each topic description quotes its top words, e.g. 0.05*"word";
        # pull out just the quoted words.
        tags = [
            word
            for _, topic in topics
            for word in re.findall(r'\"(.*?)\"', topic)
        ]
        if tags:
            print(f"File: {filename}, Tags: {tags}")
        else:
            print(f"File: {filename}, No sufficient data to generate tags.")

# Folder to scan; a relative path, so it is resolved against the current
# working directory.
directory_path = "testdocuments"
# Runs as soon as this cell/script executes and prints one line per regular file.
process_directory(directory_path)


  "class": algorithms.Blowfish,


File: Financial-Analyst-Course-Curriculum.pdf, Tags: ['financial', 'excel', 'analysis', 'accounting', 'lecture']
File: test.pdf, Tags: ['data', 'project', 'nasa', 'consumable', 'stock']
