#### ADVANCED SEMANTIC CHUNKING - BASIC WITH GENSIM (LDA) 
Metode ini hanya menggunakan chunking berdasarkan topik dengan Gensim (LDA - Latent Dirichlet Allocation). Gensim menggunakan model berbasis bag-of-words seperti LDA untuk mengekstrak topik dari teks.

Cara Kerja:
- Memetakan dokumen ke dalam ruang vektor berdasarkan frekuensi kata.
- Menggunakan probabilitas untuk menemukan distribusi kata dalam berbagai topik.

Kelebihan:
- Cocok untuk analisis topik berbasis statistik.
- Tidak memerlukan model berbasis pembelajaran mendalam.

Kekurangan:
- Tidak mempertimbangkan konteks urutan kata dalam kalimat.
- Tidak menghasilkan representasi teks yang dapat digunakan untuk perbandingan semantik.

In [4]:
import os
import spacy
import PyPDF2
import docx
import gensim
from gensim import corpora

# Shared spaCy English pipeline: supplies the sentence boundaries, named
# entities and lemmas used by the functions in this cell.
nlp = spacy.load("en_core_web_sm")

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_pdf(file_path):
    """Return the extracted text of every PDF page, each followed by a newline."""
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph texts of a .docx file joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_topics(text, num_topics=5):
    """Fit an LDA model on ``text`` and return its topics.

    The text is lemmatised with spaCy, keeping only alphabetic tokens that are
    not stop words, and treated as a single bag-of-words document.

    Args:
        text: Raw text to analyse.
        num_topics: Number of LDA topics to fit.

    Returns:
        The list produced by ``LdaModel.print_topics()``, or an empty list
        when the text contains no usable words.
    """
    words = [token.lemma_ for token in nlp(text) if token.is_alpha and not token.is_stop]
    if not words:
        # LdaModel refuses to train on an empty vocabulary, so report
        # "no topics" instead of crashing on blank or symbol-only input.
        return []
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    return lda_model.print_topics()

def advanced_chunk_text(text, chunk_size=700):
    """Group consecutive sentences into chunks by size and shared entities.

    A sentence joins the current chunk when the combined length stays below
    ``chunk_size`` and either the chunk has no named entities yet or the
    sentence shares at least one entity text with it. Otherwise the current
    chunk is closed and the sentence starts a new one.

    Args:
        text: Raw text to split.
        chunk_size: Character budget for a chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = ""
    current_entities = set()

    for sent in doc.sents:
        entities = {ent.text for ent in sent.ents}
        fits = len(current_chunk) + len(sent.text) < chunk_size
        related = not current_entities or bool(entities & current_entities)
        if fits and related:
            current_chunk += " " + sent.text
            current_entities.update(entities)
            continue

        # Flush only real content: when the very first sentence already
        # exceeds the budget the running chunk is still empty.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sent.text
        current_entities = entities

    last = current_chunk.strip()
    if last:
        chunks.append(last)

    return chunks

def process_document(file_path, output_folder):
    """Chunk one document, extract its topics, and write both to a text file.

    Files whose extension is not ``txt``, ``pdf`` or ``docx``
    (case-insensitive) are skipped without producing output.

    Args:
        file_path: Path of the document to process.
        output_folder: Directory that receives ``<name>_chunks.txt``.
    """
    ext = file_path.split(".")[-1].lower()

    if ext == "txt":
        text = read_txt(file_path)
    elif ext == "pdf":
        text = read_pdf(file_path)
    elif ext == "docx":
        # Exact match: a substring test (`ext in "docx"`) would also accept
        # "doc", "ocx" or an empty extension and pass them to read_docx.
        text = read_docx(file_path)
    else:
        return

    chunks = advanced_chunk_text(text)
    topics = extract_topics(text)

    # splitext keeps interior dots, so "a.v1.pdf" and "a.v2.pdf" do not both
    # collapse to "a_chunks.txt".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, stem + "_chunks.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        for chunk_no, chunk in enumerate(chunks, start=1):
            f.write(f"--- Chunk {chunk_no} ---\n{chunk}\n\n")

        # Topics get their own counter: reusing the chunk index labelled every
        # topic with the last chunk number and failed when there were no chunks.
        for topic_no, topic in enumerate(topics, start=1):
            f.write(f"--- Extracted Topics {topic_no} ---\n{topic}\n\n")

    print(f"Processed: {file_path} -> {output_file}")

def main():
    """Process every regular file in ``data`` into ``output-basic-gensim``."""
    input_folder = "data"
    output_folder = "output-basic-gensim"
    os.makedirs(output_folder, exist_ok=True)

    candidates = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
    for file_path in candidates:
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(file_path):
            continue
        process_document(file_path, output_folder)


if __name__ == "__main__":
    main()


Processed: data\dokumen_docx.docx -> output-basic-gensim\dokumen_docx_chunks.txt
Processed: data\dokumen_pdf.pdf -> output-basic-gensim\dokumen_pdf_chunks.txt
Processed: data\dokumen_txt.txt -> output-basic-gensim\dokumen_txt_chunks.txt


#### ADVANCED SEMANTIC CHUNKING - BASIC WITH SENTENCETRANSFORMER (BERT)
Metode ini hanya menggunakan chunking berdasarkan topik dengan SentenceTransformer (BERT-based Embeddings). SentenceTransformer menggunakan model berbasis Transformer (BERT, RoBERTa, dll.) untuk menghasilkan embedding kalimat yang lebih kontekstual.

Cara Kerja:
- Mengubah kalimat menjadi vektor berdimensi tinggi menggunakan model deep learning.
- Mempertimbangkan makna keseluruhan teks dalam konteksnya.

Kelebihan:
- Menghasilkan embedding yang lebih kaya dan kontekstual.
- Bisa digunakan untuk perbandingan kemiripan antar-kalimat.

Kekurangan:
- Memerlukan lebih banyak daya komputasi dibandingkan dengan Gensim LDA.
- Model lebih besar dan memerlukan dependensi tambahan seperti PyTorch atau TensorFlow.

In [7]:
import os
import spacy
import PyPDF2
import docx
from sentence_transformers import SentenceTransformer

# Shared models for this cell: spaCy supplies sentence boundaries and named
# entities; the SentenceTransformer encodes sentences into embeddings.
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_pdf(file_path):
    """Return the extracted text of every PDF page, each followed by a newline."""
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph texts of a .docx file joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_topics(text, num_topics=5):
    """Return one SentenceTransformer embedding per spaCy sentence of ``text``.

    ``num_topics`` is accepted for signature parity but is not used here.
    """
    parsed = nlp(text)
    sentence_texts = [span.text for span in parsed.sents]
    return model.encode(sentence_texts)

def advanced_chunk_text(text, chunk_size=700):
    """Group consecutive sentences into chunks by size and shared entities.

    A sentence joins the current chunk when the combined length stays below
    ``chunk_size`` and either the chunk has no named entities yet or the
    sentence shares at least one entity text with it. Otherwise the current
    chunk is closed and the sentence starts a new one.

    Args:
        text: Raw text to split.
        chunk_size: Character budget for a chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = ""
    current_entities = set()

    for sent in doc.sents:
        entities = {ent.text for ent in sent.ents}
        fits = len(current_chunk) + len(sent.text) < chunk_size
        related = not current_entities or bool(entities & current_entities)
        if fits and related:
            current_chunk += " " + sent.text
            current_entities.update(entities)
            continue

        # Flush only real content: when the very first sentence already
        # exceeds the budget the running chunk is still empty.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sent.text
        current_entities = entities

    last = current_chunk.strip()
    if last:
        chunks.append(last)

    return chunks

def process_document(file_path, output_folder):
    """Chunk one document and write its chunks and sentence embeddings to a file.

    Files whose extension is not ``txt``, ``pdf`` or ``docx``
    (case-insensitive) are skipped without producing output.

    Args:
        file_path: Path of the document to process.
        output_folder: Directory that receives ``<name>_chunks.txt``.
    """
    ext = file_path.split(".")[-1].lower()

    if ext == "txt":
        text = read_txt(file_path)
    elif ext == "pdf":
        text = read_pdf(file_path)
    elif ext == "docx":
        text = read_docx(file_path)
    else:
        return

    chunks = advanced_chunk_text(text)
    topics = extract_topics(text)

    # splitext keeps interior dots, so "a.v1.pdf" and "a.v2.pdf" do not both
    # collapse to "a_chunks.txt".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, stem + "_chunks.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        for chunk_no, chunk in enumerate(chunks, start=1):
            f.write(f"--- Chunk {chunk_no} ---\n{chunk}\n\n")

        # Separate counter: reusing the chunk index labelled every entry with
        # the last chunk number and failed when there were no chunks.
        for topic_no, topic in enumerate(topics, start=1):
            f.write(f"--- Extracted Topics {topic_no} ---\n{topic}\n\n")

    print(f"Processed: {file_path} -> {output_file}")

def main():
    """Process every regular file in ``data`` into ``output-basic-sentence-transformers``."""
    input_folder = "data"
    output_folder = "output-basic-sentence-transformers"
    os.makedirs(output_folder, exist_ok=True)

    candidates = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
    for file_path in candidates:
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(file_path):
            continue
        process_document(file_path, output_folder)


if __name__ == "__main__":
    main()

Processed: data\dokumen_docx.docx -> output-basic-sentence-transformers\dokumen_docx_chunks.txt
Processed: data\dokumen_pdf.pdf -> output-basic-sentence-transformers\dokumen_pdf_chunks.txt
Processed: data\dokumen_txt.txt -> output-basic-sentence-transformers\dokumen_txt_chunks.txt


#### ADVANCED SEMANTIC CHUNKING - NODE ONLY

Metode ini hanya menggunakan teknik chunking berbasis graf tanpa pemeringkatan. Model ini membentuk node berdasarkan kemiripan semantik antar kalimat menggunakan SentenceTransformer dan graph-based clustering.

Cara Kerja:
- Teks dipecah menjadi kalimat.
- Kalimat direpresentasikan sebagai vektor embedding menggunakan SentenceTransformer.
- Graf dibentuk berdasarkan cosine similarity antar kalimat.
- Komunitas dalam graf ditentukan menggunakan greedy modularity optimization, yang menghasilkan chunk berbasis hubungan semantik.

Kelebihan:
- Memanfaatkan hubungan semantik antar kalimat.
- Cocok untuk dokumen dengan struktur naratif yang kuat.

Kekurangan:
- Tidak ada prioritas dalam hasil chunking.
- Semua chunk dianggap memiliki bobot yang sama dalam informasi.

In [8]:
import os
import spacy
import docx
import PyPDF2
import networkx as nx
import gensim
from gensim import corpora
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Shared models for this cell: spaCy supplies sentence boundaries and lemmas;
# the SentenceTransformer encodes sentences for the similarity graph.
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_pdf(file_path):
    """Return the extracted text of every PDF page, each followed by a newline."""
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph texts of a .docx file joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def _pack_sentences(sentences, chunk_size):
    """Join sentences into chunks of at most ``chunk_size`` characters."""
    packed = []
    current = ""
    for sentence in sentences:
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= chunk_size:
            current = candidate
            continue
        if current:
            packed.append(current)
        # A sentence longer than the limit has no boundary to split on, so it
        # falls back to fixed-width slices; a short one yields a single piece.
        pieces = [sentence[k:k + chunk_size] for k in range(0, len(sentence), chunk_size)]
        packed.extend(pieces[:-1])
        current = pieces[-1]
    if current:
        packed.append(current)
    return packed


def graph_based_chunking(text, chunk_size=700):
    """Chunk ``text`` by clustering its sentences on embedding similarity.

    Sentences are embedded with ``model``, linked in a graph weighted by
    pairwise cosine similarity, and grouped by greedy modularity
    maximisation. Each community becomes a chunk with its sentences in
    document order; a community longer than ``chunk_size`` characters is
    split at sentence boundaries.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings, empty when ``text`` has no sentences.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    if not sentences:
        return []

    count = len(sentences)
    if count == 1:
        # A lone sentence has no pair to compare; keep it as its own
        # community instead of losing it to an empty graph.
        partitions = [{0}]
    else:
        sentence_embeddings = model.encode(sentences)
        similarity_matrix = cosine_similarity(sentence_embeddings)
        G = nx.Graph()
        # Register every sentence up front so none disappears if it ends up
        # without edges.
        G.add_nodes_from(range(count))
        for i in range(count):
            for j in range(i + 1, count):
                similarity = float(similarity_matrix[i, j])
                # Modularity is defined for non-negative weights, so
                # dissimilar pairs are simply left unconnected.
                if similarity > 0:
                    G.add_edge(i, j, weight=similarity)

        if G.number_of_edges() == 0:
            partitions = [{i} for i in range(count)]
        else:
            # The weight attribute must be named explicitly; by default the
            # algorithm treats every edge as weight 1 and ignores similarity.
            partitions = nx.community.greedy_modularity_communities(G, weight="weight")

    chunks = []
    for community in partitions:
        ordered = [sentences[i] for i in sorted(community)]
        chunks.extend(_pack_sentences(ordered, chunk_size))

    return chunks

def extract_topics(text, num_topics=5):
    """Fit an LDA model on ``text`` and return its topics.

    The text is lemmatised with spaCy, keeping only alphabetic tokens that are
    not stop words, and treated as a single bag-of-words document.

    Args:
        text: Raw text to analyse.
        num_topics: Number of LDA topics to fit.

    Returns:
        The list produced by ``LdaModel.print_topics()``, or an empty list
        when the text contains no usable words.
    """
    words = [token.lemma_ for token in nlp(text) if token.is_alpha and not token.is_stop]
    if not words:
        # LdaModel refuses to train on an empty vocabulary, so report
        # "no topics" instead of crashing on blank or symbol-only input.
        return []
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    return lda_model.print_topics()

def process_document(file_path, output_folder):
    """Chunk one document, extract its topics, and write both to a text file.

    Files whose extension is not ``txt``, ``pdf`` or ``docx``
    (case-insensitive) are skipped without producing output.

    Args:
        file_path: Path of the document to process.
        output_folder: Directory that receives ``<name>_chunks.txt``.
    """
    ext = file_path.split(".")[-1].lower()

    if ext == "txt":
        text = read_txt(file_path)
    elif ext == "pdf":
        text = read_pdf(file_path)
    elif ext == "docx":
        text = read_docx(file_path)
    else:
        return

    chunks = graph_based_chunking(text)
    topics = extract_topics(text)

    # splitext keeps interior dots, so "a.v1.pdf" and "a.v2.pdf" do not both
    # collapse to "a_chunks.txt".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, stem + "_chunks.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        for chunk_no, chunk in enumerate(chunks, start=1):
            f.write(f"--- Chunk {chunk_no} ---\n{chunk}\n\n")

        # Topics get their own counter: reusing the chunk index labelled every
        # topic with the last chunk number and failed when there were no chunks.
        for topic_no, topic in enumerate(topics, start=1):
            f.write(f"--- Extracted Topics {topic_no} ---\n{topic}\n\n")

    print(f"Processed: {file_path} -> {output_file}")

def main():
    """Process every regular file in ``data`` into ``output-graph-node-only``."""
    input_folder = "data"
    output_folder = "output-graph-node-only"
    os.makedirs(output_folder, exist_ok=True)

    candidates = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
    for file_path in candidates:
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(file_path):
            continue
        process_document(file_path, output_folder)


if __name__ == "__main__":
    main()

Processed: data\dokumen_docx.docx -> output-graph-node-only\dokumen_docx_chunks.txt
Processed: data\dokumen_pdf.pdf -> output-graph-node-only\dokumen_pdf_chunks.txt
Processed: data\dokumen_txt.txt -> output-graph-node-only\dokumen_txt_chunks.txt


#### ADVANCED SEMANTIC CHUNKING - NODE WITH CHUNK RANK

Metode ini menggunakan chunking berbasis graf, dengan tambahan pemeringkatan chunk berdasarkan jumlah kata atau kepadatan informasi.

Cara Kerja:
- Proses chunking sama seperti metode "Node Only".
- Setelah chunk terbentuk, setiap chunk diberi skor berdasarkan jumlah kata.
- Chunk dengan informasi lebih padat diberikan peringkat lebih tinggi.

Kelebihan:
- Memungkinkan ekstraksi chunk yang lebih informatif.
- Cocok untuk proses summarization berbasis chunk.

Kekurangan:
- Ranking berdasarkan jumlah kata belum tentu mencerminkan makna semantik yang lebih penting.
- Bisa menyebabkan perubahan urutan asli dokumen jika digunakan untuk reordering.

In [9]:
import os
import spacy
import docx
import PyPDF2
import networkx as nx
import numpy as np
import gensim
from gensim import corpora
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Shared models for this cell: spaCy supplies sentence boundaries and lemmas;
# the SentenceTransformer encodes sentences for the similarity graph.
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_pdf(file_path):
    """Return the extracted text of every PDF page, each followed by a newline."""
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph texts of a .docx file joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def _pack_sentences(sentences, chunk_size):
    """Join sentences into chunks of at most ``chunk_size`` characters."""
    packed = []
    current = ""
    for sentence in sentences:
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= chunk_size:
            current = candidate
            continue
        if current:
            packed.append(current)
        # A sentence longer than the limit has no boundary to split on, so it
        # falls back to fixed-width slices; a short one yields a single piece.
        pieces = [sentence[k:k + chunk_size] for k in range(0, len(sentence), chunk_size)]
        packed.extend(pieces[:-1])
        current = pieces[-1]
    if current:
        packed.append(current)
    return packed


def graph_based_chunking(text, chunk_size=700):
    """Chunk ``text`` by clustering its sentences on embedding similarity.

    Sentences are embedded with ``model``, linked in a graph weighted by
    pairwise cosine similarity, and grouped by greedy modularity
    maximisation. Each community becomes a chunk with its sentences in
    document order; a community longer than ``chunk_size`` characters is
    split at sentence boundaries.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings, empty when ``text`` has no sentences.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    if not sentences:
        return []

    count = len(sentences)
    if count == 1:
        # A lone sentence has no pair to compare; keep it as its own
        # community instead of losing it to an empty graph.
        partitions = [{0}]
    else:
        sentence_embeddings = model.encode(sentences)
        similarity_matrix = cosine_similarity(sentence_embeddings)
        G = nx.Graph()
        # Register every sentence up front so none disappears if it ends up
        # without edges.
        G.add_nodes_from(range(count))
        for i in range(count):
            for j in range(i + 1, count):
                similarity = float(similarity_matrix[i, j])
                # Modularity is defined for non-negative weights, so
                # dissimilar pairs are simply left unconnected.
                if similarity > 0:
                    G.add_edge(i, j, weight=similarity)

        if G.number_of_edges() == 0:
            partitions = [{i} for i in range(count)]
        else:
            # The weight attribute must be named explicitly; by default the
            # algorithm treats every edge as weight 1 and ignores similarity.
            partitions = nx.community.greedy_modularity_communities(G, weight="weight")

    chunks = []
    for community in partitions:
        ordered = [sentences[i] for i in sorted(community)]
        chunks.extend(_pack_sentences(ordered, chunk_size))

    return chunks

def extract_topics(text, num_topics=5):
    """Fit an LDA model on ``text`` and return its topics.

    The text is lemmatised with spaCy, keeping only alphabetic tokens that are
    not stop words, and treated as a single bag-of-words document.

    Args:
        text: Raw text to analyse.
        num_topics: Number of LDA topics to fit.

    Returns:
        The list produced by ``LdaModel.print_topics()``, or an empty list
        when the text contains no usable words.
    """
    words = [token.lemma_ for token in nlp(text) if token.is_alpha and not token.is_stop]
    if not words:
        # LdaModel refuses to train on an empty vocabulary, so report
        # "no topics" instead of crashing on blank or symbol-only input.
        return []
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    return lda_model.print_topics()

def rank_chunks(chunks):
    """Pair each chunk with its word count and order the pairs largest first.

    Words are whitespace-delimited. Returns a list of ``(word_count, chunk)``
    tuples; chunks with equal counts keep their input order.
    """
    scored = [(len(text.split()), text) for text in chunks]
    # Stable ascending sort on the negated count gives descending order
    # without disturbing ties.
    scored.sort(key=lambda pair: -pair[0])
    return scored

def process_document(file_path, output_folder):
    """Chunk one document, rank the chunks, and write them with the topics.

    Chunks are written in ``rank_chunks`` order together with their score,
    followed by the topics extracted from the whole document. Files whose
    extension is not ``txt``, ``pdf`` or ``docx`` (case-insensitive) are
    skipped without producing output.

    Args:
        file_path: Path of the document to process.
        output_folder: Directory that receives ``<name>_chunks.txt``.
    """
    ext = file_path.split(".")[-1].lower()

    if ext == "txt":
        text = read_txt(file_path)
    elif ext == "pdf":
        text = read_pdf(file_path)
    elif ext == "docx":
        text = read_docx(file_path)
    else:
        return

    chunks = graph_based_chunking(text)
    topics = extract_topics(text)
    ranked_chunks = rank_chunks(chunks)

    # splitext keeps interior dots, so "a.v1.pdf" and "a.v2.pdf" do not both
    # collapse to "a_chunks.txt".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, stem + "_chunks.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        # No per-chunk topic extraction here: it used to overwrite the
        # document-level topics and train one discarded model per chunk.
        for chunk_no, (score, chunk) in enumerate(ranked_chunks, start=1):
            f.write(f"--- Chunk {chunk_no} (Score: {score}) ---\n{chunk}\n\n")

        # Topics get their own counter instead of the stale chunk index.
        for topic_no, topic in enumerate(topics, start=1):
            f.write(f"--- Extracted Topics {topic_no} ---\n{topic}\n\n")

    print(f"Processed: {file_path} -> {output_file}")

def main():
    """Process every regular file in ``data`` into ``output-graph-chunk-rank``."""
    input_folder = "data"
    output_folder = "output-graph-chunk-rank"
    os.makedirs(output_folder, exist_ok=True)

    candidates = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
    for file_path in candidates:
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(file_path):
            continue
        process_document(file_path, output_folder)


if __name__ == "__main__":
    main()

Processed: data\dokumen_docx.docx -> output-graph-chunk-rank\dokumen_docx_chunks.txt
Processed: data\dokumen_pdf.pdf -> output-graph-chunk-rank\dokumen_pdf_chunks.txt
Processed: data\dokumen_txt.txt -> output-graph-chunk-rank\dokumen_txt_chunks.txt


#### ADVANCED SEMANTIC CHUNKING - NODE WITH TOPIC RANK

Metode ini menggunakan chunking berbasis graf dan menambahkan pemeringkatan berbasis topik menggunakan LDA (Latent Dirichlet Allocation).

Cara Kerja:
- Setelah chunk terbentuk, topik utama diekstrak dari setiap chunk menggunakan Gensim LDA.
- Topik terbaik dipilih berdasarkan jumlah probabilitas kata-kata teratasnya yang paling tinggi di antara topik hasil LDA pada chunk tersebut.

Kelebihan:
- Memungkinkan identifikasi topik utama dalam setiap chunk.
- Berguna untuk analisis topik otomatis dari dokumen panjang.

Kekurangan:
- LDA berbasis bag-of-words, sehingga tidak mempertimbangkan urutan kata.
- Hasil topik bisa kurang akurat jika jumlah topik tidak ditentukan dengan baik.

In [10]:
import os
import spacy
import docx
import PyPDF2
import networkx as nx
import numpy as np
import gensim
from gensim import corpora
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Shared models for this cell: spaCy supplies sentence boundaries and lemmas;
# the SentenceTransformer encodes sentences for the similarity graph.
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_pdf(file_path):
    """Return the extracted text of every PDF page, each followed by a newline."""
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph texts of a .docx file joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def _pack_sentences(sentences, chunk_size):
    """Join sentences into chunks of at most ``chunk_size`` characters."""
    packed = []
    current = ""
    for sentence in sentences:
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= chunk_size:
            current = candidate
            continue
        if current:
            packed.append(current)
        # A sentence longer than the limit has no boundary to split on, so it
        # falls back to fixed-width slices; a short one yields a single piece.
        pieces = [sentence[k:k + chunk_size] for k in range(0, len(sentence), chunk_size)]
        packed.extend(pieces[:-1])
        current = pieces[-1]
    if current:
        packed.append(current)
    return packed


def graph_based_chunking(text, chunk_size=700):
    """Chunk ``text`` by clustering its sentences on embedding similarity.

    Sentences are embedded with ``model``, linked in a graph weighted by
    pairwise cosine similarity, and grouped by greedy modularity
    maximisation. Each community becomes a chunk with its sentences in
    document order; a community longer than ``chunk_size`` characters is
    split at sentence boundaries.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings, empty when ``text`` has no sentences.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    if not sentences:
        return []

    count = len(sentences)
    if count == 1:
        # A lone sentence has no pair to compare; keep it as its own
        # community instead of losing it to an empty graph.
        partitions = [{0}]
    else:
        sentence_embeddings = model.encode(sentences)
        similarity_matrix = cosine_similarity(sentence_embeddings)
        G = nx.Graph()
        # Register every sentence up front so none disappears if it ends up
        # without edges.
        G.add_nodes_from(range(count))
        for i in range(count):
            for j in range(i + 1, count):
                similarity = float(similarity_matrix[i, j])
                # Modularity is defined for non-negative weights, so
                # dissimilar pairs are simply left unconnected.
                if similarity > 0:
                    G.add_edge(i, j, weight=similarity)

        if G.number_of_edges() == 0:
            partitions = [{i} for i in range(count)]
        else:
            # The weight attribute must be named explicitly; by default the
            # algorithm treats every edge as weight 1 and ignores similarity.
            partitions = nx.community.greedy_modularity_communities(G, weight="weight")

    chunks = []
    for community in partitions:
        ordered = [sentences[i] for i in sorted(community)]
        chunks.extend(_pack_sentences(ordered, chunk_size))

    return chunks

def extract_topics(text, num_topics=5):
    """Fit an LDA model on ``text`` and return its highest-scoring topic.

    The text is lemmatised with spaCy, keeping only alphabetic tokens that are
    not stop words, and treated as a single bag-of-words document. Topics are
    scored by the summed probability of the words ``show_topics`` lists for
    them.

    Args:
        text: Raw text to analyse.
        num_topics: Number of LDA topics to fit.

    Returns:
        A ``(topic_id, [(word, probability), ...])`` tuple, or ``None`` when
        the text contains no usable words or the model reports no topics.
    """
    words = [token.lemma_ for token in nlp(text) if token.is_alpha and not token.is_stop]
    if not words:
        # LdaModel refuses to train on an empty vocabulary; callers already
        # treat None as "no topics found".
        return None
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    topics = lda_model.show_topics(formatted=False)

    # max() returns the first topic among ties, like taking element 0 of a
    # stable descending sort.
    return max(topics, key=lambda topic: sum(prob for _, prob in topic[1]), default=None)

def process_document(file_path, output_folder):
    """Chunk one document and write each chunk with its best LDA topic.

    Files whose extension is not ``txt``, ``pdf`` or ``docx``
    (case-insensitive) are skipped without producing output.

    Args:
        file_path: Path of the document to process.
        output_folder: Directory that receives ``<name>_chunks.txt``.
    """
    ext = file_path.split(".")[-1].lower()

    if ext == "txt":
        text = read_txt(file_path)
    elif ext == "pdf":
        text = read_pdf(file_path)
    elif ext == "docx":
        text = read_docx(file_path)
    else:
        return

    chunks = graph_based_chunking(text)

    # splitext keeps interior dots, so "a.v1.pdf" and "a.v2.pdf" do not both
    # collapse to "a_chunks.txt".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, stem + "_chunks.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        for chunk_no, chunk in enumerate(chunks, start=1):
            best_topic = extract_topics(chunk)
            if best_topic:
                topic_words = [word for word, _ in best_topic[1]]
                topic_str = f"Topic {best_topic[0]}: {topic_words}"
            else:
                topic_str = "No topics found"
            f.write(f"--- Chunk {chunk_no} ---\n{chunk}\nBest Topic: {topic_str}\n\n")

    print(f"Processed: {file_path} -> {output_file}")

def main():
    """Process every regular file in ``data`` into ``output-graph-topic-rank``."""
    input_folder = "data"
    output_folder = "output-graph-topic-rank"
    os.makedirs(output_folder, exist_ok=True)

    candidates = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
    for file_path in candidates:
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(file_path):
            continue
        process_document(file_path, output_folder)


if __name__ == "__main__":
    main()


Processed: data\dokumen_docx.docx -> output-graph-topic-rank\dokumen_docx_chunks.txt
Processed: data\dokumen_pdf.pdf -> output-graph-topic-rank\dokumen_pdf_chunks.txt
Processed: data\dokumen_txt.txt -> output-graph-topic-rank\dokumen_txt_chunks.txt


#### ADVANCED SEMANTIC CHUNKING - LLAMA WITH EMBEDDING RANK

Metode ini menggunakan pemrosesan bahasa alami (NLP) berbasis spaCy untuk chunking teks, serta peringkat berbasis embedding menggunakan model `HuggingFaceEmbeddings`. Model LDA (Latent Dirichlet Allocation) digunakan untuk mengekstrak topik utama dari setiap chunk.

Cara Kerja:
- Dokumen dipecah menjadi chunk menggunakan model NLP spaCy.
- Embedding setiap chunk dihitung menggunakan `HuggingFaceEmbeddings`.
- Chunk diberi peringkat berdasarkan norma embedding menggunakan `numpy.linalg.norm`.
- Topik utama dari setiap chunk diekstrak menggunakan Gensim LDA.
- Topik terbaik dipilih berdasarkan jumlah probabilitas kata-kata teratasnya yang paling tinggi di antara topik hasil LDA pada chunk tersebut.

Kelebihan:
- Menggunakan embedding untuk peringkat yang lebih akurat dibanding metode berbasis kata.
- Mampu mengidentifikasi topik utama dalam setiap chunk dengan metode LDA.
- Dapat diterapkan pada berbagai jenis dokumen (TXT, PDF, DOCX).

Kekurangan:
- LDA masih berbasis bag-of-words, sehingga tidak mempertimbangkan urutan kata.
- Kualitas chunk tergantung pada model NLP yang digunakan.
- Hasil topik bisa kurang akurat jika jumlah topik tidak ditentukan dengan baik.

In [11]:
import os
import spacy
import docx
import PyPDF2
import numpy as np
import gensim
from gensim import corpora
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama.llms import OllamaLLM

# Shared models for this cell: spaCy supplies sentence boundaries and lemmas;
# the HuggingFace embedding model scores chunks in rank_chunks.
nlp = spacy.load("en_core_web_sm")
OLLAMA_MODEL = "llama3.2"
# NOTE(review): `llm` is not referenced by any function in this cell —
# confirm whether it is used elsewhere or is a leftover.
llm = OllamaLLM(model=OLLAMA_MODEL)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    return contents

def read_pdf(file_path):
    """Return the extracted text of every PDF page, each followed by a newline."""
    with open(file_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pages]
    return "".join(page_texts)

def read_docx(file_path):
    """Return the paragraph texts of a .docx file joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def semantic_chunking(text, chunk_size=700):
    """Pack consecutive spaCy sentences into chunks below ``chunk_size``.

    Sentences are stripped, blank ones are dropped, and each sentence is
    appended to the running chunk while the combined length stays below
    ``chunk_size``; otherwise the running chunk is closed and the sentence
    starts a new one.

    Args:
        text: Raw text to split.
        chunk_size: Character budget for a chunk.

    Returns:
        A list of non-empty chunk strings, empty when ``text`` has no
        sentences.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    if not sentences:
        return []

    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) < chunk_size:
            chunk += " " + sentence
            continue
        # Skip the flush while nothing has been collected yet (first sentence
        # already over budget); otherwise an empty chunk would be emitted.
        if chunk:
            chunks.append(chunk.strip())
        chunk = sentence
    if chunk:
        chunks.append(chunk.strip())

    return chunks

def extract_topics(text, num_topics=5):
    """Fit an LDA model on ``text`` and return its highest-scoring topic.

    The text is lemmatised with spaCy, keeping only alphabetic tokens that are
    not stop words, and treated as a single bag-of-words document. Topics are
    scored by the summed probability of the words ``show_topics`` lists for
    them.

    Args:
        text: Raw text to analyse.
        num_topics: Number of LDA topics to fit.

    Returns:
        A ``(topic_id, [(word, probability), ...])`` tuple, or ``None`` when
        the text contains no usable words or the model reports no topics.
    """
    words = [token.lemma_ for token in nlp(text) if token.is_alpha and not token.is_stop]
    if not words:
        # LDA cannot be trained on an empty vocabulary; callers already treat
        # None as "no topics found".
        return None
    dictionary = corpora.Dictionary([words])
    corpus = [dictionary.doc2bow(words)]
    lda_model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=4)
    topics = lda_model.show_topics(formatted=False)

    # max() returns the first topic among ties, like taking element 0 of a
    # stable descending sort.
    return max(topics, key=lambda topic: sum(prob for _, prob in topic[1]), default=None)

def rank_chunks(chunks):
    """Score each chunk by the norm of its embedding, highest first.

    Returns a list of ``(chunk, score)`` tuples; chunks with equal scores
    keep their input order.
    """
    # NOTE(review): if the embedding model L2-normalises its output, every
    # norm is ~1.0 and this ordering carries no signal — confirm for the
    # configured model.
    vectors = list(map(embedding_model.embed_query, chunks))
    scored = [(chunk, np.linalg.norm(vector)) for chunk, vector in zip(chunks, vectors)]
    scored.sort(key=lambda pair: -pair[1])
    return scored

def process_document(file_path, output_folder):
    """Chunk one document, rank the chunks, and write each with its best topic.

    Chunks are written in ``rank_chunks`` order with their score. Files whose
    extension is not ``txt``, ``pdf`` or ``docx`` (case-insensitive) are
    skipped without producing output.

    Args:
        file_path: Path of the document to process.
        output_folder: Directory that receives ``<name>_chunks.txt``.
    """
    ext = file_path.split(".")[-1].lower()

    if ext == "txt":
        text = read_txt(file_path)
    elif ext == "pdf":
        text = read_pdf(file_path)
    elif ext == "docx":
        text = read_docx(file_path)
    else:
        return

    chunks = semantic_chunking(text)
    ranked_chunks = rank_chunks(chunks)

    # splitext keeps interior dots, so "a.v1.pdf" and "a.v2.pdf" do not both
    # collapse to "a_chunks.txt".
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(output_folder, stem + "_chunks.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        for chunk_no, (chunk, score) in enumerate(ranked_chunks, start=1):
            best_topic = extract_topics(chunk)
            if best_topic:
                topic_words = [word for word, _ in best_topic[1]]
                topic_str = f"Topic {best_topic[0]}: {topic_words}"
            else:
                topic_str = "No topics found"
            f.write(f"--- Chunk {chunk_no} (Score: {score:.4f}) ---\n{chunk}\nBest Topic: {topic_str}\n\n")

    print(f"Processed: {file_path} -> {output_file}")

def main():
    """Process every regular file in ``data`` into ``output-llm-semantic-chunking``."""
    input_folder = "data"
    output_folder = "output-llm-semantic-chunking"
    os.makedirs(output_folder, exist_ok=True)

    candidates = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
    for file_path in candidates:
        # Sub-directories and other non-file entries are ignored.
        if not os.path.isfile(file_path):
            continue
        process_document(file_path, output_folder)


if __name__ == "__main__":
    main()

Processed: data\dokumen_docx.docx -> output-llm-semantic-chunking\dokumen_docx_chunks.txt
Processed: data\dokumen_pdf.pdf -> output-llm-semantic-chunking\dokumen_pdf_chunks.txt
Processed: data\dokumen_txt.txt -> output-llm-semantic-chunking\dokumen_txt_chunks.txt
