Semantic Chunking (Cosine Similarity)

In [52]:
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
import PyPDF2

# Load the spaCy English pipeline. In this cell it is used only to split the
# input text into sentences (``doc.sents`` inside ``semantic_chunking``).
# NOTE(review): the sample output captured for this cell is Indonesian text,
# while the model loaded here is English -- confirm this is the intended model.
nlp = spacy.load("en_core_web_sm")

# Function to read text from PDF file
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, concatenated.

    The file at ``file_path`` is opened in binary mode and the text of each
    page is appended in page order, with no separator between pages.
    """
    with open(file_path, "rb") as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        # Extract while the stream is still open; the reader reads from it.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

# Function to read text from Word file
def read_word(file_path):
    """Return the text of a Word document, one paragraph per line.

    Every paragraph's text is followed by a newline character, including the
    last one.
    """
    paragraphs = Document(file_path).paragraphs
    return "".join(paragraph.text + "\n" for paragraph in paragraphs)

# Function to read text from TXT file
def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

# Function for chunking and semantic analysis
def semantic_chunking(text, threshold=0.2):
    """Group the sentences of ``text`` by pairwise TF-IDF cosine similarity.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    For each sentence, the *other* sentences whose cosine similarity to it is
    strictly greater than ``threshold`` are collected into one chunk. The
    anchor sentence itself is not included in its own chunk, and a sentence
    may appear in several chunks.

    Args:
        text: The raw text to split and group.
        threshold: Minimum (exclusive) cosine similarity for two sentences to
            be grouped together. Defaults to 0.2.

    Returns:
        A list of chunks, each a non-empty list of sentence strings. Sentences
        with no sufficiently similar neighbour produce no chunk.
    """
    # Tokenize and parse sentences
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    # With fewer than two sentences there is no pair to compare; returning
    # early also avoids fitting the vectorizer on an empty sentence list.
    if len(sentences) < 2:
        return []

    # Transform sentences into TF-IDF vectors
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Pairwise similarity of every sentence against every other sentence
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    chunks = []
    for idx, row in enumerate(cosine_sim):
        # Skip the diagonal: a sentence is always maximally similar to itself.
        similar_sentences = [
            sentences[i]
            for i, score in enumerate(row)
            if i != idx and score > threshold
        ]
        if similar_sentences:
            chunks.append(similar_sentences)

    return chunks

# Function to process file and perform semantic chunking
def process_file(file_path, output_file):
    """Extract text from a file, chunk it, print the chunks and save them.

    The reader is chosen from the file extension (``.pdf``, ``.docx`` or
    ``.txt``, case-insensitive). If fewer than 100 characters are extracted,
    a message is printed and nothing else happens. Otherwise the chunks
    returned by ``semantic_chunking`` are printed and written to
    ``output_file``.

    Args:
        file_path: Path of the input document.
        output_file: Path of the text file the chunks are written to.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
    """
    # Local import: this cell's import block does not bring ``os`` into scope.
    import os

    # splitext only looks at the final path component, so a file without an
    # extension (e.g. one literally named "pdf") is not mistaken for a PDF.
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        text = read_pdf(file_path)
    elif file_extension == '.docx':
        text = read_word(file_path)
    elif file_extension == '.txt':
        text = read_txt(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Verify that text has been extracted
    if len(text) < 100:
        print("Teks terlalu pendek untuk proses chunking.")
        return

    print("Teks berhasil diekstraksi, memulai chunking...")

    # Perform semantic chunking
    chunks = semantic_chunking(text)

    # Print results to terminal
    for number, chunk in enumerate(chunks, start=1):
        print(f"Chunk {number}:")
        for sentence in chunk:
            print(f"- {sentence}")
        print("\n")

    # Save the chunks to a text file. UTF-8 is used explicitly so that
    # non-ASCII characters in the sentences can be written on any platform.
    try:
        with open(output_file, 'w', encoding="utf-8") as out_file:
            for number, chunk in enumerate(chunks, start=1):
                out_file.write(f"Chunk {number}:\n")
                out_file.writelines(f"- {sentence}\n" for sentence in chunk)
                out_file.write("\n")
    except (OSError, UnicodeError) as e:
        # Writing is best-effort: report the failure instead of raising.
        print(f"Terjadi kesalahan saat menulis ke file: {e}")
    else:
        print(f"Hasil chunking telah disimpan di {output_file}")

# Example usage (adjust the file path accordingly)
file_path = "Dokumen.pdf"  # Replace with the file to be processed
output_file = "chunking_results.txt"  # Name of the output file
process_file(file_path, output_file)


Teks berhasil diekstraksi, memulai chunking...
Chunk 1:
- PERTAMINA   mulai benar -benar mengawali 
sejarahnya sebagai perusahaan   energi setelah melakukan perubahan nama dari PT 
Eksploitasi   Tambang Minyak Sumatera Utara menjadi PT Perusahaan   Minyak 
Nasional (PERMINA).  



Chunk 2:
- Pertamina  
 
Perjalanan panjang PT Pertamina (Persero) atau “PERTAMINA”   dalam 
menjaga ketahanan energi nasional dimulai sejak   sekitar tahun 1950 -an melalui 
pendirian PT Eksploitasi   Tambang Minyak Sumatera Utara oleh Pemerintah 
Indonesia   yang ditugaskan untuk mengelola ladang minyak di wilayah   Sumatera. 

- nama   PN  Pertambangan Minyak Nasional   (Permina). 



Chunk 3:
- Kemudian pada   20 Agustus 1968, PN Permina bergabung dengan PN   Pertamin 
sehingga menjadi sebuah perusahaan baru bernama   PN Pertambangan Minyak dan 
Gas Bumi Negara (Pertamina).  



Chunk 4:
- PERTAMINA   mulai benar -benar mengawali 
sejarahnya sebagai perusahaan   energi setelah melakukan perubahan nama dar

Semantic Chunking (Clustering)

In [54]:
import spacy
import docx
import PyPDF2
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load the spaCy English pipeline. In this cell it is used only to split the
# input text into sentences (``doc.sents`` inside ``semantic_chunking``).
# NOTE(review): the sample output captured for this cell is Indonesian text,
# while the model loaded here is English -- confirm this is the intended model.
nlp = spacy.load("en_core_web_sm")

# Function to read text from PDF file
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, concatenated.

    The file at ``file_path`` is opened in binary mode and the text of each
    page is appended in page order, with no separator between pages.
    """
    with open(file_path, "rb") as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        # Extract while the stream is still open; the reader reads from it.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

# Function to read text from Word file
def read_word(file_path):
    """Return the text of a Word document, one paragraph per line.

    Every paragraph's text is followed by a newline character, including the
    last one.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "".join(paragraph.text + "\n" for paragraph in paragraphs)

# Function to read text from TXT file
def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

# Function to perform K-means clustering on sentences
def semantic_chunking(text, num_clusters=3):
    """Cluster the sentences of ``text`` with K-means over TF-IDF vectors.

    The text is split into sentences with the module-level ``nlp`` pipeline;
    sentences that are empty after stripping whitespace are dropped.

    Args:
        text: The raw text to split and cluster.
        num_clusters: Requested number of clusters. It is capped at the number
            of sentences, because K-means cannot form more clusters than
            there are samples.

    Returns:
        A dict mapping each cluster label (int) to the list of its sentences
        in document order, with keys sorted ascending. Empty when the text
        contains no non-blank sentence.
    """
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    sentences = [sentence for sentence in stripped if sentence]

    # Nothing to cluster; also avoids fitting on an empty sentence list.
    if not sentences:
        return {}

    # Transform sentences into TF-IDF vectors
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)

    # KMeans rejects n_clusters larger than the number of samples, so short
    # documents get one cluster per sentence at most.
    n_clusters = min(num_clusters, len(sentences))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)

    # Group sentences by cluster; plain ints keep the keys free of NumPy types.
    clusters = {}
    for sentence, label in zip(sentences, kmeans.labels_):
        clusters.setdefault(int(label), []).append(sentence)

    # Sort clusters by cluster label to ensure they are in order
    return dict(sorted(clusters.items()))

# Function to process file and perform semantic chunking
def process_file(file_path, output_file, num_clusters=3):
    """Extract text from a file, cluster it, print the clusters and save them.

    The reader is chosen from the file extension (``.pdf``, ``.docx`` or
    ``.txt``, case-insensitive). If fewer than 100 characters are extracted,
    a message is printed and nothing else happens. Otherwise the clusters
    returned by ``semantic_chunking`` are printed and written to
    ``output_file``.

    Args:
        file_path: Path of the input document.
        output_file: Path of the text file the clusters are written to.
        num_clusters: Number of clusters passed on to ``semantic_chunking``.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
    """
    # splitext only looks at the final path component, so a file without an
    # extension (e.g. one literally named "pdf") is not mistaken for a PDF.
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.pdf':
        text = read_pdf(file_path)
    elif file_extension == '.docx':
        text = read_word(file_path)
    elif file_extension == '.txt':
        text = read_txt(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Verify that text has been extracted
    if len(text) < 100:
        print("Teks terlalu pendek untuk proses chunking.")
        return

    print("Teks berhasil diekstraksi, memulai chunking...")

    # Perform semantic chunking with clustering
    clusters = semantic_chunking(text, num_clusters)

    # Print results to terminal (labels are shown 1-based)
    for cluster_id, cluster_sentences in clusters.items():
        print(f"Cluster {cluster_id + 1}:")
        for sentence in cluster_sentences:
            print(f"- {sentence}")
        print("\n")

    # Save the clusters to a text file. UTF-8 is used explicitly so that
    # non-ASCII characters in the sentences can be written on any platform.
    try:
        with open(output_file, 'w', encoding="utf-8") as out_file:
            for cluster_id, cluster_sentences in clusters.items():
                out_file.write(f"Cluster {cluster_id + 1}:\n")
                out_file.writelines(
                    f"- {sentence}\n" for sentence in cluster_sentences
                )
                out_file.write("\n")
    except (OSError, UnicodeError) as e:
        # Writing is best-effort: report the failure instead of raising.
        print(f"Terjadi kesalahan saat menulis ke file: {e}")
    else:
        print(f"Hasil chunking telah disimpan di {output_file}")

# Example usage (adjust the file path accordingly)
file_path = "Dokumen.pdf"  # Replace with the file to be processed
output_file = "chunking_results1.txt"  # Name of the output file
process_file(file_path, output_file, num_clusters=3)


Teks berhasil diekstraksi, memulai chunking...
Cluster 1:
- Desember 1957,
- Pada tanggal 1 Juli 1961 sebagaimana ditetapkan melalui   Peraturan 
Pemerintah No. 198 Tahun 1961, Permina ditetapkan   menjadi sebuah   Perusahaan 
Negara   (PN) dengan
- nama   PN  Pertambangan Minyak Nasional   (Permina).
- Kemudian pada   20 Agustus 1968, PN Permina bergabung dengan PN   Pertamin 
sehingga menjadi sebuah perusahaan baru bernama   PN Pertambangan Minyak dan 
Gas Bumi Negara (Pertamina).
- Selang beberapa tahun kemudian tepatnya tanggal 15   Desember 1971, 
Pemerintah mengatur peran PN Pertamina   untuk menghasilkan dan mengolah migas 
dari ladang -ladang   minyak serta menyediakan kebutuhan bahan bakar dan gas   di 
Indonesia sehingga nama PN Pertamina diubah menjadi   Perusahaan Pertambangan 
Minyak dan Gas Bumi Negara.
- Dalam tonggak sejarah berikutnya, melalui PP No. 31 
Tahun   2003 tanggal 17 September 2003, Perusahaan Pertambangan   Minyak dan 
Gas Bumi Negara berubah nama menjadi  