In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re



In [None]:
class TextPreprocessor:
    """
    Lowercases, strips non-letter characters from, and removes stopwords
    from plain text.
    """

    # Built-in English stopwords used when the caller does not supply a list.
    DEFAULT_STOPWORDS = ('the', 'a', 'an', 'in', 'to', 'for', 'of', 'and', 'is', 'are')

    def __init__(self, stopwords=None):
        """
        Initialize text preprocessing utilities

        :param stopwords: Optional iterable of words to drop in
            remove_stopwords(); when omitted, DEFAULT_STOPWORDS is used.
        """
        chosen = self.DEFAULT_STOPWORDS if stopwords is None else stopwords
        # Stored as a set so membership checks are constant time.
        self.stopwords = set(chosen)

    def clean_text(self, text):
        """
        Clean and normalize text

        :param text: Raw text, or None
        :return: Lowercased text containing only the letters a-z, with
            words separated by single spaces; '' when text is None
        """
        # A missing document is treated as an empty one.
        if text is None:
            return ''

        text = text.lower()

        # Drop apostrophes outright so contractions stay one token
        # ("don't" -> "dont").
        text = re.sub(r"['\u2019]", '', text)

        # Every other non-letter becomes a space rather than being deleted;
        # deleting would fuse neighbouring words
        # ("state-of-the-art" -> "stateoftheart").
        text = re.sub(r'[^a-z\s]', ' ', text)

        # Collapse whitespace runs and trim both ends.
        return ' '.join(text.split())

    def remove_stopwords(self, text):
        """
        Remove stopwords from text

        :param text: Whitespace-separated text
        :return: The remaining words joined by single spaces
        """
        # The comparison is case-sensitive, so text should already have been
        # lowercased (e.g. by clean_text) for the default list to match.
        return ' '.join(word for word in text.split()
                        if word not in self.stopwords)

In [None]:
class KnowledgeBaseVectorizer:
    """
    Turns a list of documents into TF-IDF vectors and scores a query
    against them with cosine similarity.
    """

    # Methods that vectorize() actually implements.
    SUPPORTED_METHODS = ('tfidf',)

    def __init__(self, vectorization_method='tfidf'):
        """
        Initialize vector representation of knowledge base

        :param vectorization_method: Choice of vectorization. Only 'tfidf'
            is implemented; any other value makes vectorize() raise
            ValueError.
        """
        self.preprocessor = TextPreprocessor()
        self.vectorization_method = vectorization_method

        # Both stay None until vectorize() has completed successfully.
        self.vectorizer = None
        self.vectors = None

    def _preprocess(self, text):
        """Clean one text and strip its stopwords (shared by documents and queries)."""
        cleaned_text = self.preprocessor.clean_text(text)
        return self.preprocessor.remove_stopwords(cleaned_text)

    def prepare_documents(self, documents):
        """
        Preprocess and clean documents

        :param documents: List of raw text documents
        :return: List of cleaned documents, in the same order
        """
        return [self._preprocess(doc) for doc in documents]

    def vectorize(self, documents):
        """
        Convert documents to vector representations

        :param documents: List of raw text documents (they are cleaned here)
        :return: Vector representations, one row per document
        :raises ValueError: If vectorization_method is not supported
        """
        # Fail loudly instead of silently returning None for a method that
        # has no implementation.
        if self.vectorization_method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unsupported vectorization method {self.vectorization_method!r}; "
                f"supported methods: {', '.join(self.SUPPORTED_METHODS)}"
            )

        prepared_docs = self.prepare_documents(documents)

        # Fit on locals first so a failed fit cannot leave the instance with
        # a vectorizer but no vectors.
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(prepared_docs)
        self.vectorizer, self.vectors = vectorizer, vectors

        return self.vectors

    def compute_similarity(self, query):
        """
        Compute similarity between query and knowledge base

        :param query: User's query text
        :return: Similarity scores, one per document, in the order the
            documents were passed to vectorize()
        :raises RuntimeError: If vectorize() has not been called yet
        """
        if self.vectorizer is None or self.vectors is None:
            raise RuntimeError(
                "Knowledge base has not been vectorized; call vectorize() first"
            )

        # The query must go through the same preprocessing as the documents.
        cleaned_query = self._preprocess(query)

        # transform (not fit_transform): reuse the vocabulary learned from
        # the knowledge base.
        query_vector = self.vectorizer.transform([cleaned_query])

        similarities = cosine_similarity(query_vector, self.vectors)

        # There is a single query, so return its row of scores.
        return similarities[0]

In [None]:
class SimilarityMatcher:
    """
    Ranks knowledge-base documents by their similarity to a user query.
    """

    def __init__(self, knowledge_base_documents):
        """
        Initialize similarity matching system

        :param knowledge_base_documents: List of documents from PDF
        """
        self.vectorizer = KnowledgeBaseVectorizer()
        self.knowledge_base_vectors = self.vectorizer.vectorize(knowledge_base_documents)

    def find_most_similar(self, user_query, top_k=3):
        """
        Find most similar documents to user query

        :param user_query: User's input text
        :param top_k: Maximum number of results to return; values of zero
            or below yield empty results, and values above the number of
            documents return every document
        :return: Tuple (indices, scores): positions of the best-matching
            documents in the original document list, and their similarity
            scores, both ordered from most to least similar
        """
        similarities = self.vectorizer.compute_similarity(user_query)

        # Clamp first: a negative count would otherwise be read as a
        # from-the-end slice bound.
        result_count = max(top_k, 0)

        # Reverse the ascending argsort, then take the head. Slicing the
        # tail with [-top_k:] breaks for top_k == 0, because [-0:] selects
        # the entire array.
        top_indices = similarities.argsort()[::-1][:result_count]
        top_similarities = similarities[top_indices]

        return top_indices, top_similarities


In [None]:
# Example Usage
def main():
    """
    Demo: build a matcher over three sample sentences, run one query, and
    print each returned document with its similarity score.
    """
    # Stand-in for text that would be extracted from a PDF
    pdf_documents = [
        "Machine learning is a subset of artificial intelligence",
        "Neural networks are computational models inspired by biological neural networks",
        "Deep learning involves multiple layers of neural networks"
    ]

    # Build the matcher over the sample knowledge base
    engine = SimilarityMatcher(pdf_documents)

    # Test 1: Indonesian wording around the English term "neural network"
    # (an all-English alternative: "Tell me about neural networks")
    question = "Apa itu neural network"

    # (indices, scores) pair, best match first
    ranking = engine.find_most_similar(question)

    # Walk indices and scores in lockstep and report each hit
    for idx, score in zip(*ranking):
        print(f"Document: {pdf_documents[idx]}")
        print(f"Similarity Score: {score}")

if __name__ == "__main__":
    main()

Document: Neural networks are computational models inspired by biological neural networks
Similarity Score: 0.49022339633833373
Document: Deep learning involves multiple layers of neural networks
Similarity Score: 0.3175701804283441
Document: Machine learning is a subset of artificial intelligence
Similarity Score: 0.0
