In [7]:
import json
import time

class JaccardSimilarity:
    """Rank a fixed collection of tokenized documents against a text query.

    Each document is stored as the set of its tokens, and a query is scored
    against every document with the Jaccard coefficient
    ``|Q ∩ D| / |Q ∪ D|``.
    """

    def __init__(self, tokenized_documents):
        """Store every document in *tokenized_documents* as a set of tokens."""
        self.documents = list(map(set, tokenized_documents))

    def _calculate_jaccard_similarity(self, query_terms, doc_terms):
        """Return the Jaccard coefficient of the two term collections.

        *query_terms* must be a set. The result is ``0`` when the union is
        empty, which avoids a division by zero.
        """
        shared_count = len(query_terms.intersection(doc_terms))
        combined_count = len(query_terms.union(doc_terms))
        if combined_count == 0:
            return 0
        return shared_count / combined_count

    def search(self, query):
        """Return ``(doc_index, similarity)`` pairs for matching documents.

        The query is split on whitespace. Only documents with a similarity
        greater than zero are returned, ordered from most to least similar;
        documents with equal scores keep their original relative order.
        """
        query_terms = set(query.split())

        # Score every document lazily, then keep only the positive scores.
        scored = (
            (position, self._calculate_jaccard_similarity(query_terms, terms))
            for position, terms in enumerate(self.documents)
        )
        matches = [pair for pair in scored if pair[1] > 0]

        return sorted(matches, key=lambda pair: pair[1], reverse=True)


if __name__ == "__main__":
    # Interactive entry point: load the corpus files, read one query from the
    # user, and print every document with a non-zero Jaccard similarity.

    # Load tokenized documents: each line is split on whitespace into a set
    # of tokens. Iterating the file directly avoids building an intermediate
    # list of lines.
    with open('tokenized_documents.txt', 'r', encoding='utf-8') as f:
        tokenized_documents = [set(line.split()) for line in f]

    # Create a JaccardSimilarity instance
    jaccard_similarity = JaccardSimilarity(tokenized_documents)

    # Load document names; they are looked up by the document index that
    # search() returns.
    with open('document_names.txt', 'r', encoding='utf-8') as f:
        document_names = [line.strip() for line in f]

    # Load the JSON dictionary that is queried by document name to get a URL
    with open('urltodoc.json', 'r', encoding='utf-8') as f:
        url_to_document = json.load(f)

    # Input query
    query = input("Masukkan query: ")

    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring elapsed intervals. The timed span covers both the search and
    # the printing of results, as before.
    start_time = time.perf_counter()
    # Search using Jaccard similarity
    search_results = jaccard_similarity.search(query)

    # Display search results. search() only returns entries whose similarity
    # is greater than zero, so no further filtering is needed here.
    print("\nHasil Pencarian:")
    for rank, (doc_index, similarity) in enumerate(search_results, start=1):
        document_name = document_names[doc_index]
        document_url = url_to_document.get(document_name, "N/A")
        print(f"Rank: {rank}")
        print(f"Nama Dokumen: {document_name}")
        print(f"Similaritas Jaccard: {similarity:.4f}")
        # The URL is wrapped in a matching pair of quotes (the opening quote
        # previously had no closing partner).
        print(f"Document URL: '{document_url}'\n")
    n = len(search_results)

    end_time = time.perf_counter()
    print(f"Waktu eksekusi: {end_time - start_time:.9f} detik")
    print(f"Jumlah dokumen: {n}")



Hasil Pencarian:
Rank: 1
Nama Dokumen: Khalid_bin_Sa_id.txt
Similaritas Jaccard: 0.0147
Document URL: 'https://id.wikipedia.org/wiki/Khalid_bin_Sa%27id

Rank: 2
Nama Dokumen: Al-Mutawakkil_I.txt
Similaritas Jaccard: 0.0135
Document URL: 'https://id.wikipedia.org/wiki/Al-Mutawakkil_I

Rank: 3
Nama Dokumen: Ibnu_Yunus.txt
Similaritas Jaccard: 0.0122
Document URL: 'https://id.wikipedia.org/wiki/Ibnu_Yunus

Rank: 4
Nama Dokumen: Bani_Taim.txt
Similaritas Jaccard: 0.0104
Document URL: 'https://id.wikipedia.org/wiki/Bani_Taim

Rank: 5
Nama Dokumen: Yazid_bin_Walid.txt
Similaritas Jaccard: 0.0087
Document URL: 'https://id.wikipedia.org/wiki/Yazid_bin_Walid

Rank: 6
Nama Dokumen: Pertempuran_Yamamah.txt
Similaritas Jaccard: 0.0085
Document URL: 'https://id.wikipedia.org/wiki/Pertempuran_Yamamah

Rank: 7
Nama Dokumen: Al-Humaidi.txt
Similaritas Jaccard: 0.0085
Document URL: 'https://id.wikipedia.org/wiki/Al-Humaidi

Rank: 8
Nama Dokumen: Pengepungan_Tha_if.txt
Similaritas Jaccard: 0.0083
Docum