In [2]:
import os, glob, heapq, nltk, time, json
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from pretfidf import vectorizer

# Create stopword remover object
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# Create a stemming object
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Load the TF-IDF matrix
tfidf_matrix = np.load('tfidf_matrix.npy', allow_pickle=True)

# Tokenisasi, penghapusan stopwords, dan stemming pada query
query = input("Masukkan kata yang ingin dicari: ")
query = stopword.remove(query)  # Menghapus stopwords dari query
query_tokens = nltk.word_tokenize(query)  # Tokenisasi query
query_tokens = [stemmer.stem(word) for word in query_tokens]  # Melakukan stemming pada query

# Transform the query into a TF-IDF vector using the same vectorizer
query_vector = vectorizer.transform([' '.join(query_tokens)])

# Calculate cosine similarity between the query and documents
cosine_scores = cosine_similarity(query_vector, tfidf_matrix)

# Load document names from the text file
with open('document_names.txt', 'r', encoding='utf-8') as f:
    document_names = [line.strip() for line in f.readlines()]

# Load the URL to document info dictionary
with open('urltodoc.json', 'r', encoding='utf-8') as f:
    url_to_document = json.load(f)

# Create a list to store the results with document titles, scores, and positions of query words
results = []

corpus_pi_path = 'C:/Users/USER/OneDrive/Desktop/SEMESTER 5/Akademik/PI-kelompok6/Koprus_PI'

# Loop through the cosine scores and check if each document contains words from the query
for idx, score in enumerate(cosine_scores[0]):
    if score > 0:
        # If the cosine score is greater than 0, it means the document contains words from the query
        document_title = document_names[idx]  # Get the real document title
        positions = []

        # Open the document and find all the positions where the query word appears
        with open(os.path.join(corpus_pi_path, document_title), 'r', encoding='utf-8') as f:
            document = f.read()

            for match in nltk.re.finditer(query, document):
                positions.append(match.start())

        # Get the URL for the document
        document_url = url_to_document[document_title]

        results.append((score, document_title, positions, document_url))

# Sort the results by cosine score in descending order
results.sort(reverse=True)

start_time = time.time()

# Print the ranking with document titles, scores, positions of query words, and URLs
print("Dokumen teratas berdasarkan cosine similarity:")
for rank, (score, document_title, positions, document_url) in enumerate(results, start=1):
    print(f"Rank: {rank}")
    print(f"Nama Dokumen: {document_title}")
    print(f"Skor Cosine Similarity: {score:.8f}")
    print(f"Posisi Index untuk Kata dalam Query: {positions}")
    print(f"URL Dokumen: {document_url}")
    print()

end_time = time.time()

print(f"Waktu eksekusi: {end_time - start_time:.2f} detik")

Dokumen teratas berdasarkan cosine similarity:
Rank: 1
Nama Dokumen: Muhammad.txt
Skor Cosine Similarity: 0.53355021
Posisi Index untuk Kata dalam Query: [4008]
URL Dokumen: https://id.wikipedia.org/wiki/Muhammad

Rank: 2
Nama Dokumen: Ali_bin_Abi_Thalib.txt
Skor Cosine Similarity: 0.23353468
Posisi Index untuk Kata dalam Query: []
URL Dokumen: https://id.wikipedia.org/wiki/Ali_bin_Abi_Thalib

Rank: 3
Nama Dokumen: Penutup_Para_Nabi.txt
Skor Cosine Similarity: 0.21627222
Posisi Index untuk Kata dalam Query: []
URL Dokumen: https://id.wikipedia.org/wiki/Penutup_Para_Nabi

Rank: 4
Nama Dokumen: Muhammad_bin_Abdul_Wahhab.txt
Skor Cosine Similarity: 0.19802410
Posisi Index untuk Kata dalam Query: []
URL Dokumen: https://id.wikipedia.org/wiki/Muhammad_bin_Abdul_Wahhab

Rank: 5
Nama Dokumen: Islam.txt
Skor Cosine Similarity: 0.18806196
Posisi Index untuk Kata dalam Query: []
URL Dokumen: https://id.wikipedia.org/wiki/Islam

Rank: 6
Nama Dokumen: Hijrah.txt
Skor Cosine Similarity: 0.17645820
