In [7]:
import os
import glob
import heapq
import nltk
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import tokenized

# Build the Sastrawi stopword remover that is applied to the raw query.
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# Build the Sastrawi stemmer that is applied to each query token.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Load the corpus: every line of the file becomes one document. Runs of
# whitespace are collapsed so each document is a single-space-joined string.
with open('tokenized_documents.txt', 'r', encoding='utf-8') as f:
    documents = [' '.join(line.split()) for line in f]

# Preprocess the query: remove stopwords, tokenize, then stem each token.
query = input("Masukkan kata yang ingin dicari: ")
query = stopword.remove(query)
query_tokens = [stemmer.stem(word) for word in nltk.word_tokenize(query)]
# Drop tokens that came back empty from the stemmer (presumably punctuation-only
# tokens -- confirm against Sastrawi). An empty string is not a search term and
# would otherwise be counted as one by the matching and Jaccard steps.
query_tokens = [token for token in query_tokens if token]

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True)

# Fit the vocabulary on the documents and compute their TF-IDF vectors.
tfidf_matrix = vectorizer.fit_transform(documents)
features = vectorizer.get_feature_names_out()

# Project the preprocessed query into the same TF-IDF space.
query_vector = vectorizer.transform([' '.join(query_tokens)])

# Cosine similarity between the single query vector and every document vector.
cosine_scores = cosine_similarity(query_vector, tfidf_matrix)

# Collect the indices of documents that contain at least one query token.
# Each document is a string, so it is split into whole tokens first; testing
# `word in document` directly would be a substring match ("ani" in "manis").
query_terms = set(query_tokens)
relevant_documents = [
    i
    for i, document in enumerate(documents)
    if not query_terms.isdisjoint(document.split())
]

# Function to calculate Jaccard Similarity
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicate items are ignored.
    The result is the size of the intersection divided by the size of the
    union. When both collections are empty the function returns ``0``
    rather than dividing by zero.
    """
    left, right = set(set1), set(set2)
    total = len(left | right)
    if total == 0:
        return 0
    shared = len(left & right)
    return shared / total

def _print_ranking(heading, score_label, ranked_indices, scores, names, docs, terms):
    """Print one ranked result list.

    For every document index in ``ranked_indices`` (already in ranking order)
    this prints its rank, its name from ``names``, its score from ``scores``
    under ``score_label``, and the positions of the whitespace-separated
    tokens of ``docs[index]`` that are members of ``terms``.
    """
    print(heading)
    for rank, doc_index in enumerate(ranked_indices, start=1):
        print(f"Rank: {rank}")
        print(f"Nama Dokumen: {names[doc_index]}")
        print(f"{score_label}: {scores[doc_index]}")
        positions = [
            idx for idx, word in enumerate(docs[doc_index].split()) if word in terms
        ]
        print(f"Posisi Index untuk Kata dalam Query: {positions}")
        print()


# Jaccard similarity between the query tokens and the tokens of every document.
jaccard_scores = [
    jaccard_similarity(query_tokens, nltk.word_tokenize(document))
    for document in documents
]

# cosine_scores has one row per query; index 0 is the row that gets ranked.
cosine_row = cosine_scores[0]

# Rank only the relevant documents, highest score first. Asking
# heapq.nlargest for every element is a full descending sort, so use sorted().
jaccard_indices = sorted(
    relevant_documents, key=lambda i: jaccard_scores[i], reverse=True
)
cosine_indices = sorted(
    relevant_documents, key=lambda i: cosine_row[i], reverse=True
)

# Get the document names
# NOTE(review): assumes tokenized.files is in the same order as `documents`
# -- confirm against the tokenized module.
document_names = [os.path.basename(path) for path in tokenized.files]

# Set of query tokens, built once, for the per-word membership test.
query_token_set = set(query_tokens)

# Print the ranking, document name, score, and word positions in the document
_print_ranking(
    "Dokumen teratas berdasarkan cosine similarity:",
    "Skor Cosine Similarity",
    cosine_indices,
    cosine_row,
    document_names,
    documents,
    query_token_set,
)
_print_ranking(
    "Dokumen teratas berdasarkan Jaccard Similarity:",
    "Skor Jaccard Similarity",
    jaccard_indices,
    jaccard_scores,
    document_names,
    documents,
    query_token_set,
)

# Save the document names to a file, one name per line.
with open('document_names.txt', 'w', encoding='utf-8') as txt_file:
    txt_file.write('\n'.join(document_names))

print("Nama dokumen telah disimpan dalam file: document_names.txt")


Dokumen teratas berdasarkan cosine similarity:
Rank: 1
Nama Dokumen: Muhammad.txt
Skor Cosine Similarity: 0.5335502064450063
Posisi Index untuk Kata dalam Query: [6, 8, 60, 62, 68, 70, 120, 135, 152, 168, 176, 214, 227, 259, 269, 300, 316, 330, 356, 381, 388, 399, 400, 434, 447, 521, 530, 547, 549, 620, 635, 747, 749, 751, 759, 767, 773, 808, 812, 836, 857, 863, 897, 907, 918, 932, 978, 1008, 1072, 1080, 1090, 1099, 1103, 1142, 1162, 1164, 1185, 1214, 1285, 1288, 1343, 1367, 1370, 1385, 1425, 1465, 1489, 1514, 1546, 1602, 1669, 1684, 1717, 1744, 1749, 1771, 1806, 1827, 1861, 1881, 1929, 1972, 1992, 2024, 2062, 2077, 2097, 2110, 2140, 2174, 2222, 2239, 2275, 2314, 2335, 2361, 2365, 2381, 2398, 2428, 2440, 2450, 2454, 2459, 2480, 2501, 2514, 2549, 2574, 2591, 2699, 2715, 2928, 2936, 2953, 2972, 2975, 3044, 3048, 3063, 3126, 3139, 3162, 3183, 3226, 3243, 3312, 3322, 3329, 3412, 3427, 3476, 3480, 3490, 3509, 3525, 3548, 3554, 3563, 3585, 3610, 3623, 3659, 3695, 3698, 3709, 3735, 3746, 3770