In [1]:
import os
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK data packages this notebook relies on:
# the 'punkt' tokenizer data and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Build the Sastrawi stemmer through its factory
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

# Function to read and preprocess documents
def preprocess_document(file_path, stemmer, stopwords_ind):
    """Read one text file and return its stemmed text, stopword hits and URLs.

    Args:
        file_path: Path of the text file; it is opened as UTF-8.
        stemmer: Object exposing ``stem(word)``; called once per kept token.
        stopwords_ind: Collection of stopwords; only ``in`` membership is used.

    Returns:
        A 3-tuple ``(text, stopwords_from_doc, urls)``:
        * ``text`` - the stemmed alphanumeric tokens joined by single spaces.
        * ``stopwords_from_doc`` - stemmed tokens that are members of
          ``stopwords_ind``, in document order (repeats are kept).
        * ``urls`` - every http/https URL found in the file, with the casing
          it has in the file.
    """
    # Only the read needs the file handle; release it before the slow stemming.
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_content = file.read()

    # Extract URLs from the raw text, *before* lowercasing: URL paths and
    # query strings are case-sensitive, so a lowercased URL may point nowhere.
    # IGNORECASE keeps upper-case schemes ("HTTP://") matching, as they did
    # when the search ran on lowercased text.
    urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        raw_content,
        flags=re.IGNORECASE,
    )

    # Convert to lowercase
    content = raw_content.lower()
    # Tokenize the content
    words = word_tokenize(content)
    # Remove punctuation and keep alphanumeric tokens
    words = [word for word in words if word.isalnum()]
    # Stem the words using the supplied stemmer
    stemmed_words = [stemmer.stem(word) for word in words]
    # Collect the stemmed tokens that are stopwords
    stopwords_from_doc = [word for word in stemmed_words if word in stopwords_ind]

    return ' '.join(stemmed_words), stopwords_from_doc, urls

# Indonesian stopwords from NLTK, held as a set
stopwords_ind = set(stopwords.words('indonesian'))

# Folder holding the raw .txt corpus
corpus_folder = 'Koprus_PI'  # Replace with your corpus folder path

# Folder that receives the stemmed text of every document
output_dir = 'processed_texts'
os.makedirs(output_dir, exist_ok=True)

# One (filename, urls) tuple per processed document, in processing order
document_urls = []

# Preprocess each .txt file, write its stemmed text to output_dir and remember its URLs
for filename in os.listdir(corpus_folder):
    if not filename.endswith('.txt'):
        continue  # anything that is not a .txt file is skipped

    file_path = os.path.join(corpus_folder, filename)
    content, doc_stopwords, urls = preprocess_document(file_path, stemmer, stopwords_ind)

    # "<name>.txt" is written out as "<name>_processed.txt"
    base_name = os.path.splitext(filename)[0]
    output_file_path = os.path.join(output_dir, f'{base_name}_processed.txt')
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(content)

    # Keep the URLs together with the file they came from
    document_urls.append((filename, urls))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/razanfawwaz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/razanfawwaz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Function to load processed words from files
def load_processed_words(directory):
    """Return the text of every ``*_processed.txt`` file in ``directory``.

    Files are read as UTF-8. The result is a list with one string per
    matching file, in the order ``os.listdir`` yields the names; entries
    whose name does not end in ``_processed.txt`` are ignored.
    """
    suffix = '_processed.txt'
    texts = []
    for entry in os.listdir(directory):
        if not entry.endswith(suffix):
            continue
        with open(os.path.join(directory, entry), 'r', encoding='utf-8') as handle:
            texts.append(handle.read())
    return texts

# Read back every stemmed document that was written to output_dir ('processed_texts')
processed_words = load_processed_words(directory=output_dir)

In [3]:
# Inverted index: word -> list of positions (in processed_words) of documents containing it
word_to_doc_index = {}

# Walk every document and record its position under each of its words.
# The position is appended once per occurrence, so a document that repeats
# a word shows up repeatedly in that word's list.
for idx, content in enumerate(processed_words):
    # Whitespace-separated tokens of this document
    words = content.split()
    for word in words:
        # setdefault creates the empty list the first time a word is seen
        word_to_doc_index.setdefault(word, []).append(idx)

import json

# Destination of the serialized index
index_file_path = 'word_to_doc_index.json'

# Render the word-to-document index as pretty-printed JSON (non-ASCII kept as-is) ...
serialized_index = json.dumps(word_to_doc_index, ensure_ascii=False, indent=4)

# ... and write it out as UTF-8
with open(index_file_path, 'w', encoding='utf-8') as index_file:
    index_file.write(serialized_index)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stopword list for the vectorizer: the Indonesian stopwords followed by
# doc_stopwords. doc_stopwords is the variable left over from Cell 1's corpus
# loop, i.e. the stopwords found in the last document that loop processed.
combined_stopwords = [*stopwords_ind, *doc_stopwords]

# Fit TF-IDF on the stemmed documents, dropping the combined stopwords
vectorizer = TfidfVectorizer(stop_words=combined_stopwords)
tfidf_matrix = vectorizer.fit_transform(processed_words)



In [9]:
import webbrowser

# Read the search keywords and normalise them the same way the documents were:
# lowercase, then Sastrawi stemming. Without stemming, an inflected query word
# would not match the stemmed vocabulary the vectorizer was fitted on.
query = input('Enter search keywords: ')
query = stemmer.stem(query.lower())

# Transform the query into TF-IDF representation
query_vector = vectorizer.transform([query])

# Score every document against the query (dot product of each TF-IDF row with the query vector)
cosine_similarities = tfidf_matrix.dot(query_vector.T).toarray().flatten()

# Positions of the (at most) five best-scoring documents, best first
sorted_document_indices = cosine_similarities.argsort()[::-1][:5]
num_results = len(sorted_document_indices)  # fewer than 5 when the corpus is small

# NOTE(review): the "Precision"/"Recall"/"F1" printed below are overlap ratios
# between doc_stopwords and each document's distinct terms; they are not
# computed from relevance judgements. Confirm this is the intended metric.
stopword_terms = set(doc_stopwords)  # loop-invariant, so built once

# Display the results (rank, title, similarity and the overlap ratios)
for rank, doc_idx in enumerate(sorted_document_indices, start=1):
    print(rank)
    doc_terms = set(processed_words[doc_idx].split())
    overlap = len(stopword_terms & doc_terms)
    # Each ratio falls back to 0.0 instead of raising ZeroDivisionError when
    # its denominator is empty / zero.
    precision = overlap / len(stopword_terms) if stopword_terms else 0.0
    recall = overlap / len(doc_terms) if doc_terms else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    # The title is taken from the same (filename, urls) record that supplies
    # the URLs opened below, rather than from a fresh os.listdir(corpus_folder),
    # which is not filtered to .txt files and so can be indexed differently.
    # NOTE(review): this still presumes processed_words and document_urls are
    # in the same document order - verify.
    print("Title:", os.path.splitext(document_urls[doc_idx][0])[0])
    print("Similarity Score:", cosine_similarities[doc_idx])
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)
    print("=" * 50)


# Let the user pick one of the listed results; skipped when nothing was listed.
while num_results:
    # Only the integer conversion is guarded, so a ValueError raised while
    # opening URLs is not misreported as bad input.
    try:
        num = int(input(f"Select the desired website number (1-{num_results}): "))
    except ValueError:
        print("Please enter a valid number.")
        continue

    # Accept only ranks that were actually displayed
    if 1 <= num <= num_results:
        # Get the document index corresponding to the user's choice
        selected_index = sorted_document_indices[num - 1]

        # Open every URL recorded for the selected document
        for url in document_urls[selected_index][1]:
            webbrowser.open(url)
        break  # Exit the loop after opening the URLs

    print(f"Invalid choice. Please enter a number between 1 and {num_results}.")

1
Title: Surah_Hud
Similarity Score: 0.5803544581622448
Precision: 0.9090909090909091
Recall: 0.02599090318388564
F1: 0.05053695514845231
2
Title: Nuh_(tokoh_Alkitab)
Similarity Score: 0.30141471241201273
Precision: 0.4772727272727273
Recall: 0.21875
F1: 0.3
3
Title: Surah_Al-Falaq
Similarity Score: 0.25711898252231713
Precision: 0.9090909090909091
Recall: 0.05442176870748299
F1: 0.10269576379974325
4
Title: Kafir
Similarity Score: 0.2164151779402389
Precision: 0.9545454545454546
Recall: 0.028493894165535955
F1: 0.05533596837944664
5
Title: Salat_Taubat
Similarity Score: 0.21419728426608728
Precision: 0.8863636363636364
Recall: 0.026530612244897958
F1: 0.05151915455746367
