In [None]:

# Install required libraries
!pip install Arabic-Stopwords

# Import necessary libraries
import pandas as pd
import numpy as np
import re
from snowballstemmer import stemmer
from tqdm import tqdm
import arabicstopwords.arabicstopwords as stp


In [None]:

# Materialize the Arabic stop-word list as a set once, so the membership
# checks in remove_stop_words are hash lookups rather than list scans.
stopWords = {*stp.stopwords_list()}

# Function to remove stop words
def remove_stop_words(sentence):
    """Drop stop words from a sentence.

    Args:
        sentence: Text to filter; it is tokenized with ``str.split()``.

    Returns:
        The tokens that are not in the module-level ``stopWords`` set,
        in their original order and re-joined with single spaces.
    """
    return " ".join(word for word in sentence.split() if word not in stopWords)

# Function to normalize Arabic text
def normalize(text):
    """Collapse common Arabic spelling variants onto one canonical letter.

    The substitutions run in the order listed below: alef variants become a
    bare alef, alef maksura becomes yeh, hamza carried on waw or yeh becomes
    a standalone hamza, and teh marbuta becomes heh.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with every listed variant replaced; all other characters
        are left untouched.
    """
    # (regex pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Arabic stemmer from the snowballstemmer package, created once when this
# cell runs and reused by stem() for every word.
ar_stemmer = stemmer("arabic")

# Function to stem words
def stem(sentence):
    """Stem every word of a sentence.

    Args:
        sentence: Text to stem; it is tokenized with ``str.split()``.

    Returns:
        The result of ``ar_stemmer.stemWord`` for each token, in order,
        joined with single spaces.
    """
    stemmed_words = []
    for word in sentence.split():
        stemmed_words.append(ar_stemmer.stemWord(word))
    return " ".join(stemmed_words)

# Full preprocessing pipeline
def preprocess(sentence):
    """Run the full text-cleaning pipeline on one sentence.

    The stages run in this order: stop-word removal, character
    normalization, stemming. Stop words are therefore matched against the
    spelling as written, before any normalization is applied.

    Args:
        sentence: Raw text.

    Returns:
        The processed text as a single space-separated string.
    """
    without_stop_words = remove_stop_words(sentence)
    return stem(normalize(without_stop_words))


In [None]:

# Function to build an inverted index
def build_inverted_index(documents):
    """Map each preprocessed term to the set of documents that contain it.

    Args:
        documents: Iterable of raw text strings. A document's ID is its
            zero-based position in the iteration order.

    Returns:
        A dict mapping each term (str) to a set of int document IDs.
    """
    index = {}
    for position, raw_text in enumerate(documents):
        # Index terms are whatever survives preprocess(), split on whitespace.
        for token in preprocess(raw_text).split():
            # setdefault creates the posting set the first time a term is seen.
            index.setdefault(token, set()).add(position)
    return index

# Tiny demo corpus of three Arabic sentences; build_inverted_index numbers
# them 0, 1, 2 by their position in this list.
documents = [
    "هذا هو النص الأول للتجربة",
    "النص الثاني يحتوي على كلمات مختلفة",
    "النص الثالث يحتوي على كلمات متشابهة مع النص الأول"
]

# Index the demo corpus.
inverted_index = build_inverted_index(documents)

# Print one line per indexed term with its document IDs in ascending order.
for term in inverted_index:
    doc_ids = inverted_index[term]
    print(f"{term}: {sorted(doc_ids)}")


In [None]:

# Function to query the inverted index
def query_inverted_index(term, inverted_index):
    """Return the IDs of the documents that match a query.

    The query goes through the same ``preprocess`` pipeline as the indexed
    documents, so it is compared against index keys in the same form.

    An index produced by ``build_inverted_index`` is keyed by single
    whitespace-free tokens, so a query that preprocesses to several tokens
    can never equal one key. Such a query is answered token by token: a
    document matches only if it contains every remaining token (AND
    semantics). A single-word query behaves exactly like a plain key lookup.

    Args:
        term: Raw query text (one or more words).
        inverted_index: Mapping of term -> collection of document IDs, as
            returned by ``build_inverted_index``.

    Returns:
        A new set of matching document IDs. It is empty when nothing is left
        of the query after preprocessing (for example, only stop words) or
        when any query token is absent from the index. The set is always a
        fresh object, so mutating it cannot alter the index's postings.
    """
    tokens = preprocess(term).split()
    if not tokens:
        return set()

    # Copy the first posting set so the index's own set is never handed out.
    matches = set(inverted_index.get(tokens[0], ()))
    for token in tokens[1:]:
        if not matches:
            # The intersection is already empty; further tokens cannot add IDs.
            break
        matches.intersection_update(inverted_index.get(token, ()))
    return matches

# Demo lookup: a single word, preprocessed inside query_inverted_index before
# it is matched against the index built above.
query_term = "التجربة"
result = query_inverted_index(term=query_term, inverted_index=inverted_index)

# Show the matching document IDs in ascending order.
print(f"Documents containing '{query_term}': {sorted(result)}")
