In [None]:
import os
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists read via
# stopwords.words(...) and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

## Βήμα 1: Συλλογή Δεδομένων
Τα δεδομένα που χρησιμοποιήθηκαν για την υλοποίηση της εργασίας βρίσκονται στον εξής σύνδεσμο: https://www.kaggle.com/datasets/sameersmahajan/people-wikipedia-data

In [None]:
# Load the raw dataset; later cells read its 'URI', 'name' and 'text' columns.
df = pd.read_csv('in/pure_people_wiki.csv')

# Shared preprocessing state used by the helper functions defined further down:
# a set of English stop words (set membership tests are fast) and one reusable lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

## Βήμα 2: Προεπεξεργασία Κειμένου (Text Preprocessing)

In [None]:
def tokenize(text):
    """Split raw text into lower-cased, whitespace-separated tokens.

    Tokens are lower-cased so they can match the lower-case stop-word list and
    the lower-cased query terms used by the search step.

    Args:
        text: The document text. Non-string values (e.g. None, or the float NaN
            that pandas uses for a missing cell) are treated as empty documents.

    Returns:
        A list of tokens; empty for empty or missing text.
    """
    # Missing cells are not strings and have no .split(); treat them as empty.
    if not isinstance(text, str):
        return []
    return text.lower().split()

def remove_punctuation(words):
    """Strip leading/trailing punctuation from tokens and drop empty results.

    The previous check (`word not in string.punctuation`) was a substring test
    against the punctuation string, so it only discarded tokens that happened to
    be a contiguous slice of it and never cleaned tokens such as "jazz,".

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with punctuation removed from both ends of every token.
        Tokens consisting only of punctuation are dropped; punctuation inside
        a token (e.g. "o'neil", "well-known") is preserved.
    """
    cleaned_words = []
    for word in words:
        stripped = word.strip(string.punctuation)
        # A token made only of punctuation strips down to "", so skip it.
        if stripped:
            cleaned_words.append(stripped)
    return cleaned_words

def filter_stop_words(words):
    """Return the tokens of `words`, in order, that are not in the global `stop_words`."""
    return [token for token in words if token not in stop_words]

def lemmatize_words(words):
    """Run every token through the global `lemmatizer` and return the results in order."""
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def preprocess_text(text):
    """Turn one raw document into its list of processed tokens.

    The text is passed through the four preprocessing helpers in a fixed order:
    tokenization, punctuation removal, stop-word removal and lemmatization.
    Each stage receives the output of the previous one.
    """
    pipeline = (tokenize, remove_punctuation, filter_stop_words, lemmatize_words)
    result = text
    for stage in pipeline:
        result = stage(result)
    return result

# Run the preprocessing pipeline over the raw text of every document
processed_column = df['text'].apply(preprocess_text)
df['processed_text'] = processed_column

# Preview the identifying columns next to the processed tokens
preview_columns = ['URI', 'name', 'processed_text']
print(df[preview_columns].head())

## Βήμα 3: Ευρετήριο (Indexing)
### α. Δημιουργία inverted index

In [None]:
# Number of documents to index (kept small so the dense term matrix stays manageable).
MAX_DOCS = 1000

# corpus maps each document URI to {token: number of occurrences in that document}.
corpus = {}

# Pair every URI with its own token list positionally, so the result does not
# depend on the DataFrame having a default 0..n-1 index.
for uri, tokens in zip(df['URI'][:MAX_DOCS], df['processed_text'][:MAX_DOCS]):
    # Counter tallies all tokens in a single pass; calling list.count() once per
    # token would rescan the whole document for every token.
    corpus[uri] = dict(Counter(tokens))

# Convert the nested dict to a term-frequency table: after the transpose each row
# is a document URI and each column a term; terms absent from a document get 0.
# NOTE: this rebinds `df` (the later cells expect the index under this name), so
# re-running this cell requires re-running the loading/preprocessing cells first.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df.head()


### β. Αποθήκευση του Inverted Index σε .csv αρχείο

In [None]:
# Save the index to a csv file, writing the row labels (the URIs) as the first column.
# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('out', exist_ok=True)
df.to_csv('out/results.csv', index=True)

Μετατροπή του .csv αρχείου σε dictionary

In [None]:
# Read the saved index back; its first column becomes the row index
df = pd.read_csv('out/results.csv')
df = df.set_index(df.columns[0])

# One entry per row: {row label: {column name: value}}
data_dict = df.to_dict(orient='index')

# Show the first 5 entries of the dictionary
first_entries = list(data_dict.items())[:5]
for uri, terms in first_entries:
    print(f"Document URI: {uri}, Terms: {terms}")

## Βήμα 4: Μηχανή Αναζήτησης
### α. Επεξεργασία Ερωτήματος (Query Processing)

In [None]:
def _matching_uris(data_dict, term):
    """Return the set of URIs whose term-count mapping has a positive count for `term`."""
    return {uri for uri, terms in data_dict.items() if terms.get(term, 0) > 0}


def search_query_dict(data_dict, query):
    """Evaluate a flat Boolean query against a URI -> {term: count} mapping.

    The query has the form ``term (OPERATOR term)*`` with whitespace-separated
    tokens, e.g. ``"musician AND jazz NOT rock"``. Operators (AND, OR, NOT) are
    applied strictly left to right, with no precedence and no parentheses.
    ``A NOT B`` keeps the documents matching A that do not contain B. Terms are
    lower-cased and operators upper-cased before use.

    A term that occurs in no document is reported and treated as matching no
    documents, so ``"jazz OR unknownterm"`` still returns the jazz documents.

    Args:
        data_dict: Mapping of document URI to a mapping of term -> count.
        query: The query string.

    Returns:
        The set of matching URIs. An empty set is returned (after printing a
        message) when the query is empty, ends with a dangling operator, or
        uses an operator other than AND / OR / NOT.
    """
    tokens = query.split()  # Split the query into separate tokens
    print("Tokenized query: ", tokens)

    # "term (OPERATOR term)*" always has an odd number of tokens; an even count
    # means the query is empty or an operator/term is missing.
    if len(tokens) % 2 == 0:
        print(f"Malformed query '{query}': expected 'term [AND|OR|NOT term] ...'.")
        return set()

    def lookup(raw_term):
        # Resolve one query term to the set of documents containing it.
        term = raw_term.lower()
        uris = _matching_uris(data_dict, term)
        if not uris:
            print(f"Term '{term}' not found in the documents.")
        return uris

    combined_results = lookup(tokens[0])

    # Walk the remaining tokens as (operator, term) pairs.
    for raw_operator, raw_term in zip(tokens[1::2], tokens[2::2]):
        operator = raw_operator.upper()
        if operator not in ("AND", "OR", "NOT"):
            # Do not silently skip part of the query: a result computed from a
            # partially understood query would be misleading.
            print(f"Unknown operator '{raw_operator}': expected AND, OR or NOT.")
            return set()

        term_uris = lookup(raw_term)
        if operator == "AND":
            combined_results &= term_uris  # Intersection for AND
        elif operator == "OR":
            combined_results |= term_uris  # Union for OR
        else:
            combined_results -= term_uris  # Difference for NOT

    print(f"Combined result URIs: {combined_results}")

    # Return the final result set of matching URIs
    return combined_results

In [None]:
# Try the search function on a sample two-term AND query
query = "musician AND jazz"
matching_uris = search_query_dict(data_dict, query)
# Show the matching document URIs and how many there are
print("Final matching URIs: ", matching_uris)
print("Number of matching URIs: ", len(matching_uris))