In [71]:
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources used by this notebook: the stop-word lists
# (read through nltk.corpus.stopwords) and the WordNet data
# (presumably what WordNetLemmatizer relies on).
nltk.download('stopwords')
nltk.download('wordnet')

## Βήμα 1: Συλλογή Δεδομένων
Τα δεδομένα που χρησιμοποιήθηκαν για την υλοποίηση της εργασίας βρίσκονται στον εξής σύνδεσμο: https://www.kaggle.com/datasets/sameersmahajan/people-wikipedia-data

In [73]:
# Load the dataset from the local CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('in/pure_people_wiki.csv')

# Shared helpers for the preprocessing functions defined further down.
stop_words = set(stopwords.words('english'))  # English stop words, stored as a set for fast membership tests
lemmatizer = WordNetLemmatizer()              # one lemmatizer instance, reused for every token

## Βήμα 2: Προεπεξεργασία Κειμένου (Text Preprocessing)

In [None]:
def tokenize(text):
    """Split a text into lower-case, whitespace-separated tokens.

    The text is case-folded before splitting so that the tokens are
    comparable with the lower-case stop-word list and with the query
    terms, which the search functions lower-case before the lookup.

    Args:
        text: The raw document text. Anything that is not a string
            (for example a missing value such as NaN or None) is treated
            as an empty document.

    Returns:
        list[str]: The tokens in their original order; an empty list for
        empty or non-string input.
    """
    # Missing cells in a DataFrame column are floats (NaN), not strings.
    if not isinstance(text, str):
        return []
    return text.lower().split()

def remove_punctuation(words):
    """Strip punctuation from the edges of every token.

    Leading and trailing ASCII punctuation characters are removed from
    each token (``"jazz,"`` -> ``"jazz"``, ``"(1959)"`` -> ``"1959"``).
    Punctuation inside a token, such as the apostrophe in ``"it's"``, is
    kept. Tokens that consist only of punctuation become empty and are
    dropped.

    Args:
        words: Iterable of string tokens.

    Returns:
        list[str]: The cleaned, non-empty tokens in their original order.
    """
    cleaned_words = []
    for word in words:
        # A plain `word not in string.punctuation` is a substring test: it only
        # drops tokens that happen to be a slice of the punctuation string and
        # leaves punctuation attached to real words untouched.
        stripped = word.strip(string.punctuation)
        if stripped:
            cleaned_words.append(stripped)
    return cleaned_words

def filter_stop_words(words):
    """Return the tokens of `words` that are not in the module-level `stop_words` set.

    The relative order of the remaining tokens is preserved.
    """
    return [token for token in words if token not in stop_words]

def lemmatize_words(words):
    """Lemmatize every token with the module-level `lemmatizer`.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def preprocess_text(text):
    """Run the full preprocessing pipeline on one document.

    The text is tokenized and the token list is then passed, in order,
    through punctuation removal, stop-word filtering and lemmatization.

    Returns:
        The list of lemmatized tokens.
    """
    tokens = tokenize(text)  # Step 1: Tokenization
    pipeline = (
        remove_punctuation,  # Step 2: Remove Punctuation
        filter_stop_words,   # Step 3: Remove Stop Words
        lemmatize_words,     # Step 4: Lemmatization
    )
    for step in pipeline:
        tokens = step(tokens)
    return tokens

# Apply preprocessing to each text entry.
# The result (one token list per row) is stored in a new 'processed_text' column.
# NOTE(review): this processes every row of the dataset, while the indexing
# step further down only reads the first 1000 rows.
df['processed_text'] = df['text'].apply(preprocess_text)

# Preview the first rows of the processed DataFrame (identifier columns plus the token lists).
print(df[['URI', 'name', 'processed_text']].head())

## Βήμα 3: Ευρετήριο (Indexing)
### α. Δημιουργία inverted index

In [None]:
# Number of documents (taken from the top of the dataset) that go into the index.
N_DOCS = 1000

# corpus maps every document URI to a {token: number of occurrences} dictionary.
corpus = {}

# Walk the URI and token-list columns in parallel (both cut to the first N_DOCS
# rows by position) and tally each document's tokens in a single pass with
# Counter, instead of calling list.count() once per token occurrence.
for uri, tokens in zip(df['URI'][:N_DOCS], df['processed_text'][:N_DOCS]):
    corpus[uri] = dict(Counter(tokens))

# Turn the nested dictionary into a document-term matrix:
# one row per URI, one column per token, 0 where a token does not occur.
# NOTE: this rebinds `df`, so the preprocessing cell cannot be re-run
# afterwards without reloading the dataset first.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

# Show only a small preview of the (very wide) matrix.
df.head()


### β. Αποθήκευση του Inverted Index σε .csv αρχείο

In [76]:
# Save the dataframe to a CSV file. index=True also writes the row labels
# as the first column; the reload step below uses that first column as the index.
# NOTE(review): the 'out/' directory presumably has to exist already — confirm.
df.to_csv('out/results.csv', index = True)

Μετατροπή του .csv αρχείου σε dictionary

In [None]:
# Reload the saved index from disk.
df = pd.read_csv('out/results.csv')
# The first CSV column holds the row labels written by to_csv; use it as the index again.
df = df.set_index(df.columns[0])

# Nested dictionary keyed by row label: {URI: {term: frequency}}
data_dict = df.to_dict(orient='index')

# Preview the first 5 entries of the dictionary
preview = list(data_dict.items())[:5]
for uri, terms in preview:
    print(f"Document URI: {uri}, Terms: {terms}")

## Βήμα 4: Μηχανή Αναζήτησης
### α. Επεξεργασία Ερωτήματος (Query Processing)

In [78]:
def search_query_dict(data_dict, query):
    """Evaluate a Boolean query against the index and return the matching URIs.

    The query has the form ``term (OPERATOR term)*`` where OPERATOR is one
    of AND, OR, NOT (case-insensitive; NOT means set difference). Operators
    are applied strictly left to right, without precedence. Terms are
    lower-cased before the lookup.

    A term that occurs in no document has an empty result set: it empties
    an AND, and leaves OR / NOT unchanged. A message is printed for such
    terms, for unknown operators and for a dangling token at the end of the
    query; none of these raise.

    Args:
        data_dict: Mapping ``{uri: {term: frequency}}``.
        query: The Boolean query string.

    Returns:
        set: URIs of the documents that satisfy the query (possibly empty).
    """
    tokens = query.split()  # Split the query into separate tokens
    print("Tokenized query: ", tokens)

    if not tokens:
        print("Empty query.")
        return set()

    def postings(term):
        """Return the URIs of the documents in which `term` has a frequency > 0."""
        uris = {uri for uri, terms in data_dict.items() if terms.get(term, 0) > 0}
        # Look at every document (not only the first one) before reporting the
        # term as unknown, so the check also works for sparse term dictionaries
        # and for an empty index.
        if not uris and not any(term in terms for terms in data_dict.values()):
            print(f"Term '{term}' not found in the documents.")
        return uris

    combined_results = postings(tokens[0].lower())

    # Process the remaining "<operator> <term>" pairs from left to right.
    for i in range(1, len(tokens), 2):
        if i + 1 >= len(tokens):
            # Dangling token without a term after it (e.g. "jazz AND").
            print(f"Ignoring trailing token '{tokens[i]}': expected '<operator> <term>'.")
            break

        operator = tokens[i].upper()
        term_uris = postings(tokens[i + 1].lower())

        if operator == "AND":
            combined_results &= term_uris  # Intersection for AND
        elif operator == "OR":
            combined_results |= term_uris  # Union for OR
        elif operator == "NOT":
            combined_results -= term_uris  # Difference for NOT
        else:
            print(f"Unknown operator '{tokens[i]}' was ignored.")

    return combined_results

    


**Εκτέλεση Boolean Query και εμφάνιση πλήθους αποτελεσμάτων (χωρίς κατάταξη)**

In [None]:
# Test the search_query_dict function with a boolean query (without ranking).
# Only the number of matching documents is reported here.
query = "musician AND jazz"
matching_uris = search_query_dict(data_dict, query)
print("Number of matching URIs: ", len(matching_uris))

### β. Κατάταξη αποτελεσμάτων

In [None]:
def search_query_dict_ranked(data_dict, query):
    """Run a Boolean query and rank the matching documents by term frequency.

    The matching documents come from `search_query_dict`. A document's score
    is the sum of the frequencies of the query terms in that document. Only
    the tokens at term positions (every second token, starting with the
    first) are scored, so operator words never count as terms and terms
    containing non-alphanumeric characters are not skipped.

    The ranking is printed as a table and returned.

    Args:
        data_dict: Mapping ``{uri: {term: frequency}}``.
        query: Boolean query of the form ``term (OPERATOR term)*``.

    Returns:
        list: Matching URIs, highest score first; ties are ordered by URI so
        the ranking is the same on every run.
    """
    combined_results = search_query_dict(data_dict, query)

    # Tokens alternate term, operator, term, ... so the terms are at even positions.
    query_terms = [token.lower() for token in query.split()[0::2]]

    # Dictionary to store the scores
    scores = {
        uri: sum(data_dict[uri].get(term, 0) for term in query_terms)
        for uri in combined_results
    }

    # Sort by score (descending); the URI breaks ties because iterating a set
    # of strings has no stable order between runs.
    ranked_results = sorted(combined_results, key=lambda uri: (-scores[uri], str(uri)))

    # Create a user-friendly formatted output
    print("\nSearch Results:")
    print("-" * 40)
    if not ranked_results:
        print("No matching documents found.")
    else:
        print("{:<6} {:<20} {:<10}".format("Rank", "Document", "Score"))
        print("-" * 40)
        for rank, uri in enumerate(ranked_results, start=1):
            print("{:<6} {:<20} {:<10}".format(rank, uri, scores[uri]))
        print("-" * 40)

    # Return the ranked list of matching URIs
    return ranked_results


In [None]:
import math
from collections import defaultdict

# Compute TF-IDF weights
def compute_tf_idf(data_dict):
    """Compute a TF-IDF weight for every (document, term) pair.

    tf is the raw frequency stored in `data_dict`; idf is
    ``log(N / (1 + df))`` where N is the number of documents and df is the
    number of documents in which the term has a frequency > 0. With this
    formula the idf is zero or negative for a term that occurs in at least
    N - 1 documents.

    Args:
        data_dict: Mapping ``{uri: {term: frequency}}``.

    Returns:
        dict: ``{uri: {term: tf * idf}}`` with the same keys as `data_dict`.
    """
    n_docs = len(data_dict)

    # Document frequency: in how many documents does each term actually occur?
    doc_freq = defaultdict(int)
    for term_freqs in data_dict.values():
        for term, count in term_freqs.items():
            if count > 0:
                doc_freq[term] += 1

    return {
        uri: {
            term: count * math.log(n_docs / (1 + doc_freq[term]))
            for term, count in term_freqs.items()
        }
        for uri, term_freqs in data_dict.items()
    }


# Perform Boolean retrieval
def boolean_retrieval(data_dict, query):
    """Evaluate a Boolean query silently and return the matching URIs as a list.

    The query has the form ``term (OPERATOR term)*`` with OPERATOR in
    AND / OR / NOT (case-insensitive; NOT means set difference), applied
    left to right without precedence. Terms are lower-cased before the
    lookup.

    A term that occurs in no document has an empty result set: it empties
    an AND, and leaves OR / NOT unchanged. An unknown operator, or a
    dangling token at the end of the query, is ignored instead of raising.

    Args:
        data_dict: Mapping ``{uri: {term: frequency}}``.
        query: The Boolean query string.

    Returns:
        list: URIs of the matching documents, in no particular order; an
        empty list for an empty query or an empty index.
    """
    tokens = query.split()
    if not tokens:
        return []

    def postings(term):
        """Return the URIs of the documents in which `term` has a frequency > 0."""
        return {uri for uri, terms in data_dict.items() if terms.get(term, 0) > 0}

    combined_results = postings(tokens[0].lower())

    # The stop value len(tokens) - 1 guarantees that tokens[i + 1] exists,
    # so a trailing operator without a term is simply skipped.
    for i in range(1, len(tokens) - 1, 2):
        operator = tokens[i].upper()
        term_uris = postings(tokens[i + 1].lower())

        if operator == "AND":
            combined_results &= term_uris
        elif operator == "OR":
            combined_results |= term_uris
        elif operator == "NOT":
            combined_results -= term_uris

    return list(combined_results)


# Perform Vector Space Model retrieval
def vsm_retrieval(tf_idf_dict, query):
    """Rank documents by cosine similarity between the query and their TF-IDF vectors.

    The query vector holds the number of times each lower-cased query token
    occurs. Documents whose vector has zero magnitude are not scored. Every
    scored document is printed in a ranking table; only documents with a
    score greater than zero are returned.

    Args:
        tf_idf_dict: Mapping ``{uri: {term: weight}}``.
        query: Whitespace-separated query terms.

    Returns:
        dict: ``{uri: score}`` for the scores > 0, highest score first.
    """
    def magnitude(values):
        """Euclidean length of a vector given as an iterable of numbers."""
        return math.sqrt(sum(value**2 for value in values))

    # Build the query vector from raw term counts.
    query_vector = defaultdict(float)
    for token in query.split():
        query_vector[token.lower()] += 1

    # The query does not change between documents, so its length is computed once.
    query_magnitude = magnitude(query_vector.values())

    scores = defaultdict(float)
    for uri, doc_vector in tf_idf_dict.items():
        dot_product = sum(
            weight * doc_vector.get(term, 0) for term, weight in query_vector.items()
        )
        doc_magnitude = magnitude(doc_vector.values())
        if query_magnitude * doc_magnitude != 0:
            scores[uri] = dot_product / (query_magnitude * doc_magnitude)

    sorted_results = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    separator = "-" * 40
    print("\nVector Space Model Results (Cosine Similarity):")
    print(separator)
    print("{:<6} {:<20} {:<10}".format("Rank", "Document", "Score"))
    print(separator)
    rank = 0
    for doc, score in sorted_results:
        rank += 1
        print("{:<6} {:<20} {:<10.4f}".format(rank, doc, score))
    print(separator)

    positive_results = {}
    for doc, score in sorted_results:
        if score > 0:
            positive_results[doc] = score
    return positive_results


# Perform Probabilistic Retrieval
def probabilistic_retrieval(data_dict, query):
    """Score documents by summing a probabilistic IDF weight per matching query term.

    For every lower-cased query term that occurs in at least one document,
    the weight ``log((N - n + 0.5) / (n + 0.5))`` is added to the score of
    each document containing the term (N = number of documents, n = number
    of documents containing the term). The weight is negative when the term
    occurs in more than half of the documents.

    All scored documents are printed in a ranking table; only documents with
    a score greater than zero are returned. An empty index or a query with
    no known terms yields an empty result instead of raising.

    Args:
        data_dict: Mapping ``{uri: {term: frequency}}``.
        query: Whitespace-separated query terms.

    Returns:
        dict: ``{uri: score}`` for the scores > 0, highest score first.
    """
    tokens = query.split()
    scores = defaultdict(float)
    doc_count = len(data_dict)

    for term in tokens:
        term = term.lower()
        # One pass collects the matching documents; the vocabulary is not looked
        # up in the first document only, which would fail on an empty index and
        # miss terms when the per-document dictionaries are sparse.
        matching_uris = [uri for uri, terms in data_dict.items() if terms.get(term, 0) > 0]
        if not matching_uris:
            continue

        term_doc_count = len(matching_uris)
        idf = math.log((doc_count - term_doc_count + 0.5) / (term_doc_count + 0.5))
        for uri in matching_uris:
            scores[uri] += idf

    sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # Print the results as a formatted table
    print("\nProbabilistic Retrieval Results (Scores):")
    print("-" * 40)
    print("{:<6} {:<20} {:<10}".format("Rank", "Document", "Score"))
    print("-" * 40)
    for rank, (doc, score) in enumerate(sorted_results, start=1):
        print("{:<6} {:<20} {:<10.4f}".format(rank, doc, score))
    print("-" * 40)

    return {doc: score for doc, score in sorted_results if score > 0}




## Βήμα 5: Αξιολόγηση Συστήματος

In [None]:
# Evaluate the selected method
def _precision_recall_f1(ground_truth, retrieved):
    """Return (precision, recall, f1_score) of `retrieved` measured against `ground_truth`."""
    # set() over a dict yields its keys, so {doc: score} mappings and plain
    # collections of documents are handled the same way.
    retrieved_docs = set(retrieved)
    relevant_docs = set(ground_truth)

    tp = len(retrieved_docs & relevant_docs)  # true positives  - retrieved and relevant
    fp = len(retrieved_docs - relevant_docs)  # false positives - retrieved but not relevant
    fn = len(relevant_docs - retrieved_docs)  # false negatives - relevant but not retrieved

    # Every ratio falls back to 0 when its denominator is 0.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1_score


def _report_metrics(precision, recall, f1_score):
    """Print the three metrics with three decimals."""
    print("\nEvaluation Metrics:")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-Score: {f1_score:.3f}")


def evaluate_results1(ground_truth, retrieved):
    """Print and return precision, recall and F1 for a collection of retrieved documents.

    Args:
        ground_truth: Iterable of the relevant documents.
        retrieved: Iterable of the retrieved documents (e.g. a list or set of URIs).

    Returns:
        tuple: ``(precision, recall, f1_score)``.
    """
    metrics = _precision_recall_f1(ground_truth, retrieved)
    _report_metrics(*metrics)
    return metrics


def evaluate_results(ground_truth, retrieved):
    """Print and return precision, recall and F1 for scored retrieval results.

    Args:
        ground_truth: Iterable of the relevant documents.
        retrieved: Mapping ``{document: score}``; only its keys are used.
            A plain iterable of documents is accepted as well.

    Returns:
        tuple: ``(precision, recall, f1_score)``.
    """
    metrics = _precision_recall_f1(ground_truth, retrieved)
    _report_metrics(*metrics)
    return metrics


def strip_boolean_operators(query):
    """Return `query` without the AND / OR / NOT tokens (matched case-insensitively)."""
    return " ".join(token for token in query.split() if token.upper() not in {"AND", "OR", "NOT"})


query = "musician AND jazz"
# The Boolean search result is used as the set of relevant documents for the evaluation.
ground_truth = search_query_dict(data_dict, query)

# Show option to the user
print("\nChoose Retrieval Method:")
print("1. Boolean Retrieval")
print("2. Vector Space Model")
print("3. Probabilistic Retrieval")
try:
    choice = int(input("Enter your choice (1/2/3): "))
except ValueError:
    # Non-numeric input ends up in the "Invalid choice!" branch instead of raising.
    choice = None


if choice == 1:
    results = boolean_retrieval(data_dict, query)
    print("\nBoolean Retrieval Results:", results)
    evaluate_results1(ground_truth, results)
elif choice == 2:
    # Remove logical operators for VSM
    vsm_query = strip_boolean_operators(query)
    tf_idf_dict = compute_tf_idf(data_dict)
    results = vsm_retrieval(tf_idf_dict, vsm_query)
    evaluate_results(ground_truth, results)
elif choice == 3:
    # Remove logical operators for Probabilistic Retrieval
    prob_query = strip_boolean_operators(query)
    results = probabilistic_retrieval(data_dict, prob_query)
    evaluate_results(ground_truth, results)
else:
    print("Invalid choice!")
    results = []

