In [2014]:
import os
import re
import pandas as pd
import math

We count the total number of files in the corpus

In [2015]:
# Folder holding the corpus; every regular file directly inside it counts as one text.
DIR = "./data/textos"
total_files = sum(
    1 for entry in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, entry))
)
print(total_files)

20


Helper function to flatten a list of lists

In [2016]:
def flatten(xss):
    """Concatenate the items of every inner iterable of ``xss`` into one flat list."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

In [2017]:
# Matches any single character that is neither a word character (\w) nor whitespace (\s).
punctuation_regex = r"[^\w\s]"

Helper functions to normalize and tokenize strings

In [2018]:
def normalize_tokens(tokens: list[str]) -> list[str]:
    """Lower-case every token and remove the characters matched by ``punctuation_regex``.

    Returns a new list with exactly one entry per input token, in the same order.
    """
    cleaned = []
    for raw_token in tokens:
        cleaned.append(re.sub(punctuation_regex, "", raw_token.lower()))
    return cleaned

In [2019]:
def tokenize_input(input: str) -> list[str]:
    """Split ``input`` on whitespace and return the normalized tokens."""
    return normalize_tokens(input.split())

In [2020]:
def tokenize_content() -> list[list[str]]:
    """Tokenize every file of the corpus.

    Reads ``texto-1.txt`` through ``texto-{total_files}.txt`` from ``DIR`` and
    returns one list of normalized tokens per file, in file-number order.
    """
    file_tokens = []
    for i in range(1, total_files + 1):
        with open(f"{DIR}/texto-{i}.txt") as file:
            file_content = file.read()
        file_tokens.append(tokenize_input(file_content))

    return file_tokens

In [2021]:
# One list of normalized tokens per corpus file.
tokens = tokenize_content()

We get all the unique tokens of the corpus

In [2022]:
# Sorted into a list so the vocabulary has a fixed order: a plain set of strings can
# iterate in a different order on every kernel start, which would shuffle the columns
# of the exported matrices between runs.
unique_tokens = sorted(set(flatten(tokens)))

In [2023]:
#print(unique_tokens)

In [2024]:
# Number of distinct tokens found across the corpus.
print(len(unique_tokens))

392


Here we build the first token matrix

Using a dictionary, we store the text name as the key and, as the value, a list with the number of times each unique token appears in that text

In [2025]:
tokens_matrix = {}

for i in range(1, total_files + 1):
    file_name = f"texto-{i}.txt"
    # Read the whole text, then release the file handle before processing.
    with open(f"{DIR}/{file_name}") as file:
        file_content = file.read()
    normalized_words = normalize_tokens(file_content.split())

    # Count every word of the text in a single pass instead of rescanning the
    # text once per vocabulary token.
    word_counts = {}
    for word in normalized_words:
        word_counts[word] = word_counts.get(word, 0) + 1

    # One count per vocabulary token, in the iteration order of unique_tokens
    # (0 when the token does not occur in this text).
    tokens_matrix[file_name] = [word_counts.get(token, 0) for token in unique_tokens]

In [2026]:
#print(tokens_matrix)

We export the resulting matrix as tokens_matrix.csv

In [2027]:
# Rows are texts; the column headers are the vocabulary tokens.
tokens_frame = pd.DataFrame.from_dict(data=tokens_matrix, orient='index')
tokens_frame.to_csv('tokens_matrix.csv', header=list(unique_tokens))

We get the total number of queries

In [2028]:
QUERY_DIR = "./data/querys"
# Count the regular files inside QUERY_DIR (the query folder, not the corpus folder DIR).
total_querys = len([name for name in os.listdir(QUERY_DIR) if os.path.isfile(os.path.join(QUERY_DIR, name))])
print(total_querys)

20


In [2029]:
# Labels "query-1" ... "query-N", used later as column headers.
query_names = [f"query-{i}" for i in range(1, total_querys + 1)]

print(query_names)

['query-1', 'query-2', 'query-3', 'query-4', 'query-5', 'query-6', 'query-7', 'query-8', 'query-9', 'query-10', 'query-11', 'query-12', 'query-13', 'query-14', 'query-15', 'query-16', 'query-17', 'query-18', 'query-19', 'query-20']


Now we build our query matrix

We check if the content of the query is in our unique_tokens list


In [2030]:
query_matrix = {}

for i in range(1, total_querys + 1):
    query_name = f"query-{i}"
    # Read the whole query, then release the file handle before processing.
    with open(f"{QUERY_DIR}/{query_name}.txt") as file:
        file_content = file.read()

    # A set gives constant-time membership checks, so each vocabulary token is
    # tested once instead of being compared against every word of the query.
    query_words = set(normalize_tokens(file_content.split()))

    # Binary vector: 1 when the vocabulary token occurs in the query, 0 otherwise.
    query_matrix[query_name] = [1 if token in query_words else 0 for token in unique_tokens]



In [2031]:
#print(query_matrix)

Export the query matrix as query_matrix.csv

In [2032]:
# Rows are queries; the column headers are the vocabulary tokens.
query_frame = pd.DataFrame.from_dict(data=query_matrix, orient='index')
query_frame.to_csv('query_matrix.csv', header=list(unique_tokens))

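Function to calculate the cosine similarity (referred to as "distance" in this notebook) between two vectors of the same dimension: the dot product of the vectors divided by the product of their magnitudes, so a higher value means a closer match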

In [2033]:
def get_distance(token_vector, query_vector):
    """Return the cosine similarity between two numeric vectors.

    The result is the dot product of the vectors divided by the product of
    their Euclidean norms, so larger values mean the vectors point in more
    similar directions.

    Args:
        token_vector: Sequence of numbers; its length drives the iteration.
        query_vector: Sequence of numbers with at least as many entries as
            ``token_vector``.

    Returns:
        The cosine similarity as a float, or ``0.0`` when either vector has
        zero magnitude (the ratio is undefined there, and a vector with no
        components shares nothing with the other one).
    """
    numerator = 0
    token_module = 0
    query_module = 0
    for i in range(len(token_vector)):
        numerator += token_vector[i] * query_vector[i]
        token_module += token_vector[i] ** 2
        query_module += query_vector[i] ** 2

    denominator = math.sqrt(token_module) * math.sqrt(query_module)
    # An all-zero vector (e.g. a query with no vocabulary words) would
    # otherwise raise ZeroDivisionError.
    if denominator == 0:
        return 0.0

    return numerator / denominator

We build the result matrix with the corresponding distances between every query and every text

In [2034]:
results_matrix = {}

for text_number in range(1, total_files + 1):
    text_name = f"texto-{text_number}.txt"
    text_vector = tokens_matrix[text_name]
    # One score per query, in query-number order.
    results_matrix[text_name] = []
    for query_number in range(1, total_querys + 1):
        score = get_distance(token_vector=text_vector,
                             query_vector=query_matrix[f"query-{query_number}"])
        results_matrix[text_name].append(score)

In [2035]:
#print(results_matrix)

Export the results matrix as results_matrix.csv

In [2036]:
# Rows are texts; the column headers are the query labels.
results_frame = pd.DataFrame.from_dict(data=results_matrix, orient='index')
results_frame.to_csv('results_matrix.csv', header=query_names)

Now we construct the ranking dictionary to rank the 3 most relevant texts for every query

In [2037]:
ranking_dict = {}

for j in range(1, total_querys + 1):
    current_query = f"query-{j}"

    # results_matrix is keyed by text and each value holds one score per query,
    # so the scores of every text against query j are position j - 1 of each row.
    scores_by_text = [
        (results_matrix[f"texto-{i}.txt"][j - 1], i) for i in range(1, total_files + 1)
    ]

    # Highest score first. sorted() is stable, so texts with equal scores keep
    # ascending text-number order. Sorting a copy leaves results_matrix intact,
    # which keeps this cell safe to re-run.
    top_three = sorted(scores_by_text, key=lambda pair: pair[0], reverse=True)[:3]

    ranking_dict[current_query] = [
        f"texto-{text_number}" + " d=" + str(score) for score, text_number in top_three
    ]



In [2038]:
#print(ranking_dict)

Export the ranking dict as ranking.csv

In [2039]:
# Rows are queries; the three columns hold the first, second and third ranked text.
ranking_frame = pd.DataFrame.from_dict(data=ranking_dict, orient='index')
ranking_frame.to_csv('ranking.csv', header=["Primero", "Segundo", "Tercero"])