In [1]:
import pandas as pd
import numpy as np
from utils import read_config_file
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import read_config_file, process_string, create_csv_file
import xml.etree.ElementTree as ET
import re
import string
from numpy import linalg as LA

In [2]:
corpus = ['We are good',
        'We are becoming better',
        'We will be great']

In [6]:
config_file = "../config/index.cfg"

def compute_term_document_matrix(inverted_index_df, records_num):
  terms_frequencies = []

  for record_num in records_num:
    terms_frequencies.append(inverted_index_df["RecordNum"].str.count(record_num))

  terms_frequencies = np.array(terms_frequencies).T

  number_of_documents = terms_frequencies.shape[-1]
  terms_occurence_on_documents = np.sum(np.where(terms_frequencies > 0, 1, 0), axis=1)
  terms_idf = (np.log((1 + number_of_documents)/(1 + terms_occurence_on_documents)) + 1).reshape((terms_occurence_on_documents.shape[0], 1))

  term_document_matrix = terms_frequencies * terms_idf
  term_document_matrix = term_document_matrix / LA.norm(term_document_matrix, axis=0)

  return term_document_matrix

def get_records_num_set(inverted_index_df):
  inverted_index_df["RecordNum"] = inverted_index_df["RecordNum"].str.replace("'", "")
  inverted_index_df["RecordNum"] = inverted_index_df["RecordNum"].str.replace(" ", "")
  records_num = set([record_num for word in inverted_index_df["RecordNum"].str[1:-1].str.split(",").tolist() for record_num in word])

  return records_num

print("Reading config files...")
config_dict = read_config_file(config_file)
inverted_index_file = config_dict["leia"]
term_document_matrix_file = config_dict["escreva"]

print("Reading inverted index file...")
inverted_index_df = pd.read_csv(inverted_index_file, sep=';')

records_num = get_records_num_set(inverted_index_df)
words = inverted_index_df["Word"].tolist()
print(f"Inverted index dimensions: ({len(words)},{len(records_num)}).")

print("Computing terms-documents matrix from inverted index...")
term_document_matrix = compute_term_document_matrix(inverted_index_df, records_num)

print("Creating terms-documents matrix file...")
term_document_df = pd.DataFrame(data=term_document_matrix, index=words, columns=records_num)
# term_document_df.to_csv(term_document_matrix_file, sep=";")



Reading config files...
Reading inverted index file...
Inverted index dimensions: (8,3).
Computing terms-documents matrix from inverted index...
Creating terms-documents matrix file...


In [7]:
inverted_index_df

Unnamed: 0,Word,RecordNum
0,WE,"[00001,00002,00003]"
1,ARE,"[00001,00002]"
2,GOOD,[00001]
3,BECOMING,[00002]
4,BETTER,[00002]
5,WILL,[00003]
6,BE,[00003]
7,GREAT,[00003]


In [84]:
term_document_df

Unnamed: 0,00003,00001,00002
WE,0.322745,0.425441,0.345205
ARE,0.0,0.547832,0.444514
GOOD,0.0,0.720333,0.0
BECOMING,0.0,0.0,0.584483
BETTER,0.0,0.0,0.584483
WILL,0.546454,0.0,0.0
BE,0.546454,0.0,0.0
GREAT,0.546454,0.0,0.0


In [36]:
words = term_document_df.index.values
my_array = np.zeros(8)
print(my_array)
word_index = np.flatnonzero(words == "BEAUTIFUL")
my_array[word_index] = 1
print(my_array)

[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]


In [41]:
def execute_queries_on_index(config_file):
  print("Reading config files...")
  config_dict = read_config_file(config_file)
  term_document_matrix_file = config_dict["modelo"]
  queries_file = config_dict["consultas"]
  queries_results_file = config_dict["resultados"]

  print("Reading model and queries files...")
  term_document_matrix_df = pd.read_csv(term_document_matrix_file, sep=';', index_col=0)
  print(f"terms-documents matrix has the following dimensions: ({len(term_document_matrix_df)},{len(term_document_matrix_df.columns)}).")
  queries_df = pd.read_csv(queries_file, sep=';', index_col=0)
  print(f"There are {len(queries_df)} queries.")

  print("Computing queries docs scores...")
  queries_scored_results_data= get_queries_scored_results_data(term_document_matrix_df, queries_df)
  print("Saving queries results...")
  queries_scored_results_df = pd.DataFrame(data=queries_scored_results_data, columns=["QueryNumber", "Results"])
  return queries_scored_results_df
  # queries_scored_results_df.to_csv(queries_results_file, sep=";", index=False)

  print("Done!")

def get_queries_scored_results_data(term_document_matrix_df, queries_df):
  def get_queries_vectors(query_text):
    number_of_words = len(term_document_matrix_df.index)
    query_vector = np.zeros(number_of_words)

    for word in re.findall(r'\w+', query_text):
      word_index = np.flatnonzero(term_document_matrix_df.index == word)
      query_vector[word_index] = 1

    query_vector = query_vector / LA.norm(query_vector)

    return query_vector

  queries_vectors = queries_df["QueryText"].apply(get_queries_vectors)

  queries_scored_results_data = []
  term_document_matrix = term_document_matrix_df.values
  queries_vectors = queries_vectors.tolist()
  # vectors are normalized, so dot product is the same as cosine similarity
  queries_documents_similarities = np.dot(queries_vectors, term_document_matrix)

  for i, query_documents_similarities in enumerate(queries_documents_similarities):
    query_number = queries_df.index.values[i]
    query_results = []

    for j, query_document_similarity in enumerate(query_documents_similarities):
      if query_document_similarity > 0:
        document_number = term_document_matrix_df.columns[j]
        query_results.append((document_number, query_document_similarity))

    query_results = sorted(query_results, key=lambda tup: tup[1], reverse=True)
    for j, query_result in enumerate(query_results):
      queries_scored_results_data.append([query_number, [j, *query_result]])

  return queries_scored_results_data

queries_scored_results_df = execute_queries_on_index("../config/busca.cfg")
queries_scored_results_df

Reading config files...
Reading model and queries files...
terms-documents matrix has the following dimensions: (8,3).
There are 3 queries.
Computing queries docs scores...
Saving queries results...


Unnamed: 0,QueryNumber,Results
0,1,"[0, 00001, 0.688207721799424]"
1,1,"[1, 00002, 0.5584158923474207]"
2,1,"[2, 00003, 0.22821485436677302]"
3,2,"[0, 00002, 0.5844829010200651]"
4,3,"[0, 00003, 0.546454011634009]"
