In [78]:
import pandas as pd
import numpy as np
from utils import read_config_file
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import read_config_file, process_string, create_csv_file
import xml.etree.ElementTree as ET
import re
import string
from numpy import linalg as LA

In [35]:
# Toy three-document corpus used to sanity-check the indexing and TF-IDF code.
corpus = [
  'We are good',
  'We are becoming better',
  'We will be great',
]

In [56]:
print("Computing inverted index...")
# Map every word to the positions of the records it appears in. A position is
# appended once per occurrence, so the list length also carries term frequency.
inverted_index = {}
for position, text in enumerate(corpus):
  for token in re.findall(r'\w+', text):
    inverted_index.setdefault(token, []).append(position)
print(f"Terms in inverted index: {len(inverted_index)}.")

print("Creating inverted index file...")
inverted_index_columns = ["Word", "RecordNum"]

# Postings lists are stored as their string representation, e.g. "[0, 1, 2]".
inverted_index_df = pd.DataFrame(
  data=[[entry, str(postings)] for entry, postings in inverted_index.items()],
  columns=inverted_index_columns,
)
inverted_index_df

Computing inverted index...
Terms in inverted index: 8.
Creating inverted index file...


Unnamed: 0,Word,RecordNum
0,We,"[0, 1, 2]"
1,are,"[0, 1]"
2,good,[0]
3,becoming,[1]
4,better,[1]
5,will,[2]
6,be,[2]
7,great,[2]


In [82]:
# Path of the configuration file handed to read_config_file further down this
# cell; its "leia" entry is used as the inverted index file to read and its
# "escreva" entry is kept as the term-document matrix file path.
config_file = "../config/index.cfg"

def compute_term_document_matrix(inverted_index_df, records_num):
  """Build a TF-IDF weighted term-document matrix with unit-length columns.

  Args:
    inverted_index_df: DataFrame whose "RecordNum" column holds, for each term,
      its postings list serialised as text such as "[00001,00002]". A record
      number that appears several times in one row counts as that many
      occurrences of the term. Spaces and quotes around record numbers are
      tolerated; rows that are not strings contribute no occurrences.
    records_num: iterable of record-number strings. One matrix column is
      produced per element, in iteration order.

  Returns:
    A numpy float array of shape (number of terms, number of records). Each
    entry is tf * idf with idf = ln((1 + N) / (1 + df)) + 1, and every column
    is scaled to unit Euclidean norm. A record without any term keeps an
    all-zero column instead of turning into NaN.
  """
  # Freeze the iteration order so a set (or generator) is walked exactly once.
  records_num = list(records_num)

  # Record number -> column indices (a list, so duplicated ids keep one column each).
  columns_of = {}
  for column, record_num in enumerate(records_num):
    columns_of.setdefault(record_num, []).append(column)

  postings = inverted_index_df["RecordNum"].tolist()
  terms_frequencies = np.zeros((len(postings), len(records_num)), dtype=float)

  # Count exact record-number tokens. Substring/regex counting would wrongly
  # match record "1" inside "11" and would treat ids as regex patterns.
  for row, posting in enumerate(postings):
    if not isinstance(posting, str):
      continue
    for token in posting.strip().strip("[]").split(","):
      token = token.strip().strip("'\"")
      for column in columns_of.get(token, ()):
        terms_frequencies[row, column] += 1

  number_of_documents = terms_frequencies.shape[-1]
  terms_occurence_on_documents = np.sum(terms_frequencies > 0, axis=1)
  terms_idf = np.log((1 + number_of_documents) / (1 + terms_occurence_on_documents)) + 1
  terms_idf = terms_idf.reshape((terms_occurence_on_documents.shape[0], 1))

  term_document_matrix = terms_frequencies * terms_idf

  # Normalise each document column; a zero norm is replaced by 1 so an empty
  # document stays all-zero rather than producing 0/0 = NaN.
  norms = LA.norm(term_document_matrix, axis=0)
  norms[norms == 0] = 1.0
  term_document_matrix = term_document_matrix / norms

  return term_document_matrix

def get_records_num_set(inverted_index_df):
  """Collect the distinct record numbers referenced by the inverted index.

  The "RecordNum" column is expected to hold postings lists serialised as text
  (for example "['00001', '00002']" or "[00001,00002]").

  Side effect: the "RecordNum" column of ``inverted_index_df`` is rewritten in
  place with single quotes and spaces removed, as the original implementation
  did.

  Args:
    inverted_index_df: DataFrame with a string "RecordNum" column.

  Returns:
    A set of record-number strings. Empty postings lists ("[]") and missing
    rows contribute nothing, so the empty string is never returned as an id.
  """
  normalized = (
    inverted_index_df["RecordNum"]
    .str.replace("'", "", regex=False)
    .str.replace(" ", "", regex=False)
  )
  inverted_index_df["RecordNum"] = normalized

  records_num = set()
  # dropna() skips missing rows, which would otherwise be iterated as floats.
  for record_list in normalized.dropna():
    # [1:-1] drops the surrounding brackets; "[]" splits into [""], so the
    # emptiness filter keeps "" from being registered as a record number.
    records_num.update(token for token in record_list[1:-1].split(",") if token)

  return records_num

print("Reading config files...")
config_dict = read_config_file(config_file)
inverted_index_file = config_dict["leia"]
term_document_matrix_file = config_dict["escreva"]

print("Reading inverted index file...")
inverted_index_df = pd.read_csv(inverted_index_file, sep=';')

# Turn the set of record numbers into a sorted list: a set has no defined
# order (string hashing can change it between kernel runs), and recent pandas
# versions refuse a set as `columns`. The same list is used for the matrix
# computation and for the column labels, so the two always line up.
records_num = sorted(get_records_num_set(inverted_index_df))
words = inverted_index_df["Word"].tolist()
print(f"Inverted index dimensions: ({len(words)},{len(records_num)}).")

print("Computing terms-documents matrix from inverted index...")
term_document_matrix = compute_term_document_matrix(inverted_index_df, records_num)

print("Creating terms-documents matrix file...")
term_document_df = pd.DataFrame(data=term_document_matrix, index=words, columns=records_num)
# NOTE: writing the file is currently disabled; only the DataFrame is built.
# term_document_df.to_csv(term_document_matrix_file, sep=";")



Reading config files...
Reading inverted index file...
Inverted index dimensions: (8,3).
Computing terms-documents matrix from inverted index...
Creating terms-documents matrix file...


In [83]:
# Show the inverted index read from file. Its "RecordNum" column has already
# been normalised in place (quotes and spaces removed) by get_records_num_set.
inverted_index_df

Unnamed: 0,Word,RecordNum
0,WE,"[00001,00002,00003]"
1,ARE,"[00001,00002]"
2,GOOD,[00001]
3,BECOMING,[00002]
4,BETTER,[00002]
5,WILL,[00003]
6,BE,[00003]
7,GREAT,[00003]


In [84]:
# Show the term-document matrix: rows are labelled with the index words and
# columns with the record numbers.
term_document_df

Unnamed: 0,00003,00001,00002
WE,0.322745,0.425441,0.345205
ARE,0.0,0.547832,0.444514
GOOD,0.0,0.720333,0.0
BECOMING,0.0,0.0,0.584483
BETTER,0.0,0.0,0.584483
WILL,0.546454,0.0,0.0
BE,0.546454,0.0,0.0
GREAT,0.546454,0.0,0.0


In [77]:
# Reference TF-IDF computed by scikit-learn on the toy corpus, for comparison
# against the hand-built term-document matrix.
tfvec = TfidfVectorizer()
tdf = tfvec.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when present and fall back on older releases.
if hasattr(tfvec, "get_feature_names_out"):
  feature_names = tfvec.get_feature_names_out()
else:
  feature_names = tfvec.get_feature_names()
bow = pd.DataFrame(tdf.toarray(), columns=feature_names)
bow



Unnamed: 0,are,be,becoming,better,good,great,we,will
0,0.547832,0.0,0.0,0.0,0.720333,0.0,0.425441,0.0
1,0.444514,0.0,0.584483,0.584483,0.0,0.0,0.345205,0.0
2,0.0,0.546454,0.0,0.0,0.0,0.546454,0.322745,0.546454
