<a href="https://colab.research.google.com/github/DavidMedeiros/lab5ri/blob/master/Lab5_RecInfo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Laboratório 5 - Expansão de Consultas
## David de Medeiros Souza

---


In [0]:
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
import collections
from collections import Counter,OrderedDict
import csv
from tabulate import tabulate

import time
import heapq as hp

import nltk
from nltk.stem import RSLPStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Fetch the NLTK resources for this notebook (each call is a no-op download
# when the package is already up to date, as the captured output shows).
nltk.download('rslp')  # presumably the data behind RSLPStemmer, imported above
nltk.download('stopwords')  # corpus read later via stopwords.words("portuguese")
# NOTE(review): 'punkt' and 'averaged_perceptron_tagger' are not referenced in
# the visible cells — confirm they are still needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Funções Auxiliares

In [0]:
"""
  Verifica se há numeros na string 
"""
def hasNumbers(inputString):
  """Return True when `inputString` contains at least one digit character."""
  digit_match = re.search(r'\d', inputString)
  return digit_match is not None

""" 
  Retorna os documentos do indice como parametro
"""
def get_documents(index):
  """Return the distinct document ids referenced by an inverted index.

  Args:
    index: mapping of term -> postings mapping (document id -> frequency).

  Returns:
    A list with each document id exactly once, in no particular order.
  """
  # A set comprehension de-duplicates ids shared by several terms.
  return list({document for postings in index.values() for document in postings})

""" 
  Retorna a quantidade de documentos em que o termo aparece
"""
def num_of_docs_of_term(term):
  """Return how many documents contain `term`.

  Reads the notebook-level inverted index `index`; raises KeyError when the
  term is not a key of that index.
  """
  postings = index[term]
  return len(postings)

""" 
  Retorna a quantidade total de documentos
"""
def num_of_docs():
  """Return the number of documents in the collection.

  Here `csv` is the notebook-level DataFrame loaded from the dataset (it
  rebinds the name of the `csv` module); the count is taken over its
  'text' column.
  """
  text_column = csv['text']
  return text_column.count()

""" 
  Retorna a quantidade de documentos que o term_a E o term_b aparecem
"""
def num_docs_intercessions(term_a,term_b):
  """Return how many documents contain both `term_a` and `term_b`.

  Both terms are looked up in the notebook-level inverted index `index`;
  a term missing from it raises KeyError.
  """
  postings_a = index[term_a]
  postings_b = index[term_b]
  # Count the document ids of term_a that also appear under term_b.
  return sum(1 for doc in postings_a if doc in postings_b)

""" 
  Retorna o valor de Mutual Information (MIM) para o term_a e term_b
""" 
def mutual_information(term_a, term_b):
  """Return the Mutual Information Measure (MIM) for a pair of terms.

  Computed as n_ab / (n_a * n_b), where n_a and n_b are the document counts
  of each term and n_ab is the number of documents containing both.
  """
  docs_a = num_of_docs_of_term(term_a)
  docs_b = num_of_docs_of_term(term_b)
  docs_both = num_docs_intercessions(term_a, term_b)

  denominator = docs_a * docs_b
  return docs_both / denominator


""" 
  Retorna o valor de Expected Mutual Information (EMIM) para o term_a e term_b
""" 
def expected_mutual_information(term_a, term_b):
  """Return the Expected Mutual Information Measure (EMIM) for two terms.

  Computed as n_ab * log(N * n_ab / (n_a * n_b)), where N is the collection
  size. Returns 0 when the argument of the logarithm is zero (for example
  when the terms never co-occur), avoiding log(0).
  """
  docs_a = num_of_docs_of_term(term_a)
  docs_b = num_of_docs_of_term(term_b)
  docs_both = num_docs_intercessions(term_a, term_b)
  total_docs = num_of_docs()

  ratio = total_docs * (docs_both / (docs_a * docs_b))
  if ratio == 0:
    return 0

  return docs_both * np.log(ratio)

""" 
  Retorna o valor de Chi Square para o term_a e term_b
""" 
def chi_square(term_a, term_b):
  """Return the chi-square association score for a pair of terms.

  Computed as (n_ab - n_a * n_b / N)^2 / (n_a * n_b), where N is the
  collection size.
  """
  docs_a = num_of_docs_of_term(term_a)
  docs_b = num_of_docs_of_term(term_b)
  docs_both = num_docs_intercessions(term_a, term_b)
  total_docs = num_of_docs()

  # Co-occurrence count expected if the two terms were independent.
  expected_both = (1/total_docs) * docs_a * docs_b
  deviation = docs_both - expected_both
  return (deviation ** 2) / (docs_a * docs_b)

""" 
  Retorna o valor de Dice’s coefficient (Dice) para o term_a e term_b
""" 
def dices_coefficient(term_a, term_b):
  """Return the Dice association score for a pair of terms.

  Computed as n_ab / (n_a + n_b). The conventional factor of 2 is omitted;
  it does not change the ordering of the scores.
  """
  docs_a = num_of_docs_of_term(term_a)
  docs_b = num_of_docs_of_term(term_b)
  docs_both = num_docs_intercessions(term_a, term_b)

  denominator = docs_a + docs_b
  return docs_both / denominator

"""
  Retorna a lista ordenada (MIM) para uma determinada query
"""
def get_mim_list_ordered(query):
  """Rank every other indexed term by its MIM score against `query`.

  Returns a list of (term, score) pairs sorted by descending score; terms
  with equal scores keep the iteration order of the index.
  """
  scored = [(candidate, mutual_information(query, candidate))
            for candidate in index if candidate != query]
  scored.sort(key=lambda pair: pair[1], reverse=True)
  return scored

"""
  Retorna a lista ordenada (EMIM) para uma determinada query
"""
def get_emim_list_ordered(query):
  """Rank every other indexed term by its EMIM score against `query`.

  Returns a list of (term, score) pairs sorted by descending score; terms
  with equal scores keep the iteration order of the index.
  """
  scored = [(candidate, expected_mutual_information(query, candidate))
            for candidate in index if candidate != query]
  scored.sort(key=lambda pair: pair[1], reverse=True)
  return scored

"""
  Retorna a lista ordenada (Chi-square) para uma determinada query
"""
def get_chisquare_list_ordered(query):
  """Rank every other indexed term by its chi-square score against `query`.

  Returns a list of (term, score) pairs sorted by descending score; terms
  with equal scores keep the iteration order of the index.
  """
  scored = [(candidate, chi_square(query, candidate))
            for candidate in index if candidate != query]
  scored.sort(key=lambda pair: pair[1], reverse=True)
  return scored

"""
  Retorna a lista ordenada (Dice) para uma determinada query
"""
def get_dice_list_ordered(query):
  """Rank every other indexed term by its Dice score against `query`.

  Returns a list of (term, score) pairs sorted by descending score; terms
  with equal scores keep the iteration order of the index.
  """
  scored = [(candidate, dices_coefficient(query, candidate))
            for candidate in index if candidate != query]
  scored.sort(key=lambda pair: pair[1], reverse=True)
  return scored

"""
  Retorna uma tabela para query informada, com as top-10 palavras
  mais associadas a cada uma delas de acordo com as métricas 
  MIM, EMIM, CHISQUARE e DICE.
"""
def set_table(query):
  """Build a table with the top-10 associated terms per metric for `query`.

  Reads the precomputed rankings from the notebook-level `measure[query]`
  and returns a DataFrame with one column per metric (MIM, EMIM, X2, Dice),
  holding only the terms and not their scores.
  """
  # (column label, key inside measure[query]) in display order.
  columns = (
    ('MIM', 'mim'),
    ('EMIM', 'emim'),
    ('X2', 'chi-square'),
    ('Dice', 'dice'),
  )

  rankings = measure[query]
  table = pandas.DataFrame()
  for label, metric in columns:
    table[label] = [term for term, _ in rankings[metric][0:10]]

  return table.head(10)

"""
  Versão de consulta conjuntiva AND - Documentos por vez
"""
def conj_query(Q, I, k):
  """Run a conjunctive (AND) query and return the top-k scoring documents.

  Args:
    Q: query string with terms separated by single spaces.
    I: inverted index mapping term -> {document id: term frequency}.
    k: maximum number of results to return.

  Returns:
    A list of (document id, score) pairs ordered by descending score (ties
    broken by descending document id). A document qualifies only when it
    appears in the postings of every query term found in the index; its
    score is the sum of those term frequencies.

  Query terms absent from the index are ignored rather than forcing an
  empty result. A query with a single indexed term returns the documents
  containing that term.
  """
  postings_lists = [I[word] for word in Q.split(" ") if word in I]
  if not postings_lists:
    return []

  scores = {}
  hits = {}
  for postings in postings_lists:
    for document, freq in postings.items():
      scores[document] = scores.get(document, 0) + freq
      hits[document] = hits.get(document, 0) + 1

  # Keep only documents present in every postings list (the AND semantics).
  required = len(postings_lists)
  candidates = [(score, document)
                for document, score in scores.items()
                if hits[document] == required]

  return [(document, score) for score, document in hp.nlargest(k, candidates)]

"""
  Retorna a consulta expandida com os top number_of_top_terms de acordo com a métrica EMIM
"""
def get_expanded_query(query, number_of_top_terms):
  """Return `query` expanded with its best EMIM-associated terms.

  The first `number_of_top_terms` terms of the precomputed EMIM ranking in
  the notebook-level `measure[query]` are appended to the query, separated
  by single spaces.
  """
  ranking = measure[query]['emim'][0:number_of_top_terms]
  expansion_terms = [term for term, _ in ranking]
  return " ".join([query] + expansion_terms)

# Construção do índice 



In [0]:
# Load the dataset; its 'text' column is used as the document collection.
dataset_url = 'https://raw.githubusercontent.com/DavidMedeiros/ri_lab_01/master/output/results.csv'
# NOTE: `csv` rebinds the name of the `csv` module imported at the top of the
# notebook. The name is kept because num_of_docs() reads this DataFrame.
csv = pandas.read_csv(dataset_url)
documents = csv['text']

tokens = []
tokens_filtered = []

# Raw string: `\w` is not a valid escape sequence in a normal string literal.
toker = RegexpTokenizer(r'''\w+[-']*\w*''')
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to the word
# list, so re-running this cell without re-importing will fail. The name is
# kept because build_index() filters against it.
stopwords = stopwords.words("portuguese")

# Gather every token of the collection; extend() avoids re-copying the
# growing list on each document.
for document in documents:
  tokens.extend(toker.tokenize(document))

def build_index(documents):
  """Build an inverted index with term frequencies.

  Each document is lower-cased and tokenized with the notebook-level
  tokenizer `toker`. Stopwords, tokens of two characters or fewer, and
  tokens containing digits are discarded.

  Args:
    documents: iterable of document strings.

  Returns:
    A dict mapping token -> {document number: occurrences in that document},
    where document numbers start at 1 and follow the iteration order.
  """
  I = {}
  for n, document in enumerate(documents, start=1):
    T = [token for token in toker.tokenize(document.lower())
         if token not in stopwords and len(token) > 2 and not hasNumbers(token)]

    # Count each token once per document instead of calling T.count() for
    # every occurrence, which is quadratic in the document length.
    for token, ocurrence in Counter(T).items():
      I.setdefault(token, {})[n] = ocurrence

  return I

# Build the inverted index over the documents loaded above.
index = build_index(documents)

# One row per token; the 'ocurrences' column holds that token's postings dict.
data = {'token': list(index.keys()), 'ocurrences': list(index.values())}

df = pandas.DataFrame(data)

# Persist the index as 'index.csv' (path relative to the working directory).
df.to_csv('index.csv')

# Questão 1

Considerando as consultas de um termo somente, utilizadas no laboratório anterior: **"ministro", "justiça", "bolsonaro", "lula" e "tribunal"**, essa questão tem como objetivo a construção de uma tabela para cada consulta, informando as top-10 palavras mais associadas a cada uma delas de acordo com as métricas MIM, EMIM, CHISQUARE e DICE.

Após visualizar as tabelas, acredito que **a métrica que obteve os melhores resultados foi a EMIM**, pois as top-10 palavras são mais promissoras e fazem mais sentido para cada uma das cinco consultas. 


In [0]:
# Single-term queries under evaluation.
queries = ["ministro", "justiça", "bolsonaro", "lula", "tribunal"]

# measure[query][metric] -> list of (term, score) pairs, best first.
measure = {}

for query in queries:
  querie_dict = {
    'mim': get_mim_list_ordered(query),
    'emim': get_emim_list_ordered(query),
    'chi-square': get_chisquare_list_ordered(query),
    'dice': get_dice_list_ordered(query),
  }
  # set_table() reads measure[query], so store the rankings before calling it.
  measure[query] = querie_dict

  df = set_table(query)
  print("TABELA PARA CONSULTA: " + query)
  print(tabulate(df, headers='keys', tablefmt='fancy_grid'))
  print("\n")


TABELA PARA CONSULTA: ministro
╒════╤════════════╤═════════╤════════════╤═════════╕
│    │ MIM        │ EMIM    │ X2         │ Dice    │
╞════╪════════════╪═════════╪════════════╪═════════╡
│  0 │ perfil     │ desta   │ guedes     │ desta   │
├────┼────────────┼─────────┼────────────┼─────────┤
│  1 │ verificada │ federal │ desta      │ federal │
├────┼────────────┼─────────┼────────────┼─────────┤
│  2 │ poucas     │ guedes  │ ricardo    │ sobre   │
├────┼────────────┼─────────┼────────────┼─────────┤
│  3 │ provar     │ israel  │ publicação │ após    │
├────┼────────────┼─────────┼────────────┼─────────┤
│  4 │ postando   │ após    │ quanto     │ vai     │
├────┼────────────┼─────────┼────────────┼─────────┤
│  5 │ calendário │ sobre   │ israel     │ israel  │
├────┼────────────┼─────────┼────────────┼─────────┤
│  6 │ datando    │ ricardo │ rodriguez  │ paulo   │
├────┼────────────┼─────────┼────────────┼─────────┤
│  7 │ provando   │ oficial │ negócios   │ guedes  │
├────┼─────────

# Questão 2

In [0]:
# Per query: one entry for each expansion size, in the order 3, 5, 10.
documents_results = {}
expanded_querys = {}

for top_k in [3, 5, 10]:
  for query in queries:
    expanded_query = get_expanded_query(query, top_k)
    # NOTE: this rebinds the notebook-level `documents` (the text collection)
    # to the retrieved (document, score) list.
    documents = conj_query(expanded_query, index, 10)

    documents_results.setdefault(query, []).append(documents)
    expanded_querys.setdefault(query, []).append(expanded_query)

In [0]:
pandas.options.display.max_colwidth = 160

# Only the first three queries are tabulated: three rows (top 3, 5, 10) each.
data_frame_conj = pandas.DataFrame()
data_frame_conj['Top k'] = ['3','5','10'] * 3
data_frame_conj['Consulta expandida'] = [
  expanded
  for position in range(3)
  for expanded in expanded_querys[queries[position]]
]
data_frame_conj['Documentos'] = [
  retrieved
  for position in range(3)
  for retrieved in documents_results[queries[position]]
]

print(tabulate(data_frame_conj, headers='keys', tablefmt='fancy_grid'))


╒════╤═════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════╕
│    │   Top k │ Consulta expandida                                                                                │ Documentos                                                                                         │
╞════╪═════════╪═══════════════════════════════════════════════════════════════════════════════════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════╡
│  0 │       3 │ ministro desta federal guedes                                                                     │ [(69, 17)]                                                                                         │
├────┼─────────┼───────────────────────────────────────────────────────────────────────────────────────────────────┼────────────