<a href="https://colab.research.google.com/github/DavidMedeiros/lab5ri/blob/master/Lab5_RecInfo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Laboratório 5 - Expansão de Consultas
## David de Medeiros Souza

---


In [144]:
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
import collections
from collections import Counter,OrderedDict
import csv
from tabulate import tabulate

import time
import heapq as hp

import nltk
from nltk.stem import RSLPStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('rslp')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Funções Auxiliares

In [0]:
"""
  Verifica se há numeros na string 
"""
def hasNumbers(inputString):
  return bool(re.search(r'\d', inputString))

""" 
  Retorna a quantidade de documentos que term aparece
"""
def num_of_docs_of_term(term, ranking):
    return len(ranking[term])
"""
  Retorna a quantidade de documentos que o term_a E o term_b aparecem
"""
def num_docs_intercessions(term_a, term_b, ranking):
  docs1 = ranking[term_a]
  docs2 = ranking[term_b]
  counter = 0
  
  for doc in docs1:
    if doc in docs2:
      counter = counter + 1
      
  return counter

""" 
  Retorna o valor de Mutual information (MIM) para o term_a e term_n
""" 
def mutual_information(term_a, term_b, ranking):
  n_a = num_of_docs_of_term(term_a, ranking)
  n_b = num_of_docs_of_term(term_b, ranking)
  n_ab = num_docs_intercessions(term_a, term_b, ranking) 
   
  if (n_a * n_b) == 0:
    return 0
  return ((n_ab) / (n_a * n_b))

""" 
  Retorna o valor de Expected Mutual Information (EMIM) para o term_a e term_n
""" 
def expected_mutual_information(term_a, term_b, ranking, n):
  n_a = num_of_docs_of_term(term_a, ranking)
  n_b = num_of_docs_of_term(term_b, ranking)
  n_ab = num_docs_intercessions(term_a, term_b, ranking)
  
  if((n_a * n_b) == 0): 
    return 0
  
  if (n * ((n_ab) / (n_a * n_b))) == 0:
    return 0
 
  return n_ab * np.log(n * ((n_ab) / (n_a * n_b)))

""" 
  Retorna o valor de Chi Square para o term_a e term_n
""" 
def chi_square(term_a, term_b, ranking, n):
  n_a = num_of_docs_of_term(term_a, ranking)
  n_b = num_of_docs_of_term(term_b, ranking)
  n_ab = num_docs_intercessions(term_a, term_b, ranking)
  
  if (n_a * n_b) == 0:
    return 0
  
  return ((n_ab - (1/n) * n_a * n_b) ** 2) / (n_a * n_b)

""" 
  Retorna o valor de Dice’s coefficient (Dice) para o term_a e term_n
""" 
def dices_coefficient(term_a, term_b, ranking):
  n_a = num_of_docs_of_term(term_a, ranking)
  n_b = num_of_docs_of_term(term_b, ranking)
  n_ab = num_docs_intercessions(term_a, term_b, ranking)
    
  if (n_a + n_b) == 0:
    return 0
  
  return ((n_ab) / (n_a + n_b))

def term_at_time(query, index, k):
  query_terms = query.split()    
  inverted_lists = []
  acumulador = {}

  for term in query_terms:
    if term in index.keys():
      inverted_lists.append(index[term])

  for lst in inverted_lists:
    for token,ocurrence in lst.items():
      d = token
      frequency = ocurrence
      if d in acumulador.keys():
        acumulador[d] = acumulador[d] + frequency
      else:
        acumulador[d] = frequency
  
  R = list(map(lambda elem: (elem[1],elem[0]), acumulador.items()))
  
  hp._heapify_max(R)
  
  top_k = []
  
  for i in range(1,k+1):
    if R != []:
      top = hp._heappop_max(R)
      top_k.append(top)
  
  return [(d, sd) for sd,d in top_k]
          
def top_docs(query, k):
  top_docs = [doc for doc,score in term_at_time(query, index, k)]
  top_docs_ranking = {} 
  
  for word in index:
    top_docs_ranking[word] = []
    for (doc,score) in index[word].items():
      if doc in top_docs:
        top_docs_ranking[word].append(index[word].items())
        
  return top_docs_ranking

# Construção do índice 



In [0]:
dataset_url = 'https://raw.githubusercontent.com/DavidMedeiros/ri_lab_01/master/output/results.csv'
csv = pandas.read_csv(dataset_url)
documents = csv['text']

tokens = []
tokens_filtered= []

toker = RegexpTokenizer('''\w+[-']*\w*''')
stopwords = stopwords.words("portuguese")

for document in documents:
  tokens = tokens + toker.tokenize(document)


def build_index(documents):
  I = {}
  n = 0
  for document in documents:
    n += 1
    T = [token for token in toker.tokenize(document.lower())
         if token not in stopwords and len(token) > 2 and not hasNumbers(token)]

    for token in T:
      if token not in I:
        I[token] = {}
      
      ocurrence = T.count(token)
      if n not in I[token]:
        I[token][n] = ocurrence
      
  return I

index = build_index(documents)

data = {'token': list(index.keys()), 'ocurrences': list(index.values())}

df = pandas.DataFrame(data)

df.to_csv('index.csv')

# Questão 1

Considerando as consultas de um termo somente, utilizadas no laboratório anterior: **"ministro", "justiça", "bolsonaro", "lula" e "tribunal"**, essa questão tem como objetivo a construção de uma tabela para cada consulta, informando as top-10 palavras mais associadas a cada delas de acordo com as métricas MIM, EMIM, CHISQUARE e DICE.

Após visualizar as tabelas, acredito que **a métrica que obteve os melhores resultados foi a EMIM**, pois as top-10 palavras são mais promissoras e fazem mais sentido para cada uma das cinco consultas. 


In [147]:
queries = ["ministro", "justiça", "bolsonaro", "lula", "tribunal"]

def build_metric_list(termo, metric, ranking, k):
  pairs = []
  for posting in index.keys():
    if posting != termo:
      if metric == 'mim':
        metric_value = mutual_information(termo, posting, ranking)
      elif metric == 'emim':
        metric_value = expected_mutual_information(termo, posting, ranking, k)
      elif metric == 'chi-square':
        metric_value = chi_square(termo, posting, ranking, k)
      elif metric == 'dice':
        metric_value = dices_coefficient(termo, posting, ranking)
      pairs.append((posting, metric_value))      
  pairs = sorted(pairs, key = lambda x: x[1], reverse=True)
  return pairs

termos_metricas = {}

k = csv['text'].count()


def build_table(termo, I, k):
  for query in queries:
    metricas = {}
    metricas['mim'] = build_metric_list(query, 'mim', I, k)
    metricas['emim'] = build_metric_list(query, 'emim', I, k)
    metricas['chi-square'] = build_metric_list(query, 'chi-square', I, k)
    metricas['dice'] = build_metric_list(query, 'dice', I, k)
    termos_metricas[query] = metricas

  mim = [k for k,_ in termos_metricas[termo]['mim'][0:10]]
  emim = [k for k,_ in termos_metricas[termo]['emim'][0:10]]
  chi_square = [k for k,_ in termos_metricas[termo]['chi-square'][0:10]]
  dice = [k for k,_ in termos_metricas[termo]['dice'][0:10]]
  data = {'MIM': mim, 'EMIM': emim, 'X²': chi_square, 'Dice': dice}
  return pandas.DataFrame(data)

for query in queries:
  print("TABELA PARA CONSULTA: " + query) 
  print(tabulate(build_table(query, index, k) , headers='keys', tablefmt='fancy_grid'))
  print("\n")


TABELA PARA CONSULTA: ministro
╒════╤════════════╤═════════╤════════════╤═════════╕
│    │ MIM        │ EMIM    │ X²         │ Dice    │
╞════╪════════════╪═════════╪════════════╪═════════╡
│  0 │ perfil     │ desta   │ guedes     │ desta   │
├────┼────────────┼─────────┼────────────┼─────────┤
│  1 │ verificada │ federal │ desta      │ federal │
├────┼────────────┼─────────┼────────────┼─────────┤
│  2 │ poucas     │ guedes  │ ricardo    │ sobre   │
├────┼────────────┼─────────┼────────────┼─────────┤
│  3 │ provar     │ israel  │ publicação │ após    │
├────┼────────────┼─────────┼────────────┼─────────┤
│  4 │ postando   │ após    │ quanto     │ vai     │
├────┼────────────┼─────────┼────────────┼─────────┤
│  5 │ calendário │ sobre   │ israel     │ israel  │
├────┼────────────┼─────────┼────────────┼─────────┤
│  6 │ datando    │ ricardo │ rodriguez  │ paulo   │
├────┼────────────┼─────────┼────────────┼─────────┤
│  7 │ provando   │ oficial │ negócios   │ guedes  │
├────┼─────────

# Questão 2

Expansão para os 
***TOP 3 Elementos***

In [160]:
k = 3
ranking_docs1 = top_docs(queries[0], k)
ranking_docs2 = top_docs(queries[1], k)
ranking_docs3 = top_docs(queries[2], k)
ranking_docs4 = top_docs(queries[3], k)
ranking_docs5 = top_docs(queries[4], k)

table_1 = build_table(query, index, k)
table_top1 = build_table(queries[0], ranking_docs1, k)

print(queries[0])
table_dice = pandas.DataFrame()
table_dice['Dice before Term A Time'] = table_1['EMIM']
table_dice['Dice after Term A Time'] = table_top1['EMIM']
table_dice

ministro


Unnamed: 0,Dice before Term A Time,Dice after Term A Time
0,sergio,justiça
1,perfil,segurança
2,oficial,pública
3,twitter,sergio
4,verificada,moro
5,social,criou
6,poucas,perfil
7,provar,oficial
8,internautas,twitter
9,postando,nesta
