In [348]:
import pandas as pd
import nltk
import math
import numpy as np


In [349]:
#Inicia os constantes
sizeDF = 8716

In [350]:
#Função para carregar o csv, preencher os espaços NaN e formatar o contéudo que ira ser utilizado
def getCSVFormattedAsDF():
    df = pd.read_csv('estadao_noticias_eleicao.csv')
    df = df.fillna('')
    df["data"] = df["titulo"] + ' ' +  df["subTitulo"] + ' ' + df["conteudo"]
    return df

#Função para carregar o csv do gabarito que será utilizado no experimento
def getGabaritoDF():
    df = pd.read_csv('gabarito.csv')
    return df

In [351]:
"""
 return as informações de tf que serão utilizado durante todo o experimento.
   {"idNoticia1": {"word1": num_ocorrencias}, {"word2": num_ocorrencias}, 
    "idNoticia2": {"word1": num_ocorrencias}, {"word3": num_ocorrencias} ... }
"""
def get_tf():
    tf = {}
    df = getCSVFormattedAsDF()
    sizeDF = len(df["idNoticia"])
    
    for idx in range(sizeDF):
        content = df["data"][idx]
        words = nltk.word_tokenize(content)
        tf[df["idNoticia"][idx]] = {}
        for word in words:
            word = word.lower()
            try:
                tf[df["idNoticia"][idx]][word] = tf[df["idNoticia"][idx]][word] + 1
            except KeyError:
                tf[df["idNoticia"][idx]][word] = 1
            
    return tf

In [352]:
"""
 return o idf de cada palavra
"""
def get_idf(tf):
    tfGeneral = {}
    idf = {}
    for new in tf:
        for word in tf[new]:
            try:
                tfGeneral[word] = tfGeneral[word] + 1
            except KeyError:
                 tfGeneral[word] = 1
    for word in tfGeneral:
        idf[word] = math.log(float(sizeDF)/tfGeneral[word]) 
    return idf

In [353]:
# Retorna o valor dos top5 documentos pela busca binaria, de acordo com a query pesquisada 
def top5Binario(query, tf):
    doc = {}
    for new in tf:
        doc[new] = 0
        sum = 0
        checkIfConjuntive = True
        for palavra in query:
            if (palavra in tf[new]):
                sum += doc[new] + 1
            else:
                checkIfConjuntive = False
        if (checkIfConjuntive):
            doc[new] = True
    sortedDoc = sorted(doc, key=doc.get, reverse=True)
    
    return sortedDoc[0:5]
 
# Retorna o valor dos top5 documentos pelo tf, de acordo com a query pesquisada 
def top5TF(query, tf):
    doc = {}
    for new in tf:
        doc[new] = 0
        sum = 0
        checkIfConjuntive = True
        for palavra in query:
            if (palavra in tf[new]):
                sum += tf[new][palavra]
            else:
                checkIfConjuntive = False
        if (checkIfConjuntive):
            doc[new] += sum
    sortedDoc = sorted(doc, key=doc.get, reverse=True)

    return sortedDoc[0:5]

# Retorna o valor dos top5 documentos pelo idf, de acordo com a query pesquisada 
def top5TFIDF(query, tf, idf):
    doc = {}
    for new in tf:
        doc[new] = 0
        sum = 0
        checkIfConjuntive = True
        for palavra in query:
            if (palavra in tf[new]):
                sum += tf[new][palavra] * idf[palavra]
            else:
                checkIfConjuntive = False
        if (checkIfConjuntive):
            doc[new] += sum
    sortedDoc = sorted(doc, key=doc.get, reverse=True)

    return sortedDoc[0:5]

# Retorna o valor dos top5 documentos pelo idf, de acordo com a query pesquisada 
def top5bm25(query, tf, idf):
    doc = {}
    k = 1.5
    b = 0.75
    for new in tf:
        doc[new] = 0
        sum = 0
        checkIfConjuntive = True
        for palavra in query:
            if (palavra in tf[new]):
                sum += ((tf[new][palavra]*(k+1.0)) / (tf[new][palavra] + k*0.25 + k*b*(())) * idf[palavra] 
            else:
                checkIfConjuntive = False
        if(checkIfConjuntive):
            doc[new] += sum
       
    sortedDoc = sorted(doc, key=doc.get, reverse=True)

    return sortedDoc[0:5]

In [354]:
# Funções comparativas do experimento
def apk(actual, predicted, k=10):
    if (len(predicted)>k):
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if (not actual):
        return 0.0

    return (score / min(len(actual), k))

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [355]:
#Funções para montar as respostas em formato de dicionarios de arrays oriundas do gabarito do experimento
def getGabFormatted(key, idx):
    formattedGab = gabaritoDF[key][idx].replace(",", "")
    formattedGab = formattedGab.replace("[", "")
    formattedGab = formattedGab.replace("]", "")
    return list(map(int, formattedGab.split(' ')))

def getGabResult(): 
    gabaritoDF = getGabaritoDF()
    gabResult = {}
    gabResult["tf"] = []
    gabResult["binario"] = []
    gabResult["tfidf"] = []
    gabResult["bm25"] = []

    for idx in range(len(gabaritoDF)):
        gabResult["tf"].append(getGabFormatted("tf", idx))
        gabResult["binario"].append(getGabFormatted("busca_binaria",idx))
        gabResult["tfidf"].append(getGabFormatted("tfidf",idx))
        gabResult["bm25"].append(getGabFormatted("bm25",idx))
    return gabResult

In [364]:
#Funções para montar as respostas em formato de dicionarios de arrays oriundas das minhas respostas obtidas no experimento
def getMyResultsFormatted(queries, tf, idf):
    myResult = {}
    myResult["tf"] = []
    myResult["binario"] = []
    myResult["tfidf"] = []
    myResult["bm25"] = []

    for query in queries:
        myResult["binario"].append(top5Binario(query, tf)) 
        myResult["tf"].append(top5TF(query, tf))
        myResult["tfidf"].append(top5TFIDF(query, tf, idf))
        myResult["bm25"].append(top5bm25(query, tf, idf))
    return myResult

In [357]:
#Armezana em memoria o tf
tf = get_tf()
#Armezana em memoria o tf
idf = get_idf(tf)
#Queries que foram utilizadas no experimento
queries = [
    ["segundo", "turno"],
    ["lava", "jato"],
    ["projeto", "de", "lei"],
    ["compra", "de", "voto"],
    ["ministério", "público"]
]

In [367]:
myResult = getMyResultsFormatted(queries, tf, idf)
gabResult = getGabResult()

In [366]:
print(mapk(gabResult["tf"], myResult["tf"], k=5))
print(mapk(gabResult["binario"], myResult["binario"], k=5))
print(mapk(gabResult["tfidf"], myResult["tfidf"], k=5))
print(mapk(gabResult["bm25"], myResult["bm25"], k=5))


1.0
0.24
0.7606666666666666
0.0


In [314]:
print (myResult["bm25"])

[[1, 7, 13, 26, 69], [27, 81, 92, 95, 98], [10, 25, 182, 263, 264], [82, 553, 748, 854, 1074], [7, 15, 21, 27, 38]]


In [346]:
print (myResult["tfidf"])

[[2744, 2112, 7672, 1235, 2388], [163, 353, 2807, 127, 359], [7017, 2853, 7, 2232, 3171], [2047, 7017, 7343, 7293, 5129], [6798, 8018, 6244, 6965, 6550]]
