In [3]:
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay
import numpy as np
import json
import requests
import pandas as pd

In [11]:
# Identifier of the query under evaluation; it names both the query folder and its files.
QUERY_NUMBER = "q4"
# Common path prefix ("../queries/q4/q4"); later cells append suffixes such as "_relevant.txt".
QUERY_PATH = '../queries/' + QUERY_NUMBER + '/' + QUERY_NUMBER

# Baseline Solr request (URL-encoded). Decoded parameters:
#   q    = review:(basketball OR best) OR platform:"Playstation 4"   (q.op=AND, rows=1000)
#   fq   = {!child of="*:* -_nest_path_:*"} title:basketball OR summary:(basketball OR best)
#          OR wikipedia:(basketball OR best) OR genre:basketball
BASE_QUERY_URL = "http://localhost:8983/solr/games/select?fq=%7B!child%20of%3D%22*%3A*%20-_nest_path_%3A*%22%7Dtitle%3Abasketball%20OR%20summary%3A(basketball%20OR%20best)%20OR%20wikipedia%3A(basketball%20OR%20best)%20OR%20genre%3Abasketball&indent=true&q.op=AND&q=review%3A(basketball%20OR%20best)%20OR%20platform%3A%22Playstation%204%22&rows=1000&useParams=&wt=json"
# Boosted Solr request (URL-encoded). Decoded parameters:
#   defType = edismax, qf = "review platform^5", ps = 3, q.op=AND, rows=1000
#   q    = (basketball OR best) OR "Playstation 4"
#   bq   = the same child-of clauses as the baseline fq, with title:basketball^2 and genre:basketball^2
#   fq   = {!child of="*:* -_nest_path_:*"} title:*
BOOSTED_QUERY_URL = "http://localhost:8983/solr/games/select?bq=%7B!child%20of%3D%22*%3A*%20-_nest_path_%3A*%22%7Dtitle%3Abasketball%5E2%20OR%20summary%3A(basketball%20OR%20best)%20OR%20wikipedia%3A(basketball%20OR%20best)%20OR%20genre%3Abasketball%5E2&defType=edismax&fq=%7B!child%20of%3D%22*%3A*%20-_nest_path_%3A*%22%7Dtitle%3A*&indent=true&ps=3&q.op=AND&q=(basketball%20OR%20best)%20OR%20%0A%22Playstation%204%22&qf=review%20platform%5E5&rows=1000&useParams=&wt=json"

#### Executa a base query e guarda os documentos obtidos no ficheiro baseRank.txt

In [12]:
# The query returns review (child) documents, so every hit is mapped back to its parent game.
base_response = requests.get(BASE_QUERY_URL, timeout=30)
base_response.raise_for_status()  # fail loudly on an HTTP error instead of a KeyError on 'response'
base_results_reviews = base_response.json()['response']['docs']
base_gameids = []
base_results = []

for doc in base_results_reviews:
    # Keep only the first 10 distinct games, in the order their reviews were ranked.
    if len(base_gameids) == 10:
        break

    # The first '/'-separated segment of a review id is used as the id of its parent game.
    game_id = doc['id'].split('/')[0]
    if game_id in base_gameids:
        continue

    base_gameids.append(game_id)
    # Fetch the full parent document (with its children) for this game id.
    query_url = "http://localhost:8983/solr/games/select?fl=*%2C%5Bchild%5D&indent=true&q.op=OR&q=id%3A(" + game_id + ')&useParams=&wt=json'
    parent_response = requests.get(query_url, timeout=30)
    parent_response.raise_for_status()
    result = parent_response.json()['response']['docs']
    if not result:
        # Previously surfaced as a bare "list index out of range" on result[0].
        raise LookupError("Solr returned no parent document for game id " + repr(game_id))
    base_results.append(result[0])

# Titles of the selected games, in rank order.
base_ranked_doc = [doc['title'] for doc in base_results]

df_base = pd.DataFrame(base_ranked_doc, columns=['BASE'], index=None)

print(df_base)

latex_table = df_base.to_latex(index=False)

# Explicit UTF-8 so titles with non-ASCII characters are written the same way on every platform.
with open(QUERY_PATH+'_baseRank.txt', 'w', encoding='utf-8') as tf:
    tf.write(latex_table)


                   BASE
0              NBA 2K16
1           NBA Live 19
2               NBA 2K3
3                NBA 07
4           NBA Live 15
5           NBA Live 16
6   ESPN NBA Basketball
7       NBA Playgrounds
8           NBA Live 14
9  inFamous: Second Son


  latex_table = df_base.to_latex(index=False)


In [13]:
# The query returns review (child) documents, so every hit is mapped back to its parent game.
boosted_response = requests.get(BOOSTED_QUERY_URL, timeout=30)
boosted_response.raise_for_status()  # fail loudly on an HTTP error instead of a KeyError on 'response'
boosted_results_reviews = boosted_response.json()['response']['docs']
boosted_gameids = []
boosted_results = []

for doc in boosted_results_reviews:
    # Keep only the first 10 distinct games, in the order their reviews were ranked.
    if len(boosted_gameids) == 10:
        break

    # The first '/'-separated segment of a review id is used as the id of its parent game.
    game_id = doc['id'].split('/')[0]
    if game_id in boosted_gameids:
        continue

    boosted_gameids.append(game_id)
    # Fetch the full parent document (with its children) for this game id.
    query_url = "http://localhost:8983/solr/games/select?fl=*%2C%5Bchild%5D&indent=true&q.op=OR&q=id%3A(" + game_id + ')&useParams=&wt=json'
    parent_response = requests.get(query_url, timeout=30)
    parent_response.raise_for_status()
    result = parent_response.json()['response']['docs']
    if not result:
        # Previously surfaced as a bare "list index out of range" on result[0].
        raise LookupError("Solr returned no parent document for game id " + repr(game_id))
    boosted_results.append(result[0])

# Titles of the selected games, in rank order.
boosted_ranked_doc = [doc['title'] for doc in boosted_results]

df_boosted = pd.DataFrame(boosted_ranked_doc, columns=['BOOSTED'], index=None)

print(df_boosted)

latex_table = df_boosted.to_latex(index=False)

# Explicit UTF-8 so titles with non-ASCII characters are written the same way on every platform.
with open(QUERY_PATH+'_boostedRank.txt', 'w', encoding='utf-8') as tf:
    tf.write(latex_table)

               BOOSTED
0      NBA Playgrounds
1  ESPN NBA Basketball
2             NBA 2K16
3          NBA Live 19
4          NBA Live 15
5          NBA Live 16
6             Disc Jam
7                WRC 7
8        Rocket League
9    Team Sonic Racing


  latex_table = df_boosted.to_latex(index=False)


#### Depois construir o ficheiro com todos os documentos relevantes

In [None]:
# Load the ids judged relevant for this query, one per line.
# The context manager closes the file; blank lines are skipped so they cannot
# inflate len(relevant), which is the denominator of every recall value.
with open(QUERY_PATH+'_relevant.txt') as relevant_file:
    relevant = [line.strip() for line in relevant_file if line.strip()]

print(relevant)

#### Definição das métricas

In [None]:
# Registry of metric functions, keyed by function name.
metrics = {}


def metric(f):
    """Decorator: register `f` in `metrics` under its own name and return the registered function."""
    return metrics.setdefault(f.__name__, f)


@metric
def p10(results, relevant, n=10):
    """Precision at N: share of the first `n` results whose 'id' is in `relevant`.

    The denominator is always `n`, even when fewer than `n` results are given.
    """
    hits = sum(1 for doc in results[:n] if doc['id'] in relevant)
    return hits / n


@metric
def ap(results, relevant):
    """Average Precision over the relevant documents that were retrieved.

    Precision is sampled at each rank holding a relevant document and the
    samples are averaged; returns 0.0 when no relevant document is retrieved.
    """
    hits = 0
    samples = []
    for rank, doc in enumerate(results, start=1):
        if doc['id'] not in relevant:
            continue
        hits += 1
        samples.append(hits / rank)

    return sum(samples) / len(samples) if samples else 0.0


def calculate_metric(key, results, relevant):
    """Evaluate the registered metric named `key` on `results` against `relevant`."""
    scorer = metrics[key]
    return scorer(results, relevant)


# Human-readable labels for the registered metrics.
evaluation_metrics = dict(
    ap='Average Precision',
    p10='Precision at 10 (P@10)',
)

#### Função que calcula a curva de Precision-Recall

In [None]:
def evaluate(results, queryType):
    """Plot the precision-recall curve of a ranked result list and save it as a PDF.

    Reads the notebook globals `relevant` (ids judged relevant) and `QUERY_PATH`
    (output path prefix).

    Args:
        results: ranked documents, best first; each must have an 'id' key.
        queryType: label inserted in the output file name
            (QUERY_PATH + '_' + queryType + '_pr.pdf').

    Raises:
        ValueError: if `relevant` is empty, since recall is then undefined.
    """
    if not relevant:
        raise ValueError("no relevant documents loaded; recall is undefined")

    # Precision and recall after each rank, from one running count of relevant hits.
    hits = 0
    precision_values = []
    recall_values = []
    for rank, doc in enumerate(results, start=1):
        if doc['id'] in relevant:
            hits += 1
        precision_values.append(hits / rank)
        recall_values.append(hits / len(relevant))

    # For a recall level reached at several ranks, the precision at the last such rank is kept.
    precision_recall_match = {k: v for k, v in zip(recall_values, precision_values)}

    # Add the traditional recall steps (0.1, 0.2, ..., 1.0) for a better curve.
    # i / 10 is used because accumulated float steps (e.g. 0.30000000000000004)
    # would never match a measured recall such as 3 / 10.
    recall_values = sorted(set(recall_values) | {i / 10 for i in range(1, 11)})

    # Give every added step a precision value.
    for idx, step in enumerate(recall_values):
        if step in precision_recall_match:
            continue
        if idx > 0:
            # Levels are visited in ascending order, so the previous one already has a value.
            precision_recall_match[step] = precision_recall_match[recall_values[idx-1]]
        else:
            # No lower level exists: borrow from the first measured level above.
            # (Index -1 would wrap around to the highest recall level.)
            next_measured = next(
                (level for level in recall_values[1:] if level in precision_recall_match),
                None,
            )
            if next_measured is None:
                # Nothing was measured at all (empty result list): flat zero curve.
                precision_recall_match[step] = 0.0
            else:
                precision_recall_match[step] = precision_recall_match[next_measured]

    disp = PrecisionRecallDisplay([precision_recall_match.get(r) for r in recall_values], recall_values)
    disp.plot()
    plt.savefig(QUERY_PATH+'_'+queryType+'_pr.pdf')
    

#### Cálculo da curva para a base query

In [None]:
# Plot the precision-recall curve of the base ranking; evaluate() saves it to QUERY_PATH + '_base_pr.pdf'.
evaluate(base_results, 'base')

#### Cálculo da curva para a boosted query

In [None]:
# Plot the precision-recall curve of the boosted ranking; evaluate() saves it to QUERY_PATH + '_boosted_pr.pdf'.
evaluate(boosted_results, 'boosted')

In [None]:
# Side-by-side table of both rankings, each document id flagged Y/N for relevance.
base_document_ids = [doc['id'] for doc in base_results]
base_relevance_column = ['Y' if doc_id in relevant else 'N' for doc_id in base_document_ids]

boosted_document_ids = [doc['id'] for doc in boosted_results]
boosted_relevance_column = ['Y' if doc_id in relevant else 'N' for doc_id in boosted_document_ids]

# zip() stops at the shortest input, which would silently drop rows whenever one
# system returned fewer documents than the other; pad the shorter columns with blanks.
row_count = max(len(base_document_ids), len(boosted_document_ids))
padded_columns = [
    column + [''] * (row_count - len(column))
    for column in (base_document_ids, base_relevance_column, boosted_document_ids, boosted_relevance_column)
]

ranked_documents = list(zip(range(1, row_count + 1), *padded_columns))

df1 = pd.DataFrame(ranked_documents, columns=[('Rank', ''), ('Base System', 'Game'), ('Base System', 'Relevance'), ('Boosted System', 'Game'), ('Boosted System', 'Relevance')])

# Convert the DataFrame to a LaTeX table without an index
latex_table = df1.to_latex(index=False)

# Write the LaTeX table to a file (explicit UTF-8 so the output does not depend on the platform default)
with open(QUERY_PATH+'_ranked_documents.tex', 'w', encoding='utf-8') as tf:
    tf.write(latex_table)

# Print the LaTeX table
print(latex_table)