In [None]:
import os
import pandas as pd
from tqdm import tqdm
import logging
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import re

In [None]:
import json 

# JSON file loaded into HTR2_type below; later cells look entries up by QID
# and pass the stored value to get_query as the type filter.
json_file_path = "/home/lamapi/lamAPI/data/Downloads/_HTR2/HTR2_ext_WD_query_type.json"

with open(json_file_path, "r") as file:
    HTR2_type = json.load(file)


# Locations of the table CSVs and of the ground-truth files.
# NOTE(review): cta_file is defined here but not read anywhere in the visible cells.
tables_path = "/home/lamapi/lamAPI/data/Downloads/HardTablesR2/tables/"
cea_file = '/home/lamapi/lamAPI/data/Downloads/HardTablesR2/gt/cea.csv'
cta_file = '/home/lamapi/lamAPI/data/Downloads/HardTablesR2/gt/cta.csv'


# The listing is discarded (it is not the last expression of the cell, so it
# is not displayed); the call only raises if the directory is missing.
os.listdir(tables_path)
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the headerless CEA file. Columns are addressed by position: 0 is joined
# with 1 and 2 into the "<table> <row> <col>" cell key that process_table_file
# looks up, and 3 holds the annotation whose last '/'-segment is used as the QID.
df_cea = pd.read_csv(cea_file, header=None)
df_cea["key"] = df_cea[0] + " " + df_cea[1].astype(str) + " " + df_cea[2].astype(str)
# Column-level key "<table> <col>"; many rows share it, so the dict built from
# it keeps only the annotation of the last row listed for each column.
df_cea["key_col"] = df_cea[0] + " " + df_cea[2].astype(str)
cea_values_dict = dict(zip(df_cea["key_col"].values, df_cea[3].values))

# Cell-level lookups: the set of annotated cell keys and key -> annotation.
cea_keys_set = set(df_cea["key"].values)
cea_values_dict_cell = dict(zip(df_cea["key"].values, df_cea[3].values))

# Function to process a single table file
def process_table_file(table_file, cea_lookup=None):
    """Map the annotated cell values of one table to their QIDs.

    The table is read with ``pd.read_csv`` and every cell is addressed by the
    key ``"<table name> <row + 1> <col>"``, where the table name is the file
    name without its extension. For each data row, the first column whose key
    is present in the lookup is recorded and the rest of the row is skipped.

    Args:
        table_file: Path of the table CSV.
        cea_lookup: Optional mapping from cell key to annotation string. When
            omitted, the notebook-level ``cea_values_dict_cell`` is used, so
            existing single-argument calls behave as before. Passing it
            explicitly lets the function run without notebook state.

    Returns:
        dict mapping cell value -> QID (the text after the last ``/`` of the
        annotation). An empty dict is returned when anything fails; the error
        is logged rather than raised so one bad table does not stop a batch.
    """
    try:
        lookup = cea_values_dict_cell if cea_lookup is None else cea_lookup

        table_name = os.path.splitext(os.path.basename(table_file))[0]
        df = pd.read_csv(table_file)
        cell_to_qid = {}

        n_rows, n_cols = df.shape
        for row in range(n_rows):
            for col in range(n_cols):
                # Keys use row + 1: read_csv consumed the first line as header
                # (presumably the ground truth counts that line as row 0 - verify).
                key = f"{table_name} {row + 1} {col}"
                if key not in lookup:
                    continue
                # Keep only the trailing path segment of the annotation (the QID).
                cell_to_qid[df.iloc[row, col]] = lookup[key].split('/')[-1]
                # Only the first annotated column of each row is kept.
                break

        return cell_to_qid
    except Exception as e:
        logging.error(f"Error processing {table_file}: {e}")
        return {}

# Full paths of every non-hidden entry in the tables directory
table_files = []
for entry in os.listdir(tables_path):
    if entry.startswith('.'):
        continue
    table_files.append(os.path.join(tables_path, entry))

# Walk the tables one after another and merge their results.
# Despite its name, HTR2_id_to_name maps cell value -> QID (that is what
# process_table_file returns); on duplicate cell values the later table wins.
HTR2_id_to_name = {}
for table_file in tqdm(table_files, desc="Processing tables"):
    HTR2_id_to_name.update(process_table_file(table_file))

In [None]:
def get_query(name, value):
    """Build the request parameters for one entity-retrieval lookup.

    Args:
        name: Cell text to search for. It is converted with ``str`` and every
            double quote is replaced by a space before use.
        value: Type filter for the ``extended_WDtypes`` field. A list is used
            as given; a single string is wrapped in a one-element list; with
            ``None`` the type filter is left out and only the name is matched.

    Returns:
        dict of request parameters. ``query`` holds the JSON-encoded bool/must
        query; the remaining entries (token, kg, limit, sort) are constants.
    """
    name = str(name).replace('"', ' ')

    must_clauses = [{"match": {"name": {"query": name, "boost": 2.0}}}]
    if value is not None:
        # Elasticsearch `terms` expects an array of values, so a bare string
        # is promoted to a one-element list.
        if isinstance(value, str):
            value = [value]
        # Hard filtering constraint: at least one of the given types must match.
        must_clauses.append({"terms": {"extended_WDtypes": value}})

    query_dict = {"query": {"bool": {"must": must_clauses}}}

    # Built unconditionally so that a missing type filter no longer leaves
    # the return value undefined.
    params = {
        'name': name,
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'limit': 20,
        'query': json.dumps(query_dict),  # the query travels as a JSON string
        'sort': [
            '{"popularity": {"order": "desc"}}'
        ]
    }

    return params


# Build (params, QID, types) triples for at most 4000 entities that have an
# entry in HTR2_type; entities without one are skipped.
queries = []
for name, id in tqdm(HTR2_id_to_name.items()):
    if id not in HTR2_type:
        continue

    types_list = HTR2_type[id]

    # TODO (translated from the original Italian note): adapt this step if
    # types_list is a list of types.
    queries.append((get_query(name, types_list), id, types_list))

    if len(queries) == 4000:
        break


In [None]:
import aiohttp
import asyncio
import backoff
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio
import json
import requests

# `queries` is expected to be a list of 3-tuples
# [(params1, id1, types1), (params2, id2, types2), ...]: request parameters,
# the expected entity id, and the type value used to build the query.

# Filled by main(): id -> (id, types) for every query whose first lookup missed.
failed_queries = {}
# Entity-retrieval endpoint queried by every lookup.
url = 'http://localhost:5000/lookup/entity-retrieval'

@backoff.on_exception(
    backoff.expo,
    (aiohttp.ClientError, aiohttp.http_exceptions.HttpProcessingError, asyncio.TimeoutError),
    max_tries=10,
    max_time=400
)
async def fetch(session, url, params, headers, semaphore):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    Args:
        session: aiohttp client session used to send the request.
        url: Endpoint to query.
        params: Query-string parameters.
        headers: Request headers.
        semaphore: Held for the whole request to cap concurrency.

    Returns:
        The parsed JSON response, or ``[]`` when checking the status or
        reading the body fails.

    Error handling is split in two:
      * Exceptions raised while entering ``session.get(...)`` happen outside
        the ``try`` block, so they leave the function and the ``backoff``
        decorator retries the call (exponential waits, at most 10 tries or
        400 seconds).
      * Exceptions raised by ``raise_for_status()`` or ``response.json()``
        are caught here, printed, and turned into ``[]``. HTTP error statuses
        are ``aiohttp.ClientError`` subclasses, so they are neither retried
        nor visible to the caller.
    """
    # Explicit timeout object (50 seconds total) instead of a bare number.
    request_timeout = aiohttp.ClientTimeout(total=50)

    async with semaphore:
        async with session.get(url, params=params, headers=headers, timeout=request_timeout) as response:
            try:
                response.raise_for_status()
                return await response.json()
            except asyncio.TimeoutError:
                print(f"Request timed out for params: {params}")
                return []
            except aiohttp.ClientError as e:
                # Include the parameters so the failing request can be identified.
                print(f"ClientError for params {params}: {str(e)}")
                return []
            except Exception as e:
                print(f"Unexpected error for params {params}: {str(e)}")
                return []

async def process_item(session, url, id, headers, params, semaphore, pbar):
    """Run one lookup and score whether the expected entity was returned.

    Args:
        session: aiohttp client session, passed through to ``fetch``.
        url: Endpoint to query.
        id: Entity id expected among the results.
        headers: Request headers.
        params: Query-string parameters for this lookup.
        semaphore: Concurrency limiter, passed through to ``fetch``.
        pbar: Progress bar; advanced by one on a hit or on a 404.

    Returns:
        ``(score, 1)`` when a result with the expected id is found, otherwise
        ``(0, 0)``.

    Raises:
        aiohttp.ClientResponseError: re-raised for any status other than 404.
    """
    try:
        payload = await fetch(session, url, params, headers, semaphore)
    except aiohttp.ClientResponseError as err:
        if err.status != 404:
            raise
        print(f"404 Error: Resource not found for '{id}'")
        pbar.update(1)
        return 0, 0

    # Nothing came back: count it as a miss.
    if not payload:
        return 0, 0

    total = len(payload)

    # First result carrying the expected id, if any.
    hit = next((entry for entry in payload if id == entry.get('id')), None)
    if hit is None:
        return 0, 0

    pbar.update(1)
    score = hit.get('pos_score', 0)
    # NOTE(review): a pos_score of 0 and a missing pos_score both take the
    # 1 / total branch - confirm that this is intended.
    if score:
        return (total - (score * total)) / total, 1
    return 1 / total, 1

def _fuzzy_retry_params(param, limit):
    """Return a copy of ``param`` with ``limit`` set and fuzzy name matching on."""
    retry = dict(param)  # copy: the dicts inside `queries` are shared by every run
    retry['limit'] = limit

    query_dict = json.loads(param['query'])
    if "query" in query_dict and "bool" in query_dict["query"] and "must" in query_dict["query"]["bool"]:
        for condition in query_dict["query"]["bool"]["must"]:
            if "match" in condition and "name" in condition["match"]:
                condition["match"]["name"]["fuzziness"] = "AUTO"

    retry['query'] = json.dumps(query_dict)
    return retry


async def main(queries, url, failed_queries, l):
    """Evaluate every query and report coverage and mean reciprocal rank.

    Each query is first sent unchanged. Queries whose first lookup scores
    ``(0, 0)`` are recorded in ``failed_queries`` and retried once with
    ``limit`` set to ``l`` and fuzzy matching enabled on the name clause.
    ``l`` only affects this retry, not the first pass.

    Args:
        queries: List of ``(params, id, types)`` tuples. The params dicts are
            never modified; the retry works on copies so that runs sharing
            the same list cannot influence each other.
        url: Endpoint to query.
        failed_queries: Dict updated in place with ``id -> (id, types)`` for
            every query that missed on the first pass.
        l: Result limit used for the fuzzy retry.

    Returns:
        ``(coverage, mrr)``: hits divided by the number of queries, and the
        summed scores divided by the number of queries. Hits found by the
        retry count towards both values. ``(0.0, 0.0)`` for empty ``queries``.
    """
    # Nothing to evaluate: avoid dividing by zero further down.
    if not queries:
        print(f"No queries to evaluate (l={l}).")
        return 0.0, 0.0

    headers = {'accept': 'application/json'}
    semaphore = asyncio.Semaphore(50)

    async with aiohttp.ClientSession() as session:
        pbar = tqdm_asyncio(total=len(queries), desc=f"Limit {l}")
        try:
            # First pass: every query as given.
            first_pass = await asyncio.gather(*(
                process_item(session, url, entity_id, headers, param, semaphore, pbar)
                for param, entity_id, _ in queries
            ))
            scores = list(first_pass)

            # Positions of the queries that found nothing.
            missed = [
                index for index, (mrr_increment, count) in enumerate(scores)
                if mrr_increment == 0 and count == 0
            ]
            for index in missed:
                _, entity_id, item_NERtype = queries[index]
                failed_queries[entity_id] = (entity_id, item_NERtype)

            # Second pass through the same async path as the first, so the
            # event loop is never blocked and the request timeout applies.
            retried = await asyncio.gather(*(
                process_item(
                    session, url, queries[index][1], headers,
                    _fuzzy_retry_params(queries[index][0], l), semaphore, pbar,
                )
                for index in missed
            ))
            for index, outcome in zip(missed, retried):
                scores[index] = outcome
        finally:
            pbar.close()

    m_mrr = sum(mrr_increment for mrr_increment, _ in scores)
    cont_el = sum(count for _, count in scores)

    print(f"Coverage of 2T (l={l}): {cont_el / len(queries)}")
    print(f"Measure Reciprocal Rank of 2T (l={l}): {m_mrr / len(queries)}")

    return cont_el / len(queries), m_mrr / len(queries)

if __name__ == "__main__":
    # nest_asyncio patches asyncio so run_until_complete can be called while
    # a loop is already running (as inside a notebook kernel).
    nest_asyncio.apply()
    try:
        trend_dict = {}
        l_values = list(range(1, 100, 5))  # l = 1, 6, 11, ..., 96
        loop = asyncio.get_event_loop()

        # One evaluation per limit value, all scheduled together.
        sweep = asyncio.gather(
            *[main(queries, url, failed_queries, l) for l in l_values]
        )
        results = loop.run_until_complete(sweep)

        # limit -> (coverage, mrr)
        trend_dict.update(
            (l, (coverage, mrr)) for l, (coverage, mrr) in zip(l_values, results)
        )

        print("Trend Dictionary:", trend_dict)

    except RuntimeError as e:
        print(f"Runtime error: {e}")
