In [1]:
import pandas as pd
import numpy as np
import json
import os
import utils

from pathlib import Path
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn's plotting defaults for the notebook: 19x11 figure size and font_scale=2
sns.set(rc={'figure.figsize':(19, 11)}, font_scale=2)

# Valid Queries

Select the set of valid queries from which we will sample.

In [2]:
# Directory with one file per valid query. The wikipage ID is parsed from each
# filename: the extension is stripped and the second underscore-separated token
# is converted to an integer.
valid_queries_dir = 'queries/expanded_wikipages/all_queries/minTupleWidth_2_tuplesPerQuery_1/'
valid_wikipage_ids = [
    int(os.path.splitext(query_filename)[0].split('_')[1])
    for query_filename in os.listdir(valid_queries_dir)
]

# Load every filtered query, then keep only the rows whose `wikipage_id` appears in `valid_wikipage_ids`.
# NOTE: `pd.read_pickle` unpickles the file, so only load pickles from a trusted source.
all_queries_df = pd.read_pickle('query_dataframes/expanded_wikipages/filtered_queries/minTupleWidth_all_tuplesPerQuery_all.pickle')
is_valid_query = all_queries_df['wikipage_id'].isin(valid_wikipage_ids)
queries_df = all_queries_df[is_valid_query]
queries_df

Unnamed: 0,wikipage,wikipage_id,num_tables,tables,num_entities,tuple_width,num_tuples,selected_table,selected_row_ids,categories_relevant_wikipages,categories_relevant_tables,navigation_links_relevant_wikipages,navigation_links_relevant_tables,categories_expansion_ratio,navigation_links_expansion_ratio,avg_query_containment
32,https://en.wikipedia.org/wiki/Andre_Norton_Award,32,1,[table-0001-242.json],[80],4.0,11.0,table-0001-242.json,"[4, 6, 7, 10, 13, 14, 28, 30, 31, 33, 36]",51.0,58.0,,,58.0,,0.036120
46,https://en.wikipedia.org/wiki/President_of_Ind...,46,1,[table-0001-319.json],[20],3.0,10.0,table-0001-319.json,"[0, 3, 4, 5, 6, 7, 8, 12, 14, 18]",7.0,7.0,,,7.0,,0.271930
66,https://en.wikipedia.org/wiki/Charlotte_Bobcat...,66,1,[table-0001-460.json],[108],3.0,28.0,table-0001-460.json,"[0, 8, 9, 15, 19, 21, 26, 28, 31, 34, 38, 39, ...",27.0,122.0,,,122.0,,0.026968
68,https://en.wikipedia.org/wiki/List_of_organism...,68,1,[table-0001-469.json],[355],3.0,14.0,table-0001-469.json,"[2, 44, 45, 53, 80, 107, 206, 208, 213, 219, 2...",16.0,37.0,,,37.0,,0.002137
97,https://en.wikipedia.org/wiki/1982_NCAA_Women'...,97,2,"[table-0001-64.json, table-0001-65.json]","[12, 58]",4.0,13.0,table-0001-65.json,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 14, 15]",34.0,39.0,,,19.5,,0.210526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244164,https://en.wikipedia.org/wiki/2004_LPGA_Tour,244164,2,"[table-1653-355.json, table-1653-356.json]","[64, 15]",3.0,10.0,table-1653-355.json,"[4, 6, 7, 8, 10, 16, 17, 24, 26, 30]",68.0,81.0,,,40.5,,0.237500
244174,https://en.wikipedia.org/wiki/1998_NCAA_Women'...,244174,1,[table-1653-409.json],[56],4.0,11.0,table-1653-409.json,"[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 15]",40.0,48.0,,,48.0,,0.206687
244209,https://en.wikipedia.org/wiki/1931_Italian_Gra...,244209,1,[table-1653-615.json],[26],3.0,13.0,table-1653-615.json,"[0, 1, 2, 3, 4, 11, 15, 19, 21, 22, 23, 24, 27]",88.0,129.0,,,129.0,,0.075247
244229,https://en.wikipedia.org/wiki/List_of_World_Aq...,244229,16,"[table-1653-718.json, table-1653-720.json, tab...","[30, 44, 45, 40, 40, 44, 41, 24, 21, 47, 25, 4...",4.0,15.0,table-1653-730.json,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",44.0,240.0,,,15.0,,0.131381


In [3]:
# Keep only the queries that satisfy both conditions:
#   * `categories_expansion_ratio` is strictly greater than 1
#   * `categories_relevant_tables` is at least 20
# Rows where either column is NaN fail the comparison and are dropped as well.
has_expansion = queries_df['categories_expansion_ratio'] > 1
has_enough_relevant_tables = queries_df['categories_relevant_tables'] >= 20
queries_df = queries_df[has_expansion & has_enough_relevant_tables]

queries_df

Unnamed: 0,wikipage,wikipage_id,num_tables,tables,num_entities,tuple_width,num_tuples,selected_table,selected_row_ids,categories_relevant_wikipages,categories_relevant_tables,navigation_links_relevant_wikipages,navigation_links_relevant_tables,categories_expansion_ratio,navigation_links_expansion_ratio,avg_query_containment
32,https://en.wikipedia.org/wiki/Andre_Norton_Award,32,1,[table-0001-242.json],[80],4.0,11.0,table-0001-242.json,"[4, 6, 7, 10, 13, 14, 28, 30, 31, 33, 36]",51.0,58.0,,,58.0,,0.036120
66,https://en.wikipedia.org/wiki/Charlotte_Bobcat...,66,1,[table-0001-460.json],[108],3.0,28.0,table-0001-460.json,"[0, 8, 9, 15, 19, 21, 26, 28, 31, 34, 38, 39, ...",27.0,122.0,,,122.0,,0.026968
68,https://en.wikipedia.org/wiki/List_of_organism...,68,1,[table-0001-469.json],[355],3.0,14.0,table-0001-469.json,"[2, 44, 45, 53, 80, 107, 206, 208, 213, 219, 2...",16.0,37.0,,,37.0,,0.002137
97,https://en.wikipedia.org/wiki/1982_NCAA_Women'...,97,2,"[table-0001-64.json, table-0001-65.json]","[12, 58]",4.0,13.0,table-0001-65.json,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 14, 15]",34.0,39.0,,,19.5,,0.210526
105,https://en.wikipedia.org/wiki/List_of_French_f...,105,2,"[table-0001-693.json, table-0001-695.json]","[65, 83]",3.0,13.0,table-0001-695.json,"[0, 1, 2, 3, 4, 5, 6, 7, 13, 20, 24, 26, 34]",72.0,189.0,,,94.5,,0.013093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244164,https://en.wikipedia.org/wiki/2004_LPGA_Tour,244164,2,"[table-1653-355.json, table-1653-356.json]","[64, 15]",3.0,10.0,table-1653-355.json,"[4, 6, 7, 8, 10, 16, 17, 24, 26, 30]",68.0,81.0,,,40.5,,0.237500
244174,https://en.wikipedia.org/wiki/1998_NCAA_Women'...,244174,1,[table-1653-409.json],[56],4.0,11.0,table-1653-409.json,"[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 15]",40.0,48.0,,,48.0,,0.206687
244209,https://en.wikipedia.org/wiki/1931_Italian_Gra...,244209,1,[table-1653-615.json],[26],3.0,13.0,table-1653-615.json,"[0, 1, 2, 3, 4, 11, 15, 19, 21, 22, 23, 24, 27]",88.0,129.0,,,129.0,,0.075247
244229,https://en.wikipedia.org/wiki/List_of_World_Aq...,244229,16,"[table-1653-718.json, table-1653-720.json, tab...","[30, 44, 45, 40, 40, 44, 41, 24, 21, 47, 25, 4...",4.0,15.0,table-1653-730.json,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",44.0,240.0,,,15.0,,0.131381


# Sample Queries

In [3]:
def update_sampled_query_ids(stratum_df, sampled_query_ids, samples_per_stratum, random_state=1):
    '''
    Samples `samples_per_stratum` rows from `stratum_df` and adds the wikipage IDs of the
    sampled rows to the `sampled_query_ids` set (the set is updated in place, nothing is returned)

    Arguments
    ---------
    stratum_df (pandas.DataFrame): rows of a single stratum; must have a `wikipage_id` column
    sampled_query_ids (set): wikipage IDs sampled so far, extended in place with the new IDs
    samples_per_stratum (int): number of rows to sample (without replacement) from `stratum_df`
    random_state: seed forwarded to `DataFrame.sample` so that the sampling is reproducible

    Raises
    ------
    ValueError: if a sampled wikipage ID is already in `sampled_query_ids`, or the same wikipage ID
        is sampled twice. In that case `sampled_query_ids` is left unchanged.
        `DataFrame.sample` also raises a ValueError when `stratum_df` has fewer than
        `samples_per_stratum` rows.
    '''
    # Forward the caller's seed instead of a hard-coded value so that `random_state` is honoured
    sampled_wikipage_ids = stratum_df.sample(n=samples_per_stratum, random_state=random_state)['wikipage_id'].to_list()

    # Validate the whole sample before touching `sampled_query_ids` so that a failure
    # never leaves the set partially updated
    new_ids = set(sampled_wikipage_ids)
    has_repeated_id = len(new_ids) != len(sampled_wikipage_ids)
    if has_repeated_id or not new_ids.isdisjoint(sampled_query_ids):
        raise ValueError('Sampled wikipage ID already in the sampled_query_ids. This is most likely because the strata are not mutually exclusive.')

    sampled_query_ids.update(new_ids)

In [4]:
# Stratified sampling configuration.
# Strata are the cross product of tuple width (3, 4, 5, >=6) and the number of relevant
# tables (the half-open ranges below, lower bound inclusive, plus an open-ended 150+ bucket).
tuple_widths = [3, 4, 5]
num_relevant_tables_per_query = [[20, 50], [50, 150]]
total_num_candidates = 0
samples_per_stratum = 10
sampled_query_ids = set()

# Strata for the exact tuple widths 3, 4 and 5
for tuple_width in tuple_widths:
    width_df = queries_df.loc[queries_df['tuple_width'] == tuple_width]
    num_relevant = width_df['categories_relevant_tables']

    # Bounded ranges of relevant tables
    for table_range in num_relevant_tables_per_query:
        lower, upper = table_range
        stratum_df = width_df.loc[(num_relevant >= lower) & (num_relevant < upper)]
        print('Tuple Width:', tuple_width, 'number of relevant tables in range:', table_range, 'number of candidate tables:', len(stratum_df))
        total_num_candidates += len(stratum_df)
        update_sampled_query_ids(stratum_df, sampled_query_ids, samples_per_stratum, random_state=1)

    # Open-ended bucket: 150 or more relevant tables
    stratum_df = width_df.loc[num_relevant >= 150]
    print('Tuple Width:', tuple_width, 'number of relevant tables: 150+ number of candidate tables:', len(stratum_df))
    total_num_candidates += len(stratum_df)
    update_sampled_query_ids(stratum_df, sampled_query_ids, samples_per_stratum, random_state=1)


# Strata for tuple widths of 6 or more, split by the same ranges of relevant tables
width_df = queries_df.loc[queries_df['tuple_width'] >= 6]
num_relevant = width_df['categories_relevant_tables']

for table_range in num_relevant_tables_per_query:
    lower, upper = table_range
    stratum_df = width_df.loc[(num_relevant >= lower) & (num_relevant < upper)]
    print('Tuple Width >=6 number of relevant tables in range:', table_range, 'number of candidate tables:', len(stratum_df))
    total_num_candidates += len(stratum_df)
    update_sampled_query_ids(stratum_df, sampled_query_ids, samples_per_stratum, random_state=1)

# Open-ended bucket: 150 or more relevant tables
stratum_df = width_df.loc[num_relevant >= 150]
print('Tuple Width >=6, number of relevant tables: 150+ number of candidate tables:', len(stratum_df))
total_num_candidates += len(stratum_df)
update_sampled_query_ids(stratum_df, sampled_query_ids, samples_per_stratum, random_state=1)

print("Total Number of candidates:", total_num_candidates)

Tuple Width: 3 number of relevant tables in range: [20, 50] number of candidate tables: 1795
Tuple Width: 3 number of relevant tables in range: [50, 150] number of candidate tables: 2557
Tuple Width: 3 number of relevant tables: 150+ number of candidate tables: 465
Tuple Width: 4 number of relevant tables in range: [20, 50] number of candidate tables: 480
Tuple Width: 4 number of relevant tables in range: [50, 150] number of candidate tables: 731
Tuple Width: 4 number of relevant tables: 150+ number of candidate tables: 328
Tuple Width: 5 number of relevant tables in range: [20, 50] number of candidate tables: 78
Tuple Width: 5 number of relevant tables in range: [50, 150] number of candidate tables: 192
Tuple Width: 5 number of relevant tables: 150+ number of candidate tables: 95
Tuple Width >=6 number of relevant tables in range: [20, 50] number of candidate tables: 50
Tuple Width >=6 number of relevant tables in range: [50, 150] number of candidate tables: 57
Tuple Width >=6, number

In [5]:
# Select the rows of `queries_df` whose wikipage ID was sampled, save them as a pickle and display them
is_sampled = queries_df['wikipage_id'].isin(sampled_query_ids)
sampled_queries_df = queries_df.loc[is_sampled]
sampled_queries_df.to_pickle('query_dataframes/expanded_wikipages/sampled_queries/sampled_queries.pickle')
sampled_queries_df

Unnamed: 0,wikipage,wikipage_id,num_tables,tables,num_entities,tuple_width,num_tuples,selected_table,selected_row_ids,categories_relevant_wikipages,categories_relevant_tables,navigation_links_relevant_wikipages,navigation_links_relevant_tables,categories_expansion_ratio,navigation_links_expansion_ratio,avg_query_containment
205,https://en.wikipedia.org/wiki/Australian_feder...,205,1,[table-0002-277.json],[62],3.0,10.0,table-0002-277.json,"[2, 3, 6, 8, 10, 12, 14, 15, 23, 25]",42.0,66.0,,,66.000000,,0.045641
335,https://en.wikipedia.org/wiki/Colorado_statist...,335,1,[table-0002-837.json],[69],3.0,10.0,table-0002-837.json,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",55.0,57.0,,,57.000000,,0.053571
3452,https://en.wikipedia.org/wiki/World_Fencing_Ch...,3452,4,"[table-0022-580.json, table-0022-581.json, tab...","[221, 61, 41, 33]",6.0,26.0,table-0022-580.json,"[13, 20, 34, 39, 41, 43, 44, 49, 50, 54, 55, 5...",44.0,70.0,,,17.500000,,0.043025
5050,https://en.wikipedia.org/wiki/2004_Bradford_Bu...,5050,3,"[table-0033-213.json, table-0033-217.json, tab...","[23, 29, 30]",5.0,20.0,table-0033-219.json,"[0, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18,...",19.0,61.0,,,20.333333,,0.253846
6218,https://en.wikipedia.org/wiki/List_of_Super_Bo...,6218,2,"[table-0041-812.json, table-0041-814.json]","[144, 51]",6.0,32.0,table-0041-812.json,"[0, 1, 2, 3, 4, 5, 8, 9, 11, 12, 14, 17, 19, 2...",16.0,35.0,,,17.500000,,0.162439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239771,https://en.wikipedia.org/wiki/2007_Houston_Ast...,239771,6,"[table-1625-64.json, table-1625-65.json, table...","[40, 30, 41, 41, 48, 38]",4.0,10.0,table-1625-68.json,"[4, 6, 8, 9, 11, 12, 13, 15, 16, 17]",77.0,314.0,,,52.333333,,0.097267
242059,https://en.wikipedia.org/wiki/List_of_Fiesta_B...,242059,2,"[table-1640-699.json, table-1640-701.json]","[53, 18]",4.0,12.0,table-1640-699.json,"[0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 14, 34]",79.0,209.0,,,104.500000,,0.042147
242282,https://en.wikipedia.org/wiki/Super_Prestige_P...,242282,1,[table-1641-979.json],[49],5.0,17.0,table-1641-979.json,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 17, 18, 19...",23.0,46.0,,,46.000000,,0.001905
242991,https://en.wikipedia.org/wiki/List_of_places_o...,242991,2,"[table-1646-858.json, table-1646-859.json]","[136, 68]",4.0,11.0,table-1646-858.json,"[0, 4, 7, 11, 17, 42, 51, 70, 101, 108, 112]",63.0,134.0,,,67.000000,,0.081847


In [7]:
# Look up the sampled query whose wikipage ID is 124800
sampled_queries_df.loc[sampled_queries_df['wikipage_id'] == 124800]

Unnamed: 0,wikipage,wikipage_id,num_tables,tables,num_entities,tuple_width,num_tuples,selected_table,selected_row_ids,categories_relevant_wikipages,categories_relevant_tables,navigation_links_relevant_wikipages,navigation_links_relevant_tables,categories_expansion_ratio,navigation_links_expansion_ratio,avg_query_containment
124800,https://en.wikipedia.org/wiki/List_of_UEFA_Cup...,124800,3,"[table-0852-461.json, table-0852-462.json, tab...","[138, 38, 16]",6.0,13.0,table-0852-461.json,"[12, 13, 22, 33, 34, 44, 45, 49, 50, 51, 53, 5...",32.0,63.0,,,21.0,,0.07472
