# Retrieve sample categories from Wikidata

### Imports

In [None]:
import pandas as pd
import random
import requests
import time

### Define Functions

In [None]:
def batchQuery(batch_size=500, max_id=10000000, timeout=60):
    """Query Wikidata for the classes of a batch of randomly chosen items.

    Draws ``batch_size`` distinct random Q-ids and asks the Wikidata SPARQL
    endpoint for each item's "instance of" (P31) and "subclass of" (P279)
    values, together with labels for the item and for both values.

    Args:
        batch_size: Number of random Q-ids to include in one query.
        max_id: Exclusive upper bound for the numeric part of the Q-ids.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The decoded JSON response. The result rows are found under
        ``data['results']['bindings']``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or larger than the
            number of available ids.
        requests.HTTPError: If the endpoint answers with an error status
            (for example when the request is rate limited).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Item ids start at Q1, so 0 is excluded. random.sample guarantees the
    # ids are distinct, which avoids duplicated rows within one batch.
    random_ids = random.sample(range(1, max_id), batch_size)
    ids_str = ' '.join('wd:Q' + str(random_id) for random_id in random_ids)

    sparql_query = """

                  SELECT ?item ?itemLabel ?class ?classLabel ?subclass ?subclassLabel WHERE {
                    VALUES ?item { """ + ids_str + """ }
                    OPTIONAL {
                      ?item wdt:P31 ?class.
                      }
                    OPTIONAL {
                      ?item wdt:P279 ?subclass.
                      }
                    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                  }
    """
    url = 'https://query.wikidata.org/sparql'

    # Identify the client instead of sending the generic library default.
    # TODO: add contact details (e-mail or project URL) to this string.
    headers = {'User-Agent': 'wikidata-category-sampler/0.1 (Jupyter notebook; python-requests)'}

    r = requests.get(
        url,
        params={'format': 'json', 'query': sparql_query},
        headers=headers,
        timeout=timeout,
    )
    # Fail loudly on 429/5xx responses rather than letting r.json() choke on
    # a non-JSON error page.
    r.raise_for_status()

    return r.json()

### Run batchQuery multiple times to build up sample of entities

In [None]:
%%time

# Build the initial sample from ten batches (one up front, nine in the loop).
# The per-batch frames are collected in a list and concatenated once at the
# end; calling pd.concat inside the loop would re-copy the growing frame on
# every iteration.
batch = batchQuery()
batch_frames = [pd.json_normalize(batch['results']['bindings'])]

for i in range(9):
    print(i)
    batch = batchQuery()
    batch_df = pd.json_normalize(batch['results']['bindings'])
    batch_frames.append(batch_df)

sample_df = pd.concat(batch_frames, axis=0)

### Examine outputs

In [None]:
# Preview the first rows of the combined sample to check the flattened column layout
sample_df.head()

Unnamed: 0,item.type,item.value,class.type,class.value,itemLabel.xml:lang,itemLabel.type,itemLabel.value,classLabel.xml:lang,classLabel.type,classLabel.value,subclass.type,subclass.value,subclassLabel.xml:lang,subclassLabel.type,subclassLabel.value
0,uri,http://www.wikidata.org/entity/Q4700,uri,http://www.wikidata.org/entity/Q5,en,literal,Claude Debussy,en,literal,human,,,,,
1,uri,http://www.wikidata.org/entity/Q11485,uri,http://www.wikidata.org/entity/Q5,en,literal,Jean-Claude Killy,en,literal,human,,,,,
2,uri,http://www.wikidata.org/entity/Q23508,uri,http://www.wikidata.org/entity/Q1302471,en,literal,jeroboam,en,literal,unit of volume,uri,http://www.wikidata.org/entity/Q23490,en,literal,wine bottle
3,uri,http://www.wikidata.org/entity/Q27989,uri,http://www.wikidata.org/entity/Q16521,en,literal,Campanula rotundifolia,en,literal,taxon,,,,,
4,uri,http://www.wikidata.org/entity/Q42300,uri,http://www.wikidata.org/entity/Q747074,en,literal,Comazzo,en,literal,comune of Italy,,,,,


In [None]:
# run batchQuery many more times to scale sample of entities. Add pause to avoid wikidata's rate limits

%%time

limit = 200

for i in range(limit):

    print(i)
    batch = batchQuery()
    batch_df = pd.json_normalize(batch['results']['bindings'])
    sample_df = pd.concat([sample_df, batch_df], axis=0)

    if (i+1) % 10 == 0 and i != (limit-1):
        print(i)
        time.sleep(10)

In [None]:
# Examine scale of output: (number of rows, number of columns) collected so far
sample_df.shape

(218122, 15)

### Clean resultant dataset

In [None]:
# Drop every row that carries a "subclass of" value, i.e. entities that are
# themselves a subclass of another class

is_subclass = sample_df['subclass.value'].notna()
can_df = sample_df[~is_subclass]

In [None]:
# Keep only the rows that actually have a class value

has_class = can_df['class.value'].notna()
can_df = can_df[has_class]

In [None]:
# Keep only the rows whose item label is recorded in English

label_is_english = can_df['itemLabel.xml:lang'].eq('en')
can_df = can_df[label_is_english]

In [None]:
# This is our final sample set: (rows, columns) left after dropping rows with a
# subclass value, rows without a class value, and rows whose label is not English

can_df.shape

(150909, 15)

In [None]:
# Count the sampled rows per class label, then rank the class labels from the
# most to the least frequent

pt = can_df.pivot_table(
    values='classLabel.type',
    columns='classLabel.value',
    aggfunc='count',
).T
pt_sorted = pt.sort_values(by='classLabel.type', ascending=False).reset_index(drop=False)

In [None]:
# Check the top 50 classes by number of sampled instances
# (pt_sorted is already ordered by count, largest first)

pt_sorted.head(50)

### Save results

In [None]:
# Write the ranked class counts and the cleaned sample to the shared Data folder
pt_sorted.to_csv(path_or_buf='../Data/top_first_tier_cats.csv')
can_df.to_csv(path_or_buf='../Data/wikidata_sample_df.csv')