In [1]:
import os, glob
from pathlib import Path
from tqdm.notebook import tqdm
tqdm.pandas()

import requests

from collections import Counter

import numpy as np
import pandas as pd

# ConceptNet - Extracting the Part related to OKVQA Questions

This notebook extracts the part of the ConceptNet knowledge graph that relates to the OKVQA question tokens and to the objects detected in the OKVQA images.

## 1) Call the ConceptNet API

We rely on a function that queries the ConceptNet API and keeps only the data that is relevant for our case. Since the ConceptNet knowledge graph covers a large set of languages, we restrict the results to English by setting a `language_key`. Other implementation details can be seen below.

In [8]:
def _conceptnet_edges_to_frame(edges, word, language_key, same_language):
    """Convert raw ConceptNet edge records into the filtered relation table."""
    # flatten the nested edge records; nested keys become 'start.label', 'rel.label', ...
    df = pd.json_normalize(edges)

    # every column used for filtering or output must be present
    required = ['start.language', 'end.language',
                'start.@type', '@type', 'end.@type',
                'start.label', 'rel.label', 'end.label',
                'surfaceText', 'weight', 'dataset']
    if any(column not in df.columns for column in required):
        return pd.DataFrame()

    # keep edges that start in the requested language and are of form (Node, Edge, Node).
    # NOTE: this is a row-wise mask; `all(dataframe)` would only iterate the (always
    # truthy) column labels and therefore never filter anything.
    keep = (
        df['start.language'].eq(language_key)
        & df['start.@type'].eq('Node')
        & df['@type'].eq('Edge')
        & df['end.@type'].eq('Node')
    )

    # only keep relations whose end node is in the same language as the start node
    # (a missing end language compares as unequal and is dropped)
    if same_language:
        keep &= df['end.language'].eq(df['start.language'])

    if not keep.any():
        return pd.DataFrame()

    # attributes of interest
    aoi = ['start_label', 'relation', 'end_label', 'surfaceText', 'weight', 'dataset']

    # rename() returns a new frame, so the column assignments below do not
    # write into a slice of `df`
    result = df.loc[keep].rename(columns={'start.label': 'start_label',
                                          'rel.label': 'relation',
                                          'end.label': 'end_label',
                                          'start.language': 'start_language',
                                          'end.language': 'end_language'})

    if same_language:
        result['language'] = language_key
        aoi.append('language')
    else:
        aoi += ['start_language', 'end_language']

    result = result[aoi].reset_index(drop=True)
    result['query_word'] = word
    return result


def filter_ConceptNet(word, language_key='en', same_language=True, timeout=30):
    """Query the ConceptNet API for `word` and return its relations as a DataFrame.

    Parameters
    ----------
    word : str
        Concept to look up. Spaces are replaced by underscores in the request
        URI (ConceptNet's URI convention); the returned `query_word` column
        holds `word` unchanged.
    language_key : str, default 'en'
        Language code used in the request URI. Only edges whose start node is
        in this language are kept.
    same_language : bool, default True
        If True, keep only edges whose end node has the same language as the
        start node and add a single `language` column. If False, keep
        cross-language edges and report `start_language` / `end_language`.
    timeout : float, default 30
        Seconds to wait for the API before `requests` raises, so that one
        stalled request cannot hang a long extraction loop forever.

    Returns
    -------
    pandas.DataFrame
        Columns `start_label, relation, end_label, surfaceText, weight, dataset`,
        then `language` (or `start_language, end_language`), then `query_word`.
        An empty DataFrame is returned when the concept does not exist (404)
        or when no edge survives the filtering.

    Raises
    ------
    KeyError
        If the response is neither a 404 error nor contains an 'edges' entry.
    """
    # ConceptNet URIs separate the words of a multi-word concept with underscores
    term = str(word).strip().replace(' ', '_')

    # call the ConceptNet API
    response = requests.get(f'http://api.conceptnet.io/c/{language_key}/{term}', timeout=timeout)
    json_object = response.json()

    # check if the concept exists in the graph
    error = json_object.get('error')
    if isinstance(error, dict) and error.get('status') == 404:
        return pd.DataFrame()

    if 'edges' not in json_object:
        raise KeyError(f"ConceptNet response for {word!r} has no 'edges' (error: {error!r})")

    edges = json_object['edges']
    if not edges:
        return pd.DataFrame()

    return _conceptnet_edges_to_frame(edges, word, language_key, same_language)

Let's try out the filtering on a simple example in Danish - namely, the word 'hej', which means 'hello'.

In [9]:
# Sanity check on a small non-English query: the Danish word 'hej'.
language_key, concept, same_language = 'da', 'hej', True

temp = filter_ConceptNet(word=concept, language_key=language_key, same_language=same_language)
temp

Unnamed: 0,start_label,relation,end_label,surfaceText,weight,dataset,language,query_word
0,hej,Antonym,farvel,,1.0,/d/wiktionary/en,da,hej
1,dav,RelatedTo,hej,,1.0,/d/wiktionary/de,da,hej
2,hej,Synonym,goddag,,1.0,/d/wiktionary/en,da,hej


Yeah, it works out as expected!

## 2) Load OKVQA data

We start out by loading the processed OKVQA dataset that was created in the Question-Answer and Image investigation files.

In [35]:
# The processed OKVQA dataset (from text-investigation) lives in ./data
data_path = Path.cwd() / 'data'
filename = 'OKVQA_object.json'

# all OKVQA-related tables are kept in one dict; 'full' holds the loaded dataset
okvqa = {'full': pd.read_json(data_path / filename)}
okvqa['full'].head()

Unnamed: 0,image_id,question_id,answer_type,question_type,confidence,question_str,question_tokens,all_answers,answers,image_name,feature_path,annotator_agreement_cos,image_objects
0,51606,516065,other,four,3,What is the hairstyle of the blond called?,"[what, is, the, hairstyle, of, the, blond, cal...","[pony tail, pony tail, pony tail, pony tail, p...","[pony tail, pony tail, pony tail, pony tail, p...",COCO_train2014_000000051606,COCO_train2014_000000051606.npy,0.97716,"[person, person, tennis racket, chair]"
1,81721,817215,other,seven,5,How old do you have to be in canada to do this?,"[how, old, do, you, have, to, be, in, canada, ...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18]","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18]",COCO_train2014_000000081721,COCO_train2014_000000081721.npy,,"[person, wine glass, person, wine glass, perso..."
2,480208,4802085,other,four,2,Can you guess the place where the man is playing?,"[can, you, guess, the, place, where, the, man,...","[aspen, aspen, mountain, mountain, mountain, m...","[aspen, aspen, mountain, mountain, mountain, m...",COCO_train2014_000000480208,COCO_train2014_000000480208.npy,0.974979,"[person, backpack, skis]"
3,570618,5706185,other,one,2,Which rail company is named after a town in ne...,"[which, rail, company, is, named, after, a, to...","[santa fe, santa fe, santa fe, santa fe, new e...","[santa fe, santa fe, santa fe, santa fe, new e...",COCO_train2014_000000570618,COCO_train2014_000000570618.npy,0.960305,[train]
4,478903,4789035,other,seven,2,Is the boy swimming or doing another water act...,"[is, the, boy, swimming, or, doing, another, w...","[another activity, another activity, another a...","[another activity, another activity, another a...",COCO_train2014_000000478903,COCO_train2014_000000478903.npy,0.955901,"[person, surfboard]"


## 3) Linking ConceptNet to OKVQA questions

Next, we aim at filtering ConceptNet based on the question-input from the OKVQA dataset. Since we already have a tokenized version of all questions, we start by concatenating these into a single list of unique tokens.

In [13]:
# Flatten the per-question token lists into one long Series.
# explode() replaces the much slower apply(pd.Series).stack(), which builds one
# Series per row; dropna() removes the NaN that explode() emits for empty lists,
# matching what stack() dropped before.
all_question_tokens = (
    okvqa['full']['question_tokens']
    .explode()
    .dropna()
    .reset_index(drop=True)
)

# unique() keeps first-appearance order, like the keys of a Counter would
unique_question_tokens = all_question_tokens.unique().tolist()

In [14]:
# preview the first ten unique question tokens
unique_question_tokens[:10]

['what', 'is', 'the', 'hairstyle', 'of', 'blond', 'called', 'how', 'old', 'do']

Now - using these tokens as input for the ConceptNet-filtering function - we extract relevant concepts and their relations for the question tokens. This takes a while.

In [16]:
# Collect the per-token results in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
question_frames = []

for token in tqdm(unique_question_tokens):
    temp = filter_ConceptNet(token, language_key='en', same_language=True)
    # skip tokens without usable relations (no result or an empty frame)
    if temp is not None and not temp.empty:
        question_frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame
okvqa['conceptnet'] = (
    pd.concat(question_frames, ignore_index=True) if question_frames else pd.DataFrame()
)

## 4) Linking ConceptNet to OKVQA image objects

Next, we aim at filtering ConceptNet based on the image-input from the OKVQA dataset. Since we have already run object detection with Faster-RCNN on the images, we start by concatenating the detected objects into a single list of unique objects.

In [24]:
# Flatten the per-image object lists into one long Series.
# explode() replaces the much slower apply(pd.Series).stack(), which builds one
# Series per row; dropna() removes the NaN that explode() emits for empty lists,
# matching what stack() dropped before.
all_image_objects = (
    okvqa['full']['image_objects']
    .explode()
    .dropna()
    .reset_index(drop=True)
)

# unique() keeps first-appearance order, like the keys of a Counter would
unique_image_objects = all_image_objects.unique().tolist()

In [25]:
# preview the first ten unique image objects
unique_image_objects[:10]

['person',
 'tennis racket',
 'chair',
 'wine glass',
 'backpack',
 'skis',
 'train',
 'surfboard',
 'tv',
 'bird']

Now - using these objects as input for the ConceptNet-filtering function - we extract relevant concepts and their relations for the image objects.

In [28]:
# Collect the per-object results in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
image_frames = []

for some_object in tqdm(unique_image_objects):
    temp = filter_ConceptNet(some_object, language_key='en', same_language=True)
    # skip objects without usable relations (no result or an empty frame)
    if temp is not None and not temp.empty:
        image_frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame
temp_conceptnet = (
    pd.concat(image_frames, ignore_index=True) if image_frames else pd.DataFrame()
)

HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




Let's add an attribute on the extracted information to determine if the relation is extracted from question tokens or image objects.

In [41]:
# mark every relation with the source it was extracted from
question_relations = okvqa['conceptnet']
question_relations['from_question'] = [True] * len(question_relations)

# same marker for the relations found via detected image objects
temp_conceptnet['from_image'] = [True] * len(temp_conceptnet)

In [72]:
# Outer merge on all shared columns: every relation from either source is kept,
# and rows that agree on every shared column are matched with each other.
okvqa['conceptnet'] = okvqa['conceptnet'].merge(temp_conceptnet, how='outer')

# A relation found via only one source has NaN in the other flag after the merge.
# Comparing against True turns both flags into proper boolean columns (NaN -> False)
# without relying on fillna() silently downcasting object columns.
flag_columns = ['from_question', 'from_image']
okvqa['conceptnet'][flag_columns] = okvqa['conceptnet'][flag_columns].eq(True)

In [73]:
# display the merged relation table (question- and image-derived relations)
okvqa['conceptnet']

Unnamed: 0,start_label,relation,end_label,surfaceText,weight,dataset,language,query_word,from_question,from_image
0,is,HasContext,computing,,2.000000,/d/wiktionary/en,en,is,True,False
1,ye olde,RelatedTo,the,,1.000000,/d/wiktionary/en,en,the,True,False
2,hairstyle,Synonym,hairdo,[[hairstyle]] is a synonym of [[hairdo]],2.000000,/d/wordnet/3.1,en,hairstyle,True,False
3,bingle,RelatedTo,hairstyle,,1.000000,/d/wiktionary/en,en,hairstyle,True,False
4,beehive,RelatedTo,hairstyle,,1.000000,/d/wiktionary/en,en,hairstyle,True,False
...,...,...,...,...,...,...,...,...,...,...
43004,a kite,CapableOf,fly,[[a kite]] can [[fly]],4.472136,/d/conceptnet/4/en,en,kite,False,True
43005,a kite,AtLocation,a toy store,You are likely to find [[a kite]] in [[a toy s...,3.464102,/d/conceptnet/4/en,en,kite,False,True
43006,skateboard,MotivatedByGoal,have fun,You would [[skateboard]] because you want to [...,4.898979,/d/conceptnet/4/en,en,skateboard,False,True
43007,having fun,CausesDesire,skateboard,[[having fun]] would make you want to [[skateb...,2.828427,/d/conceptnet/4/en,en,skateboard,False,True


In [74]:
# save data next to the input dataset, reusing the data directory defined when loading
okvqa['conceptnet'].to_json(data_path / 'okvqa_conceptnet.json')