Import libraries

In [1]:
import os
import xmltodict
import json
import nltk
from nltk.corpus import wordnet

Global variables

In [2]:
# common filename suffix of the VISA concept files (one file per category)
file_suffix = '_structured_final.us.xml'

# every category to process; each name is combined with file_suffix to build a filename
file_sel_classes = [
    'ANIMALS',
    'APPLIANCES',
    'ARTIFACTS',
    'CLOTHING',
    'CONTAINER',
    'DEVICE',
    'FOOD',
    'HOME',
    'INSTRUMENTS',
    'PLANTS',
    'STRUCTURES',
    'TOOLS',
    'TOYS',
    'VEHICLES',
    'WEAPONS',
]

**Concepts and visible parts extraction**

Filter out the non-visible parts

In [3]:
# Drop every line containing <no_evidence/> (i.e. the non-visible parts) and
# write the remaining lines to a sibling file prefixed with 'new_'.
# A category whose 'new_' file already exists is left untouched.
for file_class in file_sel_classes:
  source_path = '../VISA/' + file_class + file_suffix
  filtered_path = '../VISA/new_' + file_class + file_suffix
  if os.path.isfile(filtered_path):
    continue
  with open(source_path, "r") as oldfile, open(filtered_path, 'w') as newfile:
    newfile.writelines(line for line in oldfile if '<no_evidence/>' not in line)

Record all the concepts belonging to the selected categories

In [5]:
# function to extract a holonym and its associated meronyms
def extract_concept(concept, class_parts):
  """Build a ``{'name': ..., 'parts': [...]}`` record from one parsed concept.

  Args:
    concept: mapping describing a single concept; must contain ``'@name'``.
    class_parts: key of ``concept`` under which the part list is stored.
      The value may be a plain string or a mapping holding the string
      under ``'#text'``.

  Returns:
    A dict with ``'name'`` (the text of ``'@name'`` before the first
    underscore) and ``'parts'`` (one cleaned string per non-blank line), or
    ``None`` when the concept carries no part text under ``class_parts``.

  Raises:
    KeyError: if ``concept`` has no ``'@name'`` entry.
  """
  name = concept['@name'].split('_')[0]

  raw_parts = concept[class_parts] if class_parts in concept else None
  # Dispatch on the value's type rather than on "'#text' in value": that test
  # is a key lookup for mappings but a substring search for strings, so the
  # two cases could be mistaken for each other.
  if isinstance(raw_parts, dict):
    raw_parts = raw_parts.get('#text')
  if not isinstance(raw_parts, str):
    # missing key, empty element, or an element without text
    return None

  # Per line: keep what follows the last 'has_', turn underscores into spaces,
  # then strip any leading digits/spaces.
  parts = [
    line.split('has_')[-1].replace('_', ' ').lstrip('0123456789 ')
    for line in raw_parts.split('\n')
    if line.strip()
  ]
  return {'name': name, 'parts': parts}

def _as_list(node):
  """Return `node` as a new list: [] for None, a copy for a list, [node] otherwise.

  xmltodict yields a single mapping (not a one-item list) when a tag occurs
  only once, so every 'concept' / 'subcategory' value is normalised here.
  """
  if node is None:
    return []
  if isinstance(node, list):
    return list(node)
  return [node]


# Collect one {'name', 'parts'} record per concept across all selected categories.
concept_list = []
for file_class in file_sel_classes:
  # the tag holding the part list depends on the category
  if file_class == 'ANIMALS':
    class_parts = 'anatomy'
  elif file_class in ('FOOD', 'PLANTS'):
    class_parts = 'botany'
  else:
    class_parts = 'parts'

  # the with-statement closes the file; no explicit close() is needed
  with open('../VISA/new_'+file_class+file_suffix, "r") as xml_obj:
    my_dict = xmltodict.parse(xml_obj.read())
  concepts_root = my_dict['concepts']

  # first the categoryless concepts, then the concepts of each subcategory (if any)
  raw_concepts = _as_list(concepts_root['concept'] if 'concept' in concepts_root else None)
  subcategories = _as_list(concepts_root['subcategory'] if 'subcategory' in concepts_root else None)
  for subcategory in subcategories:
    # an empty <subcategory/> parses to None and contributes nothing
    raw_concepts.extend(_as_list((subcategory or {}).get('concept')))

  for concept in raw_concepts:
    concept_parts_dict = extract_concept(concept, class_parts)
    # concepts without any part text are skipped
    if concept_parts_dict is not None:
      concept_list.append(concept_parts_dict)

Setup wordnet

In [None]:
# Fetch the 'wordnet' resource through NLTK's downloader; the cells below query
# it via `nltk.corpus.wordnet`.
nltk.download('wordnet')

Load the Wordnet-ImageNet mappings

In [8]:
# Parse the JSON mapping file into `wn_mappings`. The following cell iterates
# over its values and reads the 'id' and 'label' entries of each one.
with open('../VISA/mapping.json') as json_file: 
    wn_mappings = json.load(json_file)

Check the classes in common with Imagenet

In [9]:
# One record per extracted concept: its name, every WordNet synset found for
# that name, and its list of parts.
visa_label_synsets_list = [
  {
    'label': concept['name'],
    'syns': wordnet.synsets(concept['name']),
    'parts': concept['parts'],
  }
  for concept in concept_list
]

# for each ImageNet class, store its label and the noun synset it maps to
imagenet_label_synset_list = []
for id_label_uri_dict in wn_mappings.values():
  # The text before the first '-' of the 'id' field is parsed as the integer
  # synset offset. It is kept in `synset_offset` (not `id`) so the built-in
  # id() is not shadowed for the rest of the session.
  synset_offset = int(id_label_uri_dict['id'].split('-')[0])
  label = id_label_uri_dict['label']
  syn = wordnet.synset_from_pos_and_offset('n', synset_offset)
  imagenet_label_synset_list.append({'label': label, 'syn': syn})

# Match every ImageNet class (e.g. Arabian camel, ...) against the extracted
# VISA concepts. A class matches a concept when one of the synsets examined at
# the current level is among the concept's synsets. Levels are examined in
# order: the class synset itself (depth 0), its hypernyms (depth 1), then the
# hypernyms of the first of those (depth 2). The search stops at the first
# level that yields at least one match.
imagenet_visa_list = []
for imagenet_label_synset in imagenet_label_synset_list:
  imagenet_label = imagenet_label_synset['label']
  # synsets compared at the current level; starts with the class synset itself
  candidate_synsets = [imagenet_label_synset['syn']]
  h_count = 0
  found = False
  while not found and h_count < 3:
    for visa_label_synsets in visa_label_synsets_list:
      visa_synsets = visa_label_synsets['syns']
      if any(candidate in visa_synsets for candidate in candidate_synsets):
        # Every concept matched at this level is recorded with the level's real
        # depth. (Previously the counter was reset after the first hit, so later
        # hits in the same pass were stored with depth 0 and then won the
        # duplicate removal that keeps the smallest depth.)
        imagenet_visa_list.append({
          'label': imagenet_label,
          'visa_label': visa_label_synsets['label'],
          'parts': visa_label_synsets['parts'],
          'depth': h_count,
        })
        found = True
    # if no associated synset is found, move up to the hypernym(s)
    if not found:
      if not candidate_synsets:
        # ran out of hypernyms before reaching the depth limit
        break
      # only the first synset of the current level is followed upwards
      candidate_synsets = candidate_synsets[0].hypernyms()
      h_count += 1
  if not found:
    imagenet_visa_list.append({'label': imagenet_label, 'visa_label': None, 'parts': None, 'depth': h_count})

# keep only the ImageNet classes that were matched to a VISA concept
found_imagenet_visa_list = [concept for concept in imagenet_visa_list if concept['visa_label'] is not None]

Remove duplicate concepts 

In [10]:
# Keep a single record per ImageNet label: the first one seen, unless a later
# record for the same label has a strictly smaller depth.
nodup_imagenet_visa_dict = {}
for candidate in found_imagenet_visa_list:
  kept = nodup_imagenet_visa_dict.get(candidate['label'])
  if kept is None or candidate['depth'] < kept['depth']:
    nodup_imagenet_visa_dict[candidate['label']] = candidate
found_imagenet_visa_list = [record for record in nodup_imagenet_visa_dict.values()]

In [11]:
#for c in found_imagenet_visa_list:
  #print(c)

Hyper-meronyms and hypo-meronyms

In [12]:
# Among the matching concepts, group them into subcategories for the sake of
# visualization. Each entry is compared for equality with a concept's
# 'visa_label', so spelling matters: 'balloon' and 'tricycle' were previously
# misspelt ('baloon', 'tricyle').
# NOTE(review): assumes the VISA files use the standard spellings - confirm.
sel_labels_dict = {
    # animals
    'ungulates': ['bison', 'buffalo', 'bull', 'calf', 'camel', 'caribou', 'cow', 'deer', 'donkey', 'elk', 'fawn', 'giraffe', 'goat', 'horse', 'lamb', 'moose', 'ox', 'pig', 'pony', 'sheep', 'zebra'],
    'felines': ['cat', 'cheetah', 'cougar', 'leopard', 'lion', 'panther', 'tiger'],
    'reptiles': ['alligator', 'iguana', 'python', 'rattlesnake', 'tortoise', 'turtle'],
    # appliances
    'kitchen appliances': ['blender', 'dishwasher', 'freezer', 'fridge', 'microwave', 'mixer', 'oven', 'stove', 'toaster'],
    # tools
    'tools': ['anchor', 'axe', 'bolts', 'broom', 'brush', 'chisel', 'clamp', 'crowbar', 'drill', 'fork', 'hammer', 'hatchet', 'hoe', 'hook', 'level', 'peg', 'pliers', 'rake', 'scissors',
              'screwdriver', 'screws', 'shovel', 'sledgehammer', 'spade', 'tomahawk', 'tongs', 'wrench'],
    # vehicles
    'aircraft': ['airplane', 'balloon', 'helicopter', 'jet'],
    'wheeled vehicle': ['ambulance', 'bike', 'tricycle', 'trolley', 'truck', 'unicycle', 'van', 'wagon', 'wheelbarrow', 'buggy', 'bus', 'car', 'cart', 'dunebuggy', 'jeep', 'limousine', 'motorcycle',
                        'scooter', 'taxi', 'tractor', 'trailer'],
}

# Define hyper-meronyms and their hypo-meronyms: when a key (hyper-meronym) is
# among a concept's parts, the parts listed as its value are dropped in the
# display cell. Entries are applied in the order written here.
# 'handlebar' used to be listed twice; the second entry silently replaced the
# first, so the single entry below carries that effective value at the key's
# original position.
hyper_hypo_mer_dict = {
    'head': ['jaws', 'snout', 'horns', 'eyes', 'ears', 'beard', 'hair', 'nose', 'mouth', 'tongue', 'whiskers', 'teeth', 'mane', 'comb', 'powerful jaws', 'neck', 'edge', 'blade'],
    'feet': ['paws', 'toes', 'claws'],
    'tail': ['rattle on tail', 'curly tail'],
    'fur': ['wool'],
    'flat head': ['slots'],
    'contoured head': ['blade'],
    'blade': ['edge', 'scoop'],
    'nose': ['cabin', 'windshield', 'windshield wiper'],
    'fins': ['rudder'],
    'windows': ['windshield', 'windshield wiper', 'tinted windows'],
    'roof': ['sign on roof'],
    'light': ['flashing lights'],
    'handlebar': ['handle', 'bell'],
    'wheels': ['spokes'],
    'door': ['handle', 'glass panel'],
}

Display final Holonyms and Meronyms extracted

In [13]:
# For every display category, list the matched concepts whose VISA label
# belongs to it, after pruning their parts, and print per-category and overall
# counts. Pruning removes the concept's own name and, for each hyper-meronym
# present in the parts, its hypo-meronyms. Concepts left with fewer than two
# parts are not reported.
category_concept_dict = {}
tot_len = 0
for sel_label, member_labels in sel_labels_dict.items():
  print(sel_label)
  category_concept_dict[sel_label] = []
  for concept in found_imagenet_visa_list:
    visa_label = concept['visa_label']
    if visa_label not in member_labels:
      continue
    parts = concept['parts']
    for hyper, hypos in hyper_hypo_mer_dict.items():
      # membership is tested against the part list as it stands before this pass
      hyper_present = hyper in parts
      parts = [p for p in parts if p != visa_label and not (hyper_present and p in hypos)]
    if len(parts) <= 1:
      continue
    concept_dict = {
      'label': concept['label'],
      'visa_label': visa_label,
      'parts': parts,
      'depth': concept['depth'],
    }
    category_concept_dict[sel_label].append(concept_dict)
    print(concept_dict)
  len_category = len(category_concept_dict[sel_label])
  tot_len = tot_len + len_category
  print(len_category)
  print()
print(tot_len)

ungulates
{'label': 'sorrel', 'visa_label': 'horse', 'parts': ['tail', 'legs', 'fur', 'hooves', 'feet', 'head'], 'depth': 1}
{'label': 'zebra', 'visa_label': 'zebra', 'parts': ['tail', 'legs', 'fur', 'hooves', 'feet', 'head'], 'depth': 0}
{'label': 'hog, pig, grunter, squealer, Sus scrofa', 'visa_label': 'pig', 'parts': ['tail', 'legs', 'feet', 'hooves', 'fur', 'udder', 'head'], 'depth': 0}
{'label': 'ox', 'visa_label': 'ox', 'parts': ['tail', 'legs', 'feet', 'hooves', 'fur', 'dewlap', 'head'], 'depth': 0}
{'label': 'water buffalo, water ox, Asiatic buffalo, Bubalus bubalis', 'visa_label': 'buffalo', 'parts': ['hump', 'tail', 'legs', 'fur', 'hooves', 'feet', 'head'], 'depth': 1}
{'label': 'bison', 'visa_label': 'bison', 'parts': ['hump', 'tail', 'legs', 'fur', 'hooves', 'feet', 'head'], 'depth': 0}
{'label': 'ram, tup', 'visa_label': 'sheep', 'parts': ['legs', 'feet', 'fur', 'hooves', 'head'], 'depth': 1}
{'label': 'ibex, Capra ibex', 'visa_label': 'goat', 'parts': ['tail', 'legs', 'fe

Misassigned: beach wagon, police van (corrected afterwards)