In [2]:
import pandas as pd
import numpy as np
import os
import requests
import json
import urllib.request
import re
import pickle

In [10]:
app_id = '24ac8855'
app_key = 'baa032bb2944756dcf361a18ae7e3ab9'

actual_dictionary = {}

def find_defs(diction):
    """
    This function executes when we reach a list containing dictionary data strucutres that may contain several different
    definitions for a given word. It runs through this list, for each list finding keys in the dictionary data
    structure containing the string "definitions" or "short_definitions" and appending the corresponding value 
    (a string) of this key to a master string, which we call stringy. At the end of this function, we use the 
    built-in function set() on the list given to us by the .split() method. The reasons for a set are:
    1) checking membership in sets is efficient in python
    2) sets do not contain duplicate elements, so we are using less space.
    """
    stringy = ''
    for i in diction:
        for k, v in i.items():
            if k == 'definitions' or k == 'short_definitions':
                stringy += v[0] + ' '
    return set(stringy.split(' '))


def find_definition(word):
    """
    This function takes in a word as input and creates an entry in our dataset consisting of a word-definition pair.
    """
    url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/en/' + word.lower()
    # there may be times while looking up words that the dictionary does not contain a word for whatever reason.
    # this chunk of code catches this error.
    try:
        r = requests.get(url, headers = {'app_id': app_id, 'app_key': app_key})
        dicty = r.json()
        # # Need to consider words like "won" that have atypical structure...........       
        # this process is ugly, but it works. 
        # The structure of the data returned from the API call is a bit awkward.
        first_layer = dicty['results']
        second_layer = first_layer[0]
        third_layer = second_layer['lexicalEntries']
        # third layer contains multiple definitions or "senses" of the word
        fourth_layer = third_layer[0]
        fifth_layer = fourth_layer['entries']
        sixth_layer = fifth_layer[0]
        seventh_layer = sixth_layer['senses']
        actual_dictionary[word.lower()] = find_defs(seventh_layer)
    except json.JSONDecodeError:
        pass
    
def find_contents(diction):
    try:
        for k, v in diction.items():
            print(k, '\n', v, '\n\n')
    except AttributeError:
        for i in diction:
            print(i, '\n\n')

In [11]:
# Defining functions using "Pickle" library that allow us to save python objects as .pkl files that, when loaded again,
# act just like the objects we saved them as without needing to transform them.

# This allows us to save a dictionary instead of needing to reconsruct a dictionary for each info-theoretic calculation.

def save_obj(obj, name):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [12]:
word_csv = pd.read_csv('Dictionaries/Cambridge_gk-scch.csv', sep=';')
word_list = list(set(word_csv['WORD'].tolist()))

In [15]:
for word in word_list:
    # some dict entries are strucuctured differently...
    try:
        find_definition(word)
    except KeyError:
        pass

save_obj(actual_dictionary, 'dictionary')

find_contents(actual_dictionary)

ink 
 {'', 'fluid', 'drawing,', 'used', 'coloured', 'etc.', 'writing,', 'duplicating', 'for', 'paste', 'or', 'a', 'printing,'} 


pair 
 {'', 'set', 'not', 'together', 'regarded', 'as', 'parts', 'consisting', 'article', 'joined', 'things', 'used', 'an', 'corresponding', 'separately', 'two', 'unit', 'of', 'or', 'a'} 


people 
 {'', 'authority', 'or', 'ethnic', 'a', 'general', 'members', 'the', 'position', 'power', 'community,', 'supporters', 'nation', 'person', 'particular', 'nation,', 'group', 'collectively', 'of', 'considered', 'in', 'employees', 'beings', 'human'} 


out 
 {'', 'considering', 'appearing', 'at', 'situated', 'that', 'burning', 'far', 'jury)', 'revealed', 'end', 'longer', 'or', 'from', 'a', 'fire)', 'especially', 'as', 'no', 'enclosed', 'is', 'moving', 'its', 'activity', 'place', 'somewhere', 'move', 'particular', '(of', 'secrecy.', 'situation', 'so', 'known', 'away', 'situation,', 'verdict', 'in', 'competition,', 'hidden', 'involved', 'place,', 'distance', 'extinguish

 {'', 'about', 'parts', 'arranged', 'into', 'all', 'or', 'instrumental', 'a', 'set', 'composition', 'the', 'notch', 'goals,', 'points,', 'twenty', 'showing', 'surface', 'other.', 'team', 'cut', 'runs,', 'number', 'musical', 'group', 'vocal', 'of', 'representation', 'in', 'written', 'achieved', 'and', 'by', 'game', 'an', 'below', 'other', 'etc.', 'individual', 'line', 'scratched', 'one'} 


modify 
 {'', '(something)', 'to', 'changes', 'minor', 'or', 'small', 'partial', 'make'} 


venerable 
 {'', 'especially', 'character', 'because', 'respect,', 'or', 'respect', 'great', 'of', 'wisdom,', 'accorded', 'a', 'deal', 'age,'} 


unlucky 
 {'', 'having,', 'bad', 'bringing,', 'luck', 'resulting', 'or', 'from'} 


black 
 {'', 'skin,', 'any', 'darkest', 'complete', 'boycotted', 'opposite', 'African', 'disastrous', 'or', 'a', 'tragic', 'belonging', 'dark-coloured', 'especially', 'not', 'support', 'members', 'as', 'the', 'elsewhere', 'having', 'handled', 'for', 'covert', 'members,', 'trade', 'eve

aloud 
 {'', 'not', 'audibly;', 'whisper', 'silently', 'loudly', 'audibly', 'or', 'a', 'in'} 


recognize 
 {'', 'before', '(someone', 'encountered', 'existence,', 'thing', 'or', 'from', 'before;', 'the', 'having', 'again', 'legality', 'validity,', 'person', 'acknowledge', 'something)', 'of', 'know', 'identify', 'them'} 


product 
 {'', 'article', 'that', 'substance', 'together,', 'quantity', 'thing', 'or', 'a', 'from', 'result', 'multiplying', 'the', 'is', 'action', 'for', 'sale', 'person', 'refined', 'obtained', 'manufactured', 'analogous', 'operation.', 'of', 'by', 'quantities', 'an', 'process', 'algebraic'} 


price 
 {'', 'something', 'payment', 'expected,', 'or', 'achieving', 'a', 'condition', 'as', 'the', 'unwelcome', 'action', 'for', 'amount', 'given', 'done', 'money', 'undergone', 'of', 'in', 'achieve', 'paid', 'objective', 'required,', 'an', 'to', 'experience'} 


bring 
 {'', 'action)', 'something', 'take', '(someone', 'initiate', 'legal', 'or', 'force', 'a', 'condition', '

In [19]:
dicty = load_obj('dictionary')