In [1]:
import pandas as pd
import numpy as np
import time
import os
import requests
import json
import urllib.request
import re
import pickle
import networkx as nx
import string
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/john/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [28]:
app_id = '24ac8855'
app_key = 'baa032bb2944756dcf361a18ae7e3ab9'

actual_dictionary = {}

alphabet = set(list(string.ascii_lowercase) + list(string.ascii_uppercase))

def find_defs(diction, posCategory):
    """
    This function executes when we reach a list containing dictionary data strucutres that may contain several different
    definitions for a given word. It runs through this list, for each list finding keys in the dictionary data
    structure containing the string "definitions" or "short_definitions" and appending the corresponding value 
    (a string) of this key to a master string, which we call stringy. At the end of this function, we use the 
    built-in function set() on the list given to us by the .split() method. The reasons for a set are:
    1) checking membership in sets is efficient in python
    2) sets do not contain duplicate elements, so we are using less space.
    """
    stringy = ''
    for i in diction:
        for k, v in i.items():
            if k == 'definitions' or k == 'short_definitions' or k == 'crossReferenceMarkers':
                stringy += v[0] + ' '
    stringy = stringy.split(' ')
    
    for word in stringy:  
        n = stringy.index(word)
        try:
            # Make sure that first, last element in string is 
            if word[0] not in alphabet:
                stringy[n] = word[1:]
            if stringy[n][-1] not in alphabet:
                stringy[n] = stringy[n][:-1]
            if stringy[n][-1] not in alphabet:
                stringy[n] = stringy[n][:-1]
            stringy[n] = wordnet_lemmatizer.lemmatize(stringy[n], pos=posCategory).lower()
        except IndexError:
            pass

    return set(stringy)

def find_definition(word):
    """
    This function takes in a word as input and creates an entry in our dataset consisting of a word-definition pair.
    """
#     wordnet_lemmatizer.lemmatize(word)
    
    # there may be times while looking up words that the dictionary does not contain a word for whatever reason.
    # this chunk of code catches this error.
    try:
#         lemmatizedWord = wordnet_lemmatizer.lemmatize(word).lower()
        url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/en/' + word.lower()
        r = requests.get(url, headers = {'app_id': app_id, 'app_key': app_key})
        dicty = r.json()
        # # Need to consider words like "won" that have atypical structure...........       
        # this process is ugly, but it works. 
        # The structure of the data returned from the API call is a bit awkward.
        lexCategory = dicty['results'][0]['lexicalEntries'][0]['lexicalCategory']
        if lexCategory == "Noun":
            lexCategory = "n"
        if lexCategory == "Verb":
            lexCategory = "v"
        if lexCategory == "Adjective":
            lexCategory = "a"
        definitions = dicty['results'][0]['lexicalEntries'][0]['entries'][0]['senses']
        if find_defs(definitions, lexCategory) != {''}:
            actual_dictionary[wordnet_lemmatizer.lemmatize(word, pos=lexCategory).lower()] = find_defs(definitions, lexCategory)
        else:
            pass
    except json.JSONDecodeError:
        print("something fucked up with " + word)
        pass
    
def find_contents(diction):
    try:
        for k, v in diction.items():
            print(k, '\n', v, '\n\n')
    except AttributeError:
        for i in diction:
            print(i, '\n\n')

In [215]:
# Defining functions using "Pickle" library that allow us to save python objects as .pkl files that, when loaded again,
# act just like the objects we saved them as without needing to transform them.

# This allows us to save a dictionary instead of needing to reconsruct a dictionary for each info-theoretic calculation.

def save_obj(obj, name):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [216]:
word_csv = pd.read_csv('Dictionaries/Cambridge_gk-scch.csv', sep=';')
word_list = list(set(word_csv['WORD'].tolist()))

In [219]:
find_definition('elephant')

save_obj(actual_dictionary, 'dictionary')

find_contents(actual_dictionary)

dicty = load_obj('dictionary')

find_contents(dicty)

Noun
n
elephant 
 {'', '58', 'prehensile', 'of', 'tusks', '2', '71', 'largest', 'and', 'asia', 'Africa', 'paper', 'ivory', 'the', 'living', 'typically', 'plant-eating', 'to', 'southern', 'is', 'ear', 'very', 'approximately', 'long', 'tusk', 'with', 'a', 'It', 'mm)', 'size', 'large', 'trunk', 'curved', 'animal', 'native', 'inches', 'land', 'mammal'} 


elephant 
 {'', '58', 'prehensile', 'of', 'tusks', '2', '71', 'largest', 'and', 'asia', 'Africa', 'paper', 'ivory', 'living', 'typically', 'mammal', 'plant-eating', 'to', 'southern', 'is', 'ear', 'very', 'long', 'native', 'tusk', 'with', 'a', 'It', 'mm)', 'size', 'large', 'trunk', 'curved', 'animal', 'the', 'inches', 'land', 'approximately'} 




In [126]:
nltk.download('averaged_perceptron_tagger')
checkWord = "running".lower()
POS = nltk.pos_tag(checkWord)

print(POS)
# lemmatizedWord = wordnet_lemmatizer.lemmatize(checkWord, pos=POS)
# print(lemmatizedWord)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[('r', 'NN'), ('u', 'JJ'), ('n', 'JJ'), ('n', 'NN'), ('i', 'NN'), ('n', 'VBP'), ('g', 'NN')]


In [29]:
find_definition('elephant')
find_contents(actual_dictionary)

elephant 
 {'a', '', 'ear', 'tusk', 'approximately', 'with', 'of', 'mm', 'the', 'living', 'typically', 'and', 'plant-eating', 'it', 'southern', 'large', 'ivory', 'is', 'inch', 'native', 'asia', 'to', 'prehensile', 'land', 'trunk', 'size', 'long', 'paper', 'africa', 'largest', 'animal', 'mammal', 'curved', 'very'} 


