In [42]:
import os, glob, json
import pandas as pd

In [6]:
# Directory that holds one sub-folder per aspect.
# The argument below is the literal string '__file__' (not the __file__ variable),
# so the path is resolved against the current working directory.
ROOT_DIR = os.path.split(os.path.abspath('__file__'))[0]
print("Current root directory: ", ROOT_DIR)

Current root directory:  /home/tictactoe/Documents/iss_plp/ISS_PLP_Project/WordNet


In [35]:
def get_words(aspect, split=True, pos_tags=None, root_dir=None):
    '''
        Return words given aspect and a list of permitted pos tags

        The words are read from <root_dir>/<aspect>/<aspect>.csv. Every
        non-blank line of that file is expected to look like "token,pos".
        The pos tag is whatever follows the last comma, so a token may
        itself contain commas. Blank lines are skipped.

        Parameters:
        -----------
            aspect: string
                Name of the aspect; it must match the name of the aspect folder
                and of the csv file inside it
                (e.g. 'appearance', 'comfort', 'delivery', 'functionality', 'price', 'quality')
            split: bool
                Whether to split the output into two separate lists of words and pos-tags
            pos_tags: list or string
                List of permitted pos-tags in the result.
                A single string is treated as a one-element list.
                None (the default) keeps every pos-tag.
            root_dir: string
                Directory containing the aspect folders.
                None (the default) falls back to the module-level ROOT_DIR.

        Return:
        -------
            Dictionary of two keys: 'aspect', 'words'
            If split is True, the words value is a nested dictionary of keys 'tokens' and 'pos'.
            Else, the words value is a list of (token, pos) tuples.

        Raises:
        -------
            ValueError
                If a non-blank line does not contain a comma.
    '''
    if root_dir is None:
        root_dir = ROOT_DIR

    if isinstance(pos_tags, str):
        pos_tags = [pos_tags]

    csv_path = os.path.join(root_dir, aspect, aspect + '.csv')

    pairs = []
    with open(csv_path, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()

            # a blank (often trailing) line carries no word; skip it instead of
            # failing on the tuple unpacking below
            if not line:
                continue

            # split on the LAST comma only, so tokens containing commas survive
            token, sep, pos = line.rpartition(',')
            if not sep:
                raise ValueError(
                    "%s, line %d: expected 'token,pos' but got %r"
                    % (csv_path, line_no, line)
                )

            token, pos = token.strip(), pos.strip()
            if pos_tags is None or pos in pos_tags:
                pairs.append((token, pos))

    if split:
        return {
            'aspect' : aspect,
            'words' : {
                'tokens' : [tk for tk, _ in pairs],
                'pos' : [p for _, p in pairs]
            }
        }

    return {
        'aspect' : aspect,
        'words' : pairs
    }

In [39]:
# demo: fetch the 'price' vocabulary, restricted to nouns and adjectives
result = get_words('price', split=True, pos_tags=['noun', 'adj'])

# with split=True the tokens and their pos-tags come back as two parallel lists
tks, pos = (result['words'][field] for field in ('tokens', 'pos'))

# show each token next to its pos-tag
for pair in zip(tks, pos):
    print("%s ---> %s" % pair)

catalog buying ---> noun
takeover ---> noun
superannuation ---> noun
combat pay ---> noun
trueness ---> noun
support payment ---> noun
fair ---> adj
cost-of-living allowance ---> noun
bid price ---> noun
highway robbery ---> noun
license fee ---> noun
retirement check ---> noun
sumptuous ---> adj
disbursal ---> noun
allowance ---> noun
subscriber ---> noun
mail-order buying ---> noun
moorage ---> noun
steal ---> noun
expensive ---> adj
viatical settlement ---> noun
minimum wage ---> noun
modest ---> adj
pipage ---> noun
interest rate ---> noun
freight rate ---> noun
baksheesh ---> noun
fare ---> noun
indemnification ---> noun
time plan ---> noun
penalty ---> noun
admission fee ---> noun
contingency fee ---> noun
repayment rate ---> noun
transportation ---> noun
portage ---> noun
maintenance ---> noun
entrance money ---> noun
meed ---> noun
conscience money ---> noun
bounty ---> noun
marketing cost ---> noun
repurchase ---> noun
freight ---> noun
retirement pension ---> noun
tip ---> no

In [40]:
# demo: collect the noun/adjective vocabulary of several aspects in one go,
# producing one result dictionary per aspect (same order as `aspects`)
aspects = ['price', 'appearance', 'size']
results = list(map(
    lambda name: get_words(name, split=True, pos_tags=['noun', 'adj']),
    aspects,
))

In [41]:
# write the results to json file
# json.dump needs the open file object as its second argument; without it the
# call raises TypeError and test.json is left empty
with open('test.json', 'w', encoding='utf-8') as f:
    json.dump(results, f)

[{'aspect': 'price',
  'words': {'tokens': ['catalog buying',
    'takeover',
    'superannuation',
    'combat pay',
    'trueness',
    'support payment',
    'fair',
    'cost-of-living allowance',
    'bid price',
    'highway robbery',
    'license fee',
    'retirement check',
    'sumptuous',
    'disbursal',
    'allowance',
    'subscriber',
    'mail-order buying',
    'moorage',
    'steal',
    'expensive',
    'viatical settlement',
    'minimum wage',
    'modest',
    'pipage',
    'interest rate',
    'freight rate',
    'baksheesh',
    'fare',
    'indemnification',
    'time plan',
    'penalty',
    'admission fee',
    'contingency fee',
    'repayment rate',
    'transportation',
    'portage',
    'maintenance',
    'entrance money',
    'meed',
    'conscience money',
    'bounty',
    'marketing cost',
    'repurchase',
    'freight',
    'retirement pension',
    'tip',
    'buy',
    'dependableness',
    'refresher',
    'fixed charge',
    'tax assessment',