### Aspect Extraction Implementation



In [2]:
import stanza

stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-07 21:35:42 INFO: Downloaded file to /Users/sifael/stanza_resources/resources.json
2024-05-07 21:35:42 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

2024-05-07 21:36:30 INFO: Downloaded file to /Users/sifael/stanza_resources/en/default.zip
2024-05-07 21:36:32 INFO: Finished downloading models and saved to /Users/sifael/stanza_resources


In [3]:
import nltk
from nltk.corpus import stopwords

In [4]:
STOP_WORDS = set(stopwords.words('english'))
VALID_POS_TAGS = ('JJ', 'NN', 'JJR', 'NNS', 'RB')
RELEVANT_DEPENDENCIES = {'nsubj', 'acl:relcl', 'obj', 'dobj', 'agent', 'advmod', 'amod', 'neg', 'prep_of', 'acomp', 'xcomp', 'compound'}

In [40]:
def combine_nouns(tagged_list):
    new_word_list = []
    skip = False

    for i in range(len(tagged_list) - 1):
        if skip:
            skip = False
            continue

        current_word, current_tag = tagged_list[i]
        next_word, next_tag = tagged_list[i + 1]

        if current_tag == 'NN' and next_tag == 'NN':
            new_word_list.append(current_word + '_' + next_word)
            skip = True
        else:
            new_word_list.append(current_word)

    if not skip and len(tagged_list) > 0:
        new_word_list.append(tagged_list[-1][0])

    return new_word_list


def filter_and_retag(text):
    word_tokens = nltk.word_tokenize(text)
    filtered_words = [w for w  in word_tokens if w not in STOP_WORDS ]
    return nltk.pos_tag(filtered_words)


def extract_dependencies(doc, word_list):
    dependencies = []
    for dep_edge in doc.sentences[0].dependencies:
        word_id = int(dep_edge[0].id) - 2
        if word_id >= 0:
            head_word = word_list[word_id]
        else:
            head_word = 'ROOT'
        dependencies.append( [dep_edge[2].text, head_word, dep_edge[1]] )
    return dependencies

def build_clusters(features, dependencies):
    clusters = []
    for feature in features:
        related = [ 
            dep[1] if dep[0] == feature[0] else dep[0]
            for dep in dependencies
            if dep[0] == feature[0] and dep[2] in RELEVANT_DEPENDENCIES
        ]
        clusters.append([feature[0], related])
    return clusters


def process_sentences(sentences, dependency_parser):
    fclusters, final_clusters, dic = [], [], {}

    for sentence in sentences:
        # Tokenize and POS tag the sentence
        tokenized_sentence = nltk.word_tokenize(sentence)
        pos_tagged_sentence = nltk.pos_tag(tokenized_sentence)

        # Combine consecutive nouns and for a new word list
        combined_noun_list = combine_nouns(pos_tagged_sentence)
        final_text = ' '.join(combined_noun_list)

        # Filter Stops Words and retag the combined word list
        filtered_tagged_list = filter_and_retag(final_text)

        # Parse dependencies
        doc = dependency_parser(final_text)
        dependencies = extract_dependencies(doc, combined_noun_list)

        # Extract Features
        features = [word for word in filtered_tagged_list if word[1] in VALID_POS_TAGS]

        # Build Clusters
        clusters = build_clusters(features, dependencies)
        fclusters.extend(clusters)

        # Populate dictionary and final clusters
        for word, pos_tag in features:
            dic[word] = pos_tag

        final_clusters.extend([cluster for cluster in clusters if dic[cluster[0]] == 'NN'])

    return final_clusters

        

        
        

        

In [41]:
import stanza

nlp = stanza.Pipeline('en', download_method=None)


2024-05-07 22:27:48 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-05-07 22:27:48 INFO: Using device: cpu
2024-05-07 22:27:48 INFO: Loading: tokenize
2024-05-07 22:27:48 INFO: Loading: mwt
2024-05-07 22:27:48 INFO: Loading: pos
2024-05-07 22:27:48 INFO: Loading: lemma
2024-05-07 22:27:48 INFO: Loading: constituency
2024-05-07 22:27:48 INFO: Loading: depparse
2024-05-07 22:27:48 INFO: Loading: sentiment
2024-05-07 22:27:48 INFO: Loading: ner
2024-05-07 22:27:49 INFO: Done loading processors!


In [42]:
sample_text = """
The hotel service was fast and efficient. 
We got checked in in under 5 minutes and had all our questions answered.
However the food was terrible. 
Breakfast ran from 7-9 am and there was little variety. 
The wifi was also not very fast and we couldn't stream our favorite shows on TV. 
Overall the experience was ok.
"""

sentences = nltk.sent_tokenize(sample_text)
sentences

['\nThe hotel service was fast and efficient.',
 'We got checked in in under 5 minutes and had all our questions answered.',
 'However the food was terrible.',
 'Breakfast ran from 7-9 am and there was little variety.',
 "The wifi was also not very fast and we couldn't stream our favorite shows on TV.",
 'Overall the experience was ok.']

In [43]:

process_sentences(['\nThe hotel service was fast and efficient.',
 'We got checked in in under 5 minutes and had all our questions answered.',
 'However the food was terrible.',
 'Breakfast ran from 7-9 am and there was little variety.',
 "The wifi was also not very fast and we couldn't stream our favorite shows on TV.",
 'Overall the experience was ok.'], nlp)

[['hotel_service', ['was']],
 ['food', ['was']],
 ['variety', ['little']],
 ['wifi', ['very']],
 ['TV', []],
 ['experience', ['was']],
 ['ok', []]]