In [1]:
import stanza 

nlp = stanza.Pipeline('en')

2024-05-12 17:02:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 17:02:49 INFO: Downloaded file to /Users/sifael/stanza_resources/resources.json
2024-05-12 17:02:50 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-05-12 17:02:50 INFO: Using device: cpu
2024-05-12 17:02:50 INFO: Loading: tokenize
2024-05-12 17:02:51 INFO: Loading: mwt
2024-05-12 17:02:51 INFO: Loading: pos
2024-05-12 17:02:51 INFO: Loading: lemma
2024-05-12 17:02:51 INFO: Loading: constituency
2024-05-12 17:02:51 INFO: Loading: depparse
2024-05-12 17:02:51 INFO: Loading: sentiment
2024-05-12 17:02:51 INFO: Loading: ner


In [2]:
import nltk
from nltk.corpus import stopwords

# Initialize stop words and set global constants
STOP_WORDS = set(stopwords.words('english'))
VALID_POS_TAGS = ('JJ', 'NN', 'JJN', 'NNS', 'RB')

def process_sentences(sentences, dependency_parser):
    final_clusters = []
    dic = {}

    for sentence in sentences:
        print(f"Processing sentence: {sentence}")

        # Step 1: Tokenization and POS Tagging
        tokenized_sentence = nltk.word_tokenize(sentence)
        pos_tagged_sentence = nltk.pos_tag(tokenized_sentence)
        print("Step 1: Tokenization and POS Tagging completed.")
        print(f"Tokenized and tagged sentence: {pos_tagged_sentence} \n")

        # Step 2: Combining Consecutive Nouns
        new_word_list = []
        i = 0
        while i < len(pos_tagged_sentence):
            if i < len(pos_tagged_sentence) - 1 and pos_tagged_sentence[i][1] == 'NN' and pos_tagged_sentence[i+1][1] == 'NN':
                combined_noun = pos_tagged_sentence[i][0] + pos_tagged_sentence[i+1][0]
                new_word_list.append(combined_noun)
                i += 2  # Skip the next item because it's already combined
            else:
                new_word_list.append(pos_tagged_sentence[i][0])
                i += 1
        print("Step 2: Combining consecutive nouns completed.")
        print(f"New word list after combining nouns: {new_word_list} \n")

        # Step 3: Filtering Stop Words
        filtered_words = [word for word in new_word_list if word.lower() not in STOP_WORDS]
        print("Step 3: Filtering stop words completed.")
        print(f"Words after stop word filtering: {filtered_words} \n")

        # Step 4: Dependency Parsing
        filtered_sentence = ' '.join(filtered_words)
        doc = dependency_parser(filtered_sentence)
        dependencies = []
        for dep_edge in doc.sentences[0].dependencies:
            dependencies.append((dep_edge[2].text, dep_edge[0].id, dep_edge[1]))
        print("Step 4: Dependency parsing completed.")
        print(f"Dependency parse results: {dependencies} \n")

        # Step 5: Extracting Features Based on Dependencies
        features = [word for word in pos_tagged_sentence if word[1] in VALID_POS_TAGS]
        feature_clusters = {}
        for word, pos_tag in features:
            if word in filtered_words:
                feature_clusters[word] = pos_tag
        print("Step 5: Extracting and clustering features based on dependencies completed.")
        print(f"Feature clusters: {feature_clusters} \n")

        # Step 6: Building Output Structures
        for word, cluster in feature_clusters.items():
            if cluster == 'NN':
                final_clusters.append(word)
        print("Step 6: Building output structures completed.")
        print(f"Final clusters: {final_clusters} \n")

    return final_clusters

In [7]:
"""
The hotel service was fast and efficient. 
We got checked in in under 5 minutes and had all our questions answered. 
However the food was terrible. Breakfast ran from 7-9 am and there was little variety. 
The wifi was also not very fast and we couldn't stream our favorite shows on TV. 
Overall the experience was ok."""

sentences = ["The hotel service was fast and efficient. ",
             "The wifi was also not very fast and we couldn't stream our favorite shows on TV.",
            "However the food was terrible. Breakfast ran from 7-9 am and there was little variety.",
             "The wifi was also not very fast and we couldn't stream our favorite shows on TV. ",
             "Overall the experience was ok."
            ]
process_sentences(sentences, nlp)

Processing sentence: The hotel service was fast and efficient. 
Step 1: Tokenization and POS Tagging completed.
Tokenized and tagged sentence: [('The', 'DT'), ('hotel', 'NN'), ('service', 'NN'), ('was', 'VBD'), ('fast', 'RB'), ('and', 'CC'), ('efficient', 'JJ'), ('.', '.')] 

Step 2: Combining consecutive nouns completed.
New word list after combining nouns: ['The', 'hotelservice', 'was', 'fast', 'and', 'efficient', '.'] 

Step 3: Filtering stop words completed.
Words after stop word filtering: ['hotelservice', 'fast', 'efficient', '.'] 

Step 4: Dependency parsing completed.
Dependency parse results: [('hotelservice', 3, 'nsubj'), ('fast', 3, 'advmod'), ('efficient', 0, 'root'), ('.', 3, 'punct')] 

Step 5: Extracting and clustering features based on dependencies completed.
Feature clusters: {'fast': 'RB', 'efficient': 'JJ'} 

Step 6: Building output structures completed.
Final clusters: [] 

Processing sentence: The wifi was also not very fast and we couldn't stream our favorite show

['wifi', 'TV', 'food', 'variety', 'wifi', 'TV', 'experience']