In [None]:
'''Creating a qualitative comments analyser
Reference: https://robertorocha.info/using-nlp-to-analyze-open-ended-responses-in-surveys/
'''

import pandas as pd
import spacy

In [None]:
# Load the small English pipeline, then raise max_length (spaCy's cap on the
# number of characters in one text) so the combined responses fit in a single call
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1_850_000

In [None]:
# Load the survey responses from CSV and preview the first 5 rows.
# NOTE(review): '...csv' looks like a placeholder path - confirm the real file name before running.

df = pd.read_csv('...csv')
df.head()

In [None]:
# Text pre-processing

# text_clean works on the first column of the dataframe it is given (x.iloc[:, 0]),
# so it can be used on a one-column frame or on a wider frame whose text is in the first column.

def text_clean(x, column=0):
    """Normalise the free-text responses held in one column of a dataframe.

    Steps, in order: lower-case; collapse every whitespace run to one space;
    blank out a fixed set of punctuation/special characters; blank out periods
    that are not directly followed by a lowercase letter (so the inner period
    of something like "e.g" survives); blank out stand-alone single letters;
    collapse repeated spaces and trim both ends.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the text. It is modified IN PLACE: the cleaned text is
        written back into ``x`` and the same object is returned.
    column : int or str, default 0
        Column to clean. An ``int`` is treated as a position (as with
        ``iloc``); any other value is looked up as a column label.

    Returns
    -------
    pandas.DataFrame
        The same frame ``x`` with the chosen column cleaned.
    """
    import re  # local import so the function does not depend on another cell

    position = column if isinstance(column, int) else x.columns.get_loc(column)

    # Characters that are replaced by a space. They are matched literally in a
    # single pass via one escaped character class instead of one pass per character.
    special_chars = ["!", '"', "%", "&", "'", "(", ")", "#", "*", "?",
                     "+", ",", "-", "/", ":", ";", "<", "=", ">",
                     "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}",
                     "~", "–", "’"]
    special_chars_pattern = "[" + "".join(re.escape(char) for char in special_chars) + "]"

    text = x.iloc[:, position].str.lower()

    # Collapse any whitespace run (tabs, newlines, other unicode spaces) to one space
    text = text.str.replace(r'\s+', ' ', regex=True)

    # Remove the special characters
    text = text.str.replace(special_chars_pattern, ' ', regex=True)

    # Remove periods that are not followed by a lowercase letter
    # (periods inside abbreviations are followed by a letter and are kept)
    text = text.str.replace(r'\.(?![a-z])', ' ', regex=True)

    # Remove stand-alone single letters
    text = text.str.replace(r'\b[a-zA-Z]\b', ' ', regex=True)

    # Collapse repeated spaces and trim both ends. No separate '--' removal is
    # needed: every '-' has already been blanked out with the special characters.
    text = text.str.replace(r' +', ' ', regex=True).str.strip()

    x.iloc[:, position] = text
    return x

# test responses - 2349 values
# text_clean writes the cleaned text back into the frame it receives and returns
# that same frame, so after this call clean_df and df refer to one object.
clean_df = text_clean(df)

# Spot-check a handful of cleaned rows (label-based slice)
clean_df.loc[100:105]

In [None]:
# Concatenate every response into one long string, separated by single spaces

all_text = clean_df['Comments'].str.cat(sep=' ')

In [None]:
# Create the spaCy document from the combined text.
# The reference tutorial disables named entity recognition (NER), e.g.
#     doc = nlp(all_text, disable=['ner'])
# but NER is left enabled for these comments.
#
# Running the pipeline splits the text into words, tags them with
# parts-of-speech, and recognises stop-words.
# (These notes are real comments rather than bare string literals so that the
# cell does not echo a string as its output.)

doc = nlp(all_text)

In [None]:
# Overall word frequency analysis for most common words that aren't stop words or punctuation marks

from collections import Counter
from collections import defaultdict
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

# Lemma of every token that is neither a stop word nor punctuation
words = [
    token.lemma_
    for token in doc
    if not (token.is_stop or token.is_punct)
]

# Tally the lemmas and show the 50 most frequent
word_freq = Counter(words)
word_freq.most_common(50)


In [None]:
# Pattern word selection
"""
ADJ - adjective
ADP - adposition
ADV - adverb
AUX - auxiliary verb
CCONJ - coordinating conjunction
DET - determiner
INTJ - interjection
NOUN - noun
NUM - numeral
PART - particle
PRON - pronoun
PROPN - proper noun
PUNCT - punctuation
SCONJ - subordinating conjunction
IN - conjunction
VERB - verb
X - other
"""

In [None]:
# The two part-of-speech tags that make up the pattern, and the matcher key built from them
pattern_one, pattern_two = "ADJ", "NOUN"
patt_match_phrase = f"{pattern_one}_{pattern_two}_PHRASE"

In [None]:
# Pattern: adjective-noun

matcher = Matcher(nlp.vocab)
pattern = [{'POS': pattern_one}, {'POS': pattern_two}]
matcher.add(patt_match_phrase, [pattern])

# as_spans=True makes the matcher return Span objects rather than (id, start, end) tuples
matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after collecting every phrase. Building the Counter inside the loop
# rebuilt it on each match and, when nothing matched, left phrase_freq undefined
# or holding the counts from a previously run pattern cell.
phrase_freq = Counter(phrases)

phrase_freq.most_common(20)

In [None]:
# Function to find surrounding words for a given phrase - word immediately before and after
top_phrases = [phrase for phrase, _ in phrase_freq.most_common(20)]

# Function to find surrounding words for a given phrase
def find_surrounding_words(doc, phrase):
    """Count the (word before, word after) lemma pairs around each occurrence of ``phrase``.

    ``phrase`` is tokenised with the notebook-level ``nlp`` pipeline and matched
    against ``doc`` on the lower-cased token text. An occurrence is counted only
    when it has a token on both sides and neither neighbour is punctuation or a
    stop word.

    Returns a ``Counter`` keyed by ``(lemma_before, lemma_after)``, both lower-cased.
    """
    finder = PhraseMatcher(nlp.vocab, attr='LOWER')
    finder.add("PhrasePattern", [nlp.make_doc(phrase)])

    neighbour_pairs = []
    last_index = len(doc) - 1

    for _, start, end in finder(doc):
        # A match touching either end of the document has no neighbour on that side
        if start == 0 or end > last_index:
            continue

        previous_token, next_token = doc[start - 1], doc[end]
        if any(tok.is_punct or tok.is_stop for tok in (previous_token, next_token)):
            continue

        neighbour_pairs.append((previous_token.lemma_.lower(), next_token.lemma_.lower()))

    return Counter(neighbour_pairs)

# Print the 10 most frequent (word before, word after) pairs for each top phrase
for phrase in top_phrases:
    surrounding_word_freq = find_surrounding_words(doc, phrase)
    print(f"Phrase: '{phrase}' - Top 10 surrounding word pairs:")
    for neighbours, count in surrounding_word_freq.most_common(10):
        print(f"{neighbours}: {count}")
    print("\n")

In [None]:
# Function to find surrounding words for a given phrase - five words immediately before and after (the window size is the 5 used in the function below)

top_phrases = [entry[0] for entry in phrase_freq.most_common(20)]

# Function to find the words before and after a given phrase (five on each side by default)
def find_surrounding_words(doc, phrase, window=5):
    """Count the (text before, text after) contexts around each occurrence of ``phrase``.

    ``phrase`` is tokenised with the notebook-level ``nlp`` pipeline and matched
    against ``doc`` on the lower-cased token text.

    Parameters
    ----------
    doc : spacy Doc to search.
    phrase : str, the phrase to look for.
    window : int, default 5
        Number of tokens to capture on each side of the phrase.

    Returns
    -------
    Counter keyed by ``(before_text, after_text)``, both lower-cased. Occurrences
    that do not have a full ``window`` of tokens on both sides are skipped.

    Raises
    ------
    ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("PhrasePattern", [nlp.make_doc(phrase)])

    surrounding_words = []

    for _, start, end in phrase_matcher(doc):
        # Skip occurrences too close to either end to have a full window
        if start < window or end + window > len(doc):
            continue

        before_words = doc[start - window:start].text.lower()
        after_words = doc[end:end + window].text.lower()

        if before_words and after_words:  # Ensure both sides produced text
            surrounding_words.append((before_words, after_words))

    return Counter(surrounding_words)

# Find and display the 10 most common (five words before, five words after) pairs
# for each top phrase. The header now says "five" to match the window used by
# find_surrounding_words in this cell; it previously said "three".
for phrase in top_phrases:
    surrounding_word_freq = find_surrounding_words(doc, phrase)
    print(f"Phrase: '{phrase}' - Top 10 pairs of five words before and after:")
    for pair, freq in surrounding_word_freq.most_common(10):
        print(f"{pair}: {freq}")
    print("\n")

In [None]:
top_phrases = [phrase for phrase, _count in phrase_freq.most_common(20)]

# Variant that works comment by comment and records which entries each context came from
def find_surrounding_words_with_index(df, phrase, window=3):
    """Map each (text before, text after) context of ``phrase`` to the entries containing it.

    Each value of ``df['Comments']`` is tokenised separately and searched for
    ``phrase`` (matched on lower-cased token text).

    Parameters
    ----------
    df : pandas.DataFrame with a ``Comments`` column.
    phrase : str, the phrase to look for.
    window : int, default 3
        Number of tokens to capture on each side of the phrase.

    Returns
    -------
    defaultdict(list) mapping ``(before_text, after_text)`` (lower-cased) to the
    list of index labels of the comments where that context occurs. Occurrences
    without a full ``window`` of tokens on both sides are skipped.

    Raises
    ------
    ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("PhrasePattern", [nlp.make_doc(phrase)])

    surrounding_words_with_index = defaultdict(list)

    for idx, text in df['Comments'].items():
        # Missing responses (NaN) and other non-string values cannot be tokenised
        if not isinstance(text, str):
            continue

        # The matcher only compares lower-cased token text, so tokenising is
        # enough; the full pipeline (tagger, parser, NER) is not run per
        # comment for every phrase.
        comment_doc = nlp.make_doc(text)

        for _, start, end in phrase_matcher(comment_doc):
            # Skip occurrences too close to either end to have a full window
            if start < window or end + window > len(comment_doc):
                continue

            before_words = comment_doc[start - window:start].text.lower()
            after_words = comment_doc[end:end + window].text.lower()

            if before_words and after_words:
                # Store the index label of the comment where this context was found
                surrounding_words_with_index[(before_words, after_words)].append(idx)

    return surrounding_words_with_index

# Find and display, for each top phrase, every context and the entries it occurs in.
# clean_df is passed explicitly: the phrases were extracted from the cleaned text,
# and passing df only worked because text_clean had modified df in place.
for phrase in top_phrases:
    surrounding_words_with_indices = find_surrounding_words_with_index(clean_df, phrase)
    print(f"Phrase: '{phrase}' - Contexts and entry numbers:")

    for pair, indices in surrounding_words_with_indices.items():
        # Display the pair and the list of index labels (data entry numbers) where it was found
        print(f"{pair}: found in entries {indices}")
    print("\n")


In [None]:
# Pattern: verb-adjective

matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'VERB'}, {'POS': 'ADJ'}]
matcher.add('VERB_ADJ_PHRASE', [pattern])

matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after the loop: counting inside it rebuilt the Counter on every
# match and, with zero matches, silently kept the previous pattern's counts.
phrase_freq = Counter(phrases)

phrase_freq.most_common(20)

In [None]:
# Function to find surrounding words for a given phrase - word immediately before and after
top_phrases = [phrase for phrase, _ in phrase_freq.most_common(20)]

# Function to find surrounding words for a given phrase
def find_surrounding_words(doc, phrase):
    """Count the (word before, word after) lemma pairs around each occurrence of ``phrase``.

    Matching is done on lower-cased token text using the notebook-level ``nlp``
    pipeline. Occurrences at the very start or end of ``doc`` are skipped, as
    are those where either neighbour is punctuation or a stop word.

    Returns a ``Counter`` keyed by ``(lemma_before, lemma_after)``, both lower-cased.
    """
    def is_noise(token):
        # Neighbours that carry no content are not worth counting
        return token.is_punct or token.is_stop

    finder = PhraseMatcher(nlp.vocab, attr='LOWER')
    finder.add("PhrasePattern", [nlp.make_doc(phrase)])

    token_count = len(doc)
    pair_counts = Counter()

    for _, start, end in finder(doc):
        # Need one token before the match and one token after it
        if start < 1 or end >= token_count:
            continue

        left, right = doc[start - 1], doc[end]
        if is_noise(left) or is_noise(right):
            continue

        pair_counts[(left.lemma_.lower(), right.lemma_.lower())] += 1

    return pair_counts

# Print the 10 most frequent (word before, word after) pairs for each top phrase
for phrase in top_phrases:
    surrounding_word_freq = find_surrounding_words(doc, phrase)
    print(f"Phrase: '{phrase}' - Top 10 surrounding word pairs:")
    for neighbours, count in surrounding_word_freq.most_common(10):
        print(f"{neighbours}: {count}")
    print("\n")

In [None]:
# Function to find surrounding words for a given phrase - three words immediately before and after (just replace 3s with 2s for 2 words)

top_phrases = [entry[0] for entry in phrase_freq.most_common(20)]

# Function to find three words before and three words after for a given phrase
def find_surrounding_words(doc, phrase, window=3):
    """Count the (text before, text after) contexts around each occurrence of ``phrase``.

    ``phrase`` is tokenised with the notebook-level ``nlp`` pipeline and matched
    against ``doc`` on the lower-cased token text.

    Parameters
    ----------
    doc : spacy Doc to search.
    phrase : str, the phrase to look for.
    window : int, default 3
        Number of tokens to capture on each side of the phrase.

    Returns
    -------
    Counter keyed by ``(before_text, after_text)``, both lower-cased. Occurrences
    that do not have a full ``window`` of tokens on both sides are skipped.

    Raises
    ------
    ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("PhrasePattern", [nlp.make_doc(phrase)])

    surrounding_words = []

    for _, start, end in phrase_matcher(doc):
        # Skip occurrences too close to either end to have a full window
        if start < window or end + window > len(doc):
            continue

        before_words = doc[start - window:start].text.lower()
        after_words = doc[end:end + window].text.lower()

        if before_words and after_words:  # Ensure both sides produced text
            surrounding_words.append((before_words, after_words))

    return Counter(surrounding_words)

# Print the 10 most frequent (three words before, three words after) pairs for each top phrase
for phrase in top_phrases:
    surrounding_word_freq = find_surrounding_words(doc, phrase)
    print(f"Phrase: '{phrase}' - Top 10 pairs of three words before and after:")
    for context, count in surrounding_word_freq.most_common(10):
        print(f"{context}: {count}")
    print("\n")

In [None]:
# Pattern: noun-noun

matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'NOUN'}, {'POS': 'NOUN'}]
matcher.add('NOUN_NOUN_PHRASE', [pattern])

matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after the loop: counting inside it rebuilt the Counter on every
# match and, with zero matches, silently kept the previous pattern's counts.
phrase_freq = Counter(phrases)

phrase_freq.most_common(20)

In [None]:
# Function to find surrounding words for a given phrase - three words immediately before and after (just replace 3s with 2s for 2 words)

top_phrases = [phrase for phrase, _count in phrase_freq.most_common(20)]

# Function to find three words before and three words after for a given phrase
def find_surrounding_words(doc, phrase, window=3):
    """Count the (text before, text after) contexts around each occurrence of ``phrase``.

    ``phrase`` is tokenised with the notebook-level ``nlp`` pipeline and matched
    against ``doc`` on the lower-cased token text.

    Parameters
    ----------
    doc : spacy Doc to search.
    phrase : str, the phrase to look for.
    window : int, default 3
        Number of tokens to capture on each side of the phrase.

    Returns
    -------
    Counter keyed by ``(before_text, after_text)``, both lower-cased. Occurrences
    that do not have a full ``window`` of tokens on both sides are skipped.

    Raises
    ------
    ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("PhrasePattern", [nlp.make_doc(phrase)])

    surrounding_words = []

    for _, start, end in phrase_matcher(doc):
        # Skip occurrences too close to either end to have a full window
        if start < window or end + window > len(doc):
            continue

        before_words = doc[start - window:start].text.lower()
        after_words = doc[end:end + window].text.lower()

        if before_words and after_words:  # Ensure both sides produced text
            surrounding_words.append((before_words, after_words))

    return Counter(surrounding_words)

# Print the 10 most frequent (three words before, three words after) pairs for each top phrase
for phrase in top_phrases:
    surrounding_word_freq = find_surrounding_words(doc, phrase)
    print(f"Phrase: '{phrase}' - Top 10 pairs of three words before and after:")
    for context, count in surrounding_word_freq.most_common(10):
        print(f"{context}: {count}")
    print("\n")

In [None]:
# Pattern: noun-verb

matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'NOUN'}, {'POS': 'VERB'}]
matcher.add('NOUN_VERB_PHRASE', [pattern])

matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after the loop: counting inside it rebuilt the Counter on every
# match and, with zero matches, silently kept the previous pattern's counts.
phrase_freq = Counter(phrases)

phrase_freq.most_common(20)

In [None]:
# Function to find surrounding words for a given phrase - five words immediately before and after (the window size is the 5 used in the function below)

top_phrases = [phrase for phrase, _ in phrase_freq.most_common(20)]

# Function to find the words before and after a given phrase (five on each side by default)
def find_surrounding_words(doc, phrase, window=5):
    """Count the (text before, text after) contexts around each occurrence of ``phrase``.

    ``phrase`` is tokenised with the notebook-level ``nlp`` pipeline and matched
    against ``doc`` on the lower-cased token text.

    Parameters
    ----------
    doc : spacy Doc to search.
    phrase : str, the phrase to look for.
    window : int, default 5
        Number of tokens to capture on each side of the phrase.

    Returns
    -------
    Counter keyed by ``(before_text, after_text)``, both lower-cased. Occurrences
    that do not have a full ``window`` of tokens on both sides are skipped.

    Raises
    ------
    ValueError if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    phrase_matcher.add("PhrasePattern", [nlp.make_doc(phrase)])

    surrounding_words = []

    for _, start, end in phrase_matcher(doc):
        # Skip occurrences too close to either end to have a full window
        if start < window or end + window > len(doc):
            continue

        before_words = doc[start - window:start].text.lower()
        after_words = doc[end:end + window].text.lower()

        if before_words and after_words:  # Ensure both sides produced text
            surrounding_words.append((before_words, after_words))

    return Counter(surrounding_words)

# Find and display the 10 most common (five words before, five words after) pairs
# for each top phrase. The header now says "five" to match the window used by
# find_surrounding_words in this cell; it previously said "three".
for phrase in top_phrases:
    surrounding_word_freq = find_surrounding_words(doc, phrase)
    print(f"Phrase: '{phrase}' - Top 10 pairs of five words before and after:")
    for pair, freq in surrounding_word_freq.most_common(10):
        print(f"{pair}: {freq}")
    print("\n")

In [None]:
# Pattern: noun-adjective

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'NOUN'}, {'POS': 'ADJ'}]
# Key renamed from 'ADJ_PHRASE' so it describes the noun-adjective pattern it labels
matcher.add('NOUN_ADJ_PHRASE', [pattern])

matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after the loop: counting inside it rebuilt the Counter on every
# match and, with zero matches, silently kept the previous pattern's counts.
phrase_freq = Counter(phrases)

phrase_freq.most_common(20)

In [None]:
# Pattern: adjective-noun-verb

matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}, {'POS': 'VERB'}]
# Key renamed from 'NOUN_ADV_ADJ_PHRASE', which described a different pattern
matcher.add('ADJ_NOUN_VERB_PHRASE', [pattern])

matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after the loop: counting inside it rebuilt the Counter on every
# match and, with zero matches, silently kept the previous pattern's counts.
phrase_freq = Counter(phrases)

phrase_freq.most_common(30)

In [None]:
# Pattern: noun-noun-adverb-adjective

matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'ADV'}, {'POS': 'ADJ'}]
# Key renamed from 'NOUN_ADV_ADJ_PHRASE' so it names all four tags in the pattern
matcher.add('NOUN_NOUN_ADV_ADJ_PHRASE', [pattern])

matches = matcher(doc, as_spans=True)
phrases = [span.text.lower() for span in matches]

# Count once, after the loop: counting inside it rebuilt the Counter on every
# match and, with zero matches, silently kept the previous pattern's counts.
phrase_freq = Counter(phrases)

phrase_freq.most_common(30)