In [2]:
# LOAD MODULES
from IPython.core.display import display, HTML # for max width
display(HTML("<style>.container { width:80% !important; }</style>"))  # widen the notebook container to 80% of the page
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
from spacy.tokens import Doc, Span, Token
import scattertext as st
from IPython.display import IFrame
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy import spatial
from scipy.spatial import distance
# cosine similarity = 1 - cosine distance
cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)
import spacy # software for analysing text
from spacy import displacy
nlp = spacy.load('en_core_web_lg') # NOTE: 'lg' is the large English model, not the medium ('md') one
import warnings
warnings.filterwarnings('ignore')  # suppress all warnings for the rest of the session

# LOAD FUNCTIONS
def compareMeanings(words):
    """Scatter-plot texts in two dimensions according to their spaCy vectors.

    Every item of ``words`` (a single word or a whole sentence) is passed
    through ``nlp`` and its ``.vector`` is projected onto two principal
    components with PCA. Each item is then drawn as a labelled point, so
    items with similar vectors end up close together.

    words: sequence of strings to compare.
    Returns nothing; the figure is shown with ``plt.show()``.
    """
    # Run the spaCy pipeline once per text and reuse the vectors for both
    # fitting and projecting.
    vectors = [nlp(word).vector for word in words]

    # reduce the vectors to two dimensions
    pca = PCA(n_components=2)
    pca.fit(vectors)
    word_vecs_2d = pca.transform(vectors)

    # create plot
    plt.figure(figsize=(5, 5))
    plt.scatter(word_vecs_2d[:, 0], word_vecs_2d[:, 1])
    # for each word and coordinate pair: draw the text on the plot
    for word, (x, y) in zip(words, word_vecs_2d):
        plt.text(x, y, word, size=15)
    # show the plot
    plt.show()
    
    
def mathsOnMeaning(start, subtract, add):
    """Print the vocabulary word most similar to ``start - subtract + add``.

    The three arguments are words looked up in ``nlp.vocab``. Candidate
    answers are the lexemes currently held in ``nlp.vocab`` that have a
    vector and are lower case; the three input words are excluded.
    Similarity is measured with ``cosine_similarity``.

    Prints the result and returns nothing.
    Raises ValueError if no candidate word is available.
    """
    x = nlp.vocab[start].vector - nlp.vocab[subtract].vector + nlp.vocab[add].vector
    inputs = {start, subtract, add}

    # Track only the best candidate instead of sorting the whole vocabulary.
    result = None
    best_similarity = None
    for word in nlp.vocab:
        if not (word.has_vector and word.is_lower):
            continue
        if word.text in inputs:
            continue

        similarity = cosine_similarity(x, word.vector)
        if np.isnan(similarity):
            # an undefined similarity cannot be ranked against the others
            continue
        # strict '>' keeps the first of several equally similar words
        if best_similarity is None or similarity > best_similarity:
            result, best_similarity = word.text, similarity

    if result is None:
        raise ValueError('no lower-case vocabulary word with a vector to compare against')
    print(f"'{start}' minus '{subtract}' add '{add}' = {result}")
    
def scattertextTag(tag):
    """Build a scattertext explorer restricted to tokens with one fine-grained tag.

    For every parsed document in ``df['text_doc']`` the tokens whose ``tag_``
    equals ``tag`` are kept, joined with spaces and re-parsed with ``nlp``;
    the result is stored in a new ``df['text_x']`` column (the global ``df``
    is modified). The explorer contrasts the 'story' category of
    ``df['post_type']`` with everything else.

    tag: value compared against each token's ``tag_``.
    Returns the HTML produced by ``st.produce_scattertext_explorer``.
    """
    def keep_tagged(parsed):
        # collect the text of the tokens carrying the requested tag
        kept = []
        for token in parsed:
            if token.tag_ == tag:
                kept.append(token.text)
        return nlp(' '.join(kept))

    df['text_x'] = df['text_doc'].apply(keep_tagged)

    corpus_factory = st.CorpusFromParsedDocuments(
        df,
        parsed_col='text_x',
        category_col='post_type',
    )
    corpus = corpus_factory.build()

    return st.produce_scattertext_explorer(
        corpus,
        category='story',
        category_name='Patient Criticism',
        not_category_name='Staff Responses',
        alternative_text_field='post_body',
        show_characteristic=False,
        width_in_pixels=800,
    )

def scattertextThemes(topics):
    """Build a scattertext explorer that plots topics instead of single words.

    ``topics`` is handed to ``st.FeatsFromTopicModel``; the resulting feature
    builder is used to extract features from the parsed documents in
    ``df['text_doc']``. The explorer contrasts the 'story' category of
    ``df['post_type']`` with everything else.

    topics: topic definition accepted by ``st.FeatsFromTopicModel``.
    Returns the HTML produced by ``st.produce_scattertext_explorer``.
    """
    topic_feature_builder = st.FeatsFromTopicModel(topics)

    # corpus whose features come from the topic model rather than raw terms
    corpus_factory = st.CorpusFromParsedDocuments(
        df,
        parsed_col='text_doc',
        category_col='post_type',
        feats_from_spacy_doc=topic_feature_builder,
    )
    corpus = corpus_factory.build()

    # plot settings, gathered in one place
    explorer_options = dict(
        category='story',
        category_name='Patient Criticism',
        not_category_name='Hospital Response',
        width_in_pixels=600,
        height_in_pixels=400,
        show_characteristic=False,
        use_non_text_features=True,
        show_top_terms=True,
        minimum_term_frequency=5,
        use_full_doc=True,
        p_value_colors=True,
        max_snippets=11,
        pmi_threshold_coefficient=0,
        topic_model_term_lists=topic_feature_builder.get_top_model_term_lists(),
    )
    return st.produce_scattertext_explorer(corpus, **explorer_options)

def findSentence(sentence_subject, sentence_objecet, sentence_contains):
    """Print the sentences in staff responses that match a subject/object pattern.

    Only rows of the global ``df`` whose ``post_type`` is 'response' are
    searched, using the parsed documents in ``df['text_doc']``. A sentence is
    printed (once) when all of the following hold:

    - some token whose dependency label contains 'subj' has its lower-cased
      text in ``sentence_subject``;
    - some token whose dependency label contains 'obj' has its lower-cased
      text in ``sentence_objecet``;
    - ``sentence_contains`` is empty, or some token's lower-cased text is in it.

    Because token text is lower-cased before comparison, the entries of the
    three collections should be lower case.

    sentence_subject:  collection of accepted subject words.
    sentence_objecet:  collection of accepted object words. The misspelt name
                       is kept so existing keyword callers keep working.
    sentence_contains: collection of words of which at least one must occur;
                       pass an empty collection to skip this filter.
    Returns nothing.
    """
    # Use the argument that was passed in; previously the body read a global
    # called `sentence_object` and ignored this parameter.
    sentence_object = sentence_objecet

    print('')
    responses = df.loc[df['post_type'] == 'response', 'text_doc']
    for doc in responses:
        for sent in doc.sents:
            if sentence_contains and not any(
                    tok.text.lower() in sentence_contains for tok in sent):
                continue
            if not any('obj' in tok.dep_ and tok.text.lower() in sentence_object
                       for tok in sent):
                continue
            if any('subj' in tok.dep_ and tok.text.lower() in sentence_subject
                   for tok in sent):
                # one print per matching sentence, however many tokens match
                print(sent)
                #displacy.render(sent, jupyter=True, style='dep')
# confirmation message, printed once all definitions above have run
print('Notebook loaded successfully')

# set up info
# IPython magics: report machine info (-m), Python/IPython versions (-v) and the versions of the listed packages (-p)
%load_ext watermark
%watermark -m -v -p pandas,numpy,scipy,statsmodels,matplotlib,seaborn,spacy,tqdm,watermark

# spacy model info
# shell escape: print information about the spaCy installation
!python -m spacy info 
# (disabled) the same command for one specific model; NOTE the notebook loads 'en_core_web_lg', not 'en_core_web_md'
#!python -m spacy info en_core_web_md

Notebook loaded successfully
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 3.7.6
IPython 7.12.0

pandas 0.25.1
numpy 1.17.2
scipy 1.4.1
statsmodels 0.11.1
matplotlib 3.1.3
seaborn 0.10.0
spacy 2.2.3
tqdm 4.43.0
watermark 2.0.2

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
CPU cores  : 12
interpreter: 64bit
[1m

spaCy version    2.2.3                         
Location         C:\Users\alexg\Anaconda3\envs\PyR_202003_pandas_0_25_3\lib\site-packages\spacy
Platform         Windows-10-10.0.18362-SP0     
Python version   3.7.6                         
Models                                         

[1m

lang             en                            
name             core_web_md                   
license          MIT                           
author           Explosion                     
url              https://explosion.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Jupyter-notebook" data-toc-modified-id="Jupyter-notebook-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Jupyter notebook</a></span></li><li><span><a href="#Natural-Language-Processing-(NLP)" data-toc-modified-id="Natural-Language-Processing-(NLP)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Natural Language Processing (NLP)</a></span><ul class="toc-item"><li><span><a href="#Pre-2013:-Word-Frequency" data-toc-modified-id="Pre-2013:-Word-Frequency-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Pre-2013: Word Frequency</a></span></li><li><span><a href="#Post-2013" data-toc-modified-id="Post-2013-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Post-2013</a></span></li></ul></li><li><span><a href="#Word-embeddings" data-toc-modified-id="Word-embeddings-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Word embeddings</a></span><ul class="toc-item"><li><span><a href="#Find-words-used-in-similar-contexts" data-toc-modified-id="Find-words-used-in-similar-contexts-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Find words used in similar contexts</a></span></li><li><span><a href="#Comparing-word-meaning?" data-toc-modified-id="Comparing-word-meaning?-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Comparing word meaning?</a></span></li><li><span><a href="#Comparing-sentence-meaning?" data-toc-modified-id="Comparing-sentence-meaning?-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Comparing sentence meaning?</a></span></li><li><span><a href="#Scoring-sentences-against-target-meaning" data-toc-modified-id="Scoring-sentences-against-target-meaning-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Scoring sentences against target meaning</a></span></li><li><span><a href="#Adding-and-subtracting-meanings?" 
data-toc-modified-id="Adding-and-subtracting-meanings?-4.5"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>Adding and subtracting meanings?</a></span></li><li><span><a href="#'Thought-vectors'" data-toc-modified-id="'Thought-vectors'-4.6"><span class="toc-item-num">4.6&nbsp;&nbsp;</span>'Thought vectors'</a></span></li></ul></li><li><span><a href="#Hospital-Staff-Responding-to-Criticism" data-toc-modified-id="Hospital-Staff-Responding-to-Criticism-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Hospital Staff Responding to Criticism</a></span><ul class="toc-item"><li><span><a href="#Research-context" data-toc-modified-id="Research-context-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Research context</a></span></li><li><span><a href="#Comparing-perspectives:-words" data-toc-modified-id="Comparing-perspectives:-words-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Comparing perspectives: words</a></span></li><li><span><a href="#Comparing-perspectives:-themes" data-toc-modified-id="Comparing-perspectives:-themes-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Comparing perspectives: themes</a></span></li><li><span><a href="#Which-responses-have-least-perspective-taking?" data-toc-modified-id="Which-responses-have-least-perspective-taking?-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Which responses have least perspective taking?</a></span></li><li><span><a href="#Which-responses-have-most-psychological-distancing?" data-toc-modified-id="Which-responses-have-most-psychological-distancing?-5.5"><span class="toc-item-num">5.5&nbsp;&nbsp;</span>Which responses have most psychological distancing?</a></span></li></ul></li><li><span><a href="#Conclusion" data-toc-modified-id="Conclusion-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Conclusion</a></span></li></ul></div>

# Introduction

# Jupyter notebook

[Jupyter notebooks](https://jupyter.org/) (and more recently Jupyter Lab) are language agnostic notebooks. They can run many languages, especially: **Ju**lia, **Pyt**hon, and **R**. 

Initiated in 2014, Jupyter notebooks have rapidly become a [dominant platform for data analysis](https://www.nature.com/articles/d41586-018-07196-1). There is good reason to believe that they will eventually [replace the scientific article](https://www.theatlantic.com/science/archive/2018/04/the-scientific-paper-is-obsolete/556676/) as we know it.

Scientific articles are texts that talk about data and analysis - but they don't allow readers to interact with the data or the analysis. Notebooks are a step-change in scientific communication because they combine text, data, and analysis in one easy-to-read and reproducible package that is easy to share and interact with ([gallery of interesting notebooks](https://github.com/jupyter/jupyter/wiki/A-gallery-of-interesting-Jupyter-Notebooks)).

# Natural Language Processing (NLP)

NLP refers to using computers to search, manipulate and respond to human language.

It is closely connected to AI, Machine Learning, and Big Data.

The driving motive is that Google, Facebook, etc. are investing vast resources in making computers understand text, image and sound in a way that matters for humans.

## Pre-2013: Word Frequency

To date, most psychological research using computers to analyse text has focused on word frequency. Because the words we speak are closely connected to our thought patterns, this has produced several interesting results. For example:
- Identifying [authorship in Shakespeare's plays](http://elizabethan-theatre.org/wps/wp-content/uploads/2015/07/Double-Falsehood-by-Ryan-Boyd-Psychological-Science-2015.pdf)
- Longitudinal studies of Mayor [Giuliani's adaptation to the crisis of 9/11](https://www.sciencedirect.com/science/article/abs/pii/S0092656602923494)
- Detecting people's [personality from their writing style](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2885844/?_escaped_fragment_=po=2.77778)
- Monitoring [public anxiety](https://www.tandfonline.com/doi/abs/10.1080/10410236.2011.571759) and public mood (which is [associated with stock prices](https://arxiv.org/pdf/1010.3003&))

But, all these studies use fairly basic word count techniques, or, slightly more advanced sentiment analysis.

## Post-2013 

Word vectors, or 'word embeddings', were developed in the late 1990s by Yoshua Bengio, Geoffrey Hinton, et al. But their significance was only appreciated in about 2013.

Before about 2013, most computational analysis of words focused on the textual appearance of a word; for example, 'dog' was similar to 'doggy' (same stem), similar to 'god' (same letters) and 'hog' (2 letters in the same order), but without any similarity to 'cat' (no similar letters). There seemed to be no way to make computers understand that 'dog' and 'cat' are both family pets and thus semantically related.

The breakthrough came from Wittgenstein's (1953; also Firth, 1957) insight: the meaning of words comes from the context in which they are used. Consider the sentence 'the X sat on the mat' - most people would agree that 'dog' and 'cat' are plausible candidates for X. That is to say, 'dog' and 'cat' occupy similar roles within similar sentences.

Word embeddings encode the meaning of words not in terms of the words themselves, but, in terms of the context in which the words appear. By analyzing billions of webpages and books, statistical models are built based on which words occur in the same context. These models _seem_ to encode meaning.

# Word embeddings

Google and other companies are using word embeddings to [map the meaning of all words](https://projector.tensorflow.org/) - created by analysing the sentence context in which words are used across _all_ books, webpages, news sources and other archives.

## Find words used in similar contexts

In [None]:
def getTokensNearVector(nlp, vector, limit=300, debug=False):
    '''
    Print the vocabulary tokens whose vectors lie closest to a target vector.

    nlp    : object exposing the vector table as ``nlp.vocab.vectors`` and the
             vocabulary as ``nlp.vocab``; it is also called to parse string input
    vector : the target, given as one of
             - a numpy array (used as-is; does not need to be based on a word)
             - a string (parsed with ``nlp``; the vectors of its alphabetic
               tokens are averaged, and the tokens of the string are left out
               of the printed result)
             - a list (only the first item is used, which must itself be an
               array or a string)
    limit  : number of nearest vocabulary entries inspected; the printed list
             can be shorter because entries are lower-cased and de-duplicated
    debug  : if True, print/display intermediate values

    Distances are computed with ``distance.cdist`` using its default metric.
    Nothing is returned; the result is printed.

    Raises TypeError for an unsupported ``vector`` type and ValueError for an
    empty list or a string without alphabetic tokens.
    '''

    # INPUT CHECKS
    # if input is a list: unwrap it, then treat the first item like any other input
    if isinstance(vector, list):
        if not vector:
            raise ValueError('cannot take the first item of an empty list')
        print('THIS IS A LIST - TAKING FIRST ITEM')
        vector = vector[0]

    # lower-cased tokens of a string input; they are excluded from the output
    input_tokens = set()

    # if input is string
    if isinstance(vector, str):
        doc = nlp(vector)
        token_vectors = [tok.vector for tok in doc if tok.is_alpha]
        if not token_vectors:
            raise ValueError(f'no alphabetic tokens to average in {vector!r}')
        input_tokens = {tok.lower_ for tok in doc}
        # get mean vector
        vector = sum(token_vectors) / len(token_vectors)
        input_label = 'INPUT IS A STRING'
    # if input is vector
    elif isinstance(vector, np.ndarray):
        input_label = 'INPUT IS A VECTOR'
    else:
        raise TypeError(f'UNKNOWN TYPE: {type(vector)}')

    # Format the input vector for use in the distance function
    p = [np.array(vector)]
    if debug:
        print(input_label)
        print('type(p):', type(p))

    # https://stackoverflow.com/questions/54717449/mapping-word-vector-to-the-most-similar-closest-word-using-spacy

    # Format the vocabulary for use in the distance function
    ids = list(nlp.vocab.vectors.keys())
    vectors1 = [nlp.vocab.vectors[x] for x in ids]
    vectors2 = np.array(vectors1)

    if debug:
        print('THIS IS THE FORMATTED VECTOR (p) BEING SENT TO THE DISTANCE FUNCTION')
        print(p)
        print('THESE ARE THE IDS')
        print(ids[:100])
        print('THESE ARE vectors1 - BEFORE BEING FORMATTED')
        print(vectors1[:2])
        print('THESE ARE vectors2 - ALL NLP VECTORS FORMATTED FOR COMPARISON BY DIST FUNCTION')
        print(vectors2[:2])

    # measure distances
    dist = distance.cdist(p, vectors2)[0]

    # rank the vocabulary rows by distance; the frame's index keeps the
    # position of each row in `ids`
    dist_df = pd.DataFrame({'distance': dist})
    dist_df.sort_values('distance', inplace=True)

    if debug:
        print('THIS IS THE DATAFRAME OF DISTANCES')
        display(dist_df)

    # collect results: lower-cased, without duplicates or input tokens,
    # nearest first
    words = []
    seen = set(input_tokens)
    for i in dist_df.index[:limit]:
        output_word = nlp.vocab[ids[i]].text.lower()
        if output_word not in seen:
            seen.add(output_word)
            words.append(output_word)

    print('\nTOKENS NEAR VECTOR:   ', ' '.join(words), '\n')

Try single words, such as: 'blue', 'summer', 'university' or 'psychology'. <br>
Also, try combinations of words, such as 'sad happy' (which yields 'hope').

In [None]:
# string input: the vectors of its alphabetic tokens are averaged before the search
getTokensNearVector(nlp, 'happy angry')

## Comparing word meaning?
Most people would agree that 'dog' is more similar to 'cat' than 'road' - and the models agree:

In [None]:
# each word becomes one labelled point in the 2-D plot
words = ['dog', 'cat', 'car', 'road', 'traffic', 'ideas']
    
compareMeanings(words)

## Comparing sentence meaning?

We can also use these models to compare sentences. Consider the following (all using unique words). Just by averaging the word embeddings for each sentence, we can see, that the model corresponds broadly to our human judgement. <br>
<br>
Change the sentences and re-run the cell to see the result.

In [None]:
# whole sentences can be passed too: compareMeanings uses the vector of each parsed text
sentences = ['the pilot flew the plane', 
             'trees do not grow on rocks',
             'i broke my arm', 
             'he snapped his leg',
             'a helicopter soared up',
             'plants need lots of water']

compareMeanings(sentences)

## Scoring sentences against target meaning
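The idea here comes from [Garten et al. (2018)](https://link.springer.com/article/10.3758/s13428-017-0875-9), who argue that a concept can be represented by averaging the word embeddings of a few seed terms, so that any text can be scored by its similarity to that averaged representation.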

In [None]:
# the target meaning; change it and re-run to score against another concept
seed_terms = 'died'

sentences = ["Superb excellent ++",
             'accessible car park',
             'my arm broke',
             'she was dying',
             'he died later',
             'died died died']

# parse the seed terms once instead of once per sentence
seed_doc = nlp(seed_terms)

for sent in sentences:
    score = round(nlp(sent).similarity(seed_doc), 2)
    # the label follows seed_terms, so it stays correct when the seed changes
    print(f"Similarity to '{seed_terms}':  {score:<4}  '{sent}'")

## Adding and subtracting meanings?
If words are represented as numbers, then, can we do mathematics with the meanings?

What if we start with the meaning of 'king' subtract 'man' and add 'woman' - what would the outcome be?

In [None]:
def mathsOnMeaning(start, subtract, add):
    """Print the vocabulary word most similar to ``start - subtract + add``.

    The three arguments are words looked up in ``nlp.vocab``. Candidate
    answers are the lexemes currently held in ``nlp.vocab`` that have a
    vector and are lower case; the three input words are excluded.
    Similarity is measured with ``cosine_similarity``.

    Prints the result and returns nothing.
    Raises ValueError if no candidate word is available.
    """
    x = nlp.vocab[start].vector - nlp.vocab[subtract].vector + nlp.vocab[add].vector
    inputs = {start, subtract, add}

    # Track only the best candidate instead of sorting the whole vocabulary.
    result = None
    best_similarity = None
    for word in nlp.vocab:
        if not (word.has_vector and word.is_lower):
            continue
        if word.text in inputs:
            continue

        similarity = cosine_similarity(x, word.vector)
        if np.isnan(similarity):
            # an undefined similarity cannot be ranked against the others
            continue
        # strict '>' keeps the first of several equally similar words
        if best_similarity is None or similarity > best_similarity:
            result, best_similarity = word.text, similarity

    if result is None:
        raise ValueError('no lower-case vocabulary word with a vector to compare against')
    print(f"'{start}' minus '{subtract}' add '{add}' = {result}")

Here are the words near 'died dead and dying'. 

In [None]:
# array input: the vector of the parsed one-word text 'died'
v = nlp('died').vector
getTokensNearVector(nlp, v)

But, people tend to die in two ways: unintentional and intentional (murder). <br>
To find words associated with intentional death, we subtract the concept of 'accident' and add the concept 'murder'.

In [None]:
# 'died' minus 'accident' plus 'murder'
v = nlp('died').vector - nlp('accident').vector + nlp('murder').vector
getTokensNearVector(nlp, v)

Or, we can look at the words associated with 'died' excluding the 'murder' and focusing on 'accidents'.

In [None]:
# 'died' plus 'accident' minus 'murder'
v = nlp('died').vector + nlp('accident').vector - nlp('murder').vector
getTokensNearVector(nlp, v)

In [None]:
# 'rome' plus the vector of the two-word text 'uk england'
v = nlp('rome').vector + nlp('uk england').vector 
getTokensNearVector(nlp, v)

In [None]:
# array input: the vector of the parsed one-word text 'austria'
v = nlp('austria').vector
getTokensNearVector(nlp, v)

In [None]:
from scipy.spatial import distance
import numpy as np

# 'doctor' minus 'man' add 'woman'
# (alternative to try: start = 'he', subtract = 'doctor', add = 'woman')
start = 'doctor'
subtract = 'man'
add = 'woman'

v = nlp.vocab[start].vector - nlp.vocab[subtract].vector + nlp.vocab[add].vector

# Format the vocabulary for use in the distance function
ids = list(nlp.vocab.vectors.keys())
vectors2 = np.array([nlp.vocab.vectors[x] for x in ids])

# measure distances
dist = distance.cdist([np.array(v)], vectors2)[0]

# Rank the vocabulary rows by distance. The frame is called dist_df so that
# the corpus DataFrame `df`, which scattertextTag, scattertextThemes and
# findSentence read as a global, is not overwritten by running this cell.
dist_df = pd.DataFrame({'distance': dist})
dist_df.sort_values('distance', inplace=True)

# lower-cased, de-duplicated tokens among the 50 nearest entries
words = []
for i in dist_df.index[:50]:
    output_word = nlp.vocab[ids[i]].text.lower()
    if output_word not in words:
        words.append(output_word)

# TODO: remove the input tokens (start, subtract, add) from the list
words

In [None]:
# string input: the vectors of 'rome' and 'pizza' are averaged before the search
getTokensNearVector(nlp, 'rome pizza')

In [None]:
from scipy import spatial
# cosine similarity = 1 - cosine distance (same helper as defined in the setup cell)
cosine_similarity = lambda x, y: 1-spatial.distance.cosine(x, y)
# compare the vocabulary vectors of two words
print("apple vs banana: ", cosine_similarity(nlp.vocab['apple'].vector, nlp.vocab['banana'].vector))

In [None]:
# 'king' minus 'man' add 'woman'; the three input words are excluded from the answer
start = 'king'
subtract = 'man'
add = 'woman'

mathsOnMeaning(start, subtract, add)

- Try: 'uk' minus 'london' add 'geneva'
- Try: 'berlin' minus 'germany' add 'france'
- Try: 'india' minus 'curry' add 'pizza'
- Try: 'man' minus 'boy' add 'girl'
- Try: 'paris' minus 'france' add 'uk'

## 'Thought vectors'

Geoffrey Hinton, one of the developers of 'deep learning' techniques, talks about the above examples that focus on single words as only the first step: he (and Google) are working towards 'thought vectors'. 

While the above examples represent the meanings of words in a 300 dimensional mathematical space, they aim to map thoughts, ideas, sentences, and even whole articles and books into a multi-million dimensional space.

Transcripts, texts, and even audio-visual data will become open to high-level semantic analysis (meaning clusters, emotion, contradictions, meta-perspectives and dialogical tensions).

- Longitudinal analyses of an individual life-course
- Comparison of perspectives of groups in conflict
- Identifying the emergence of novelty and new ideas

And, all done, on any size of dataset, in real-time, and for almost zero cost.

# Hospital Staff Responding to Criticism 
Accidents in hospitals are a leading cause of death. At least 10% of people going into hospital come out with a new health problem. Causes include: delays, hospital infections, errors, wrong-site surgery, misdiagnosis, medication errors, not reading patient notes etc. 

While aviation, construction, and heavy industry have all become much safer over the last 50 years, healthcare has remained dangerous, and seems resistant to improvement. Some hospitals have a 'blame culture' with high defensiveness.

One idea is that medical staff could learn from patients: patients know their own bodies, are usually the only person who has been at all the meetings, and have increasing access to medical knowledge.

## Research context
Question: How do healthcare staff respond to critical feedback?
    
Data: Online discussions between patients and staff in the UK (about 250k paired dialogs)
    
- What is the divergence of perspective between patients and staff?

- How are staff listening to and/or ignoring the patient point of view?

In [None]:
# loading the data
# `df` is the global corpus frame read by scattertextTag, scattertextThemes and findSentence
df = pd.read_csv('co_crit3.csv')
# parse every post body with the spaCy pipeline and keep the result in 'text_doc'
df['text_doc'] = df['post_body'].apply(nlp)
# number of rows per post type
df['post_type'].value_counts()

## Comparing perspectives: words
The plot below compares the text of 'patient stories' with 'staff responses' in terms of past-tense verbs.
- The vertical axis is for patients; horizontal axis is for staff
- Words in the top-right are common to both patients and staff (i.e., was, were, did, had)
- Words in the top-left are peculiar to patients (i.e., took, died, called, started, tried, refused)
- Words in the bottom-right are peculiar to staff (i.e., raised, expectations, experiences, mentioned, expressed)
- Clicking on a word 'died' shows the original text (85 patient posts, 1 staff post)


In [None]:
# The plot is not regenerated here: the two commented-out lines would build it with
# scattertextTag('VBD') and save it. As written, an already saved file is displayed.
#html = scattertextTag('VBD')
file_name = 'co_crit3_past.html'
#open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=1000)

## Comparing perspectives: themes
Based on an examination of the patient and staff text, we can formulate two ideas.

1. Patients seem to be talking about 'real' clinical issues.

2. Staff seem reluctant to address these clinical issues, and prefer to talk about 'valuing feedback' and 'patient experience'

To test these ideas, we can make some 'topic themes' that we then map into the semantic space.

In [None]:
# theme name -> words that define the theme
themes = {
    'clinical': ['clinical', 'unsafe', 'doctor', 'nurse', 'operation', 'diagnosis', 'misdiagnosis'],
    'pain': ['agony', 'suffering', 'pains', 'suffer', 'screaming'],
    'feedback': ['value', 'feedback', 'learning', 'listen', 'listening'],
    'experience': ['experience', 'concern', 'concerns', 'feelings']
}

html = scattertextThemes(themes)
file_name = 'co_crit3_themes.html'
# write inside a context manager so the file is flushed and closed before the
# IFrame loads it
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1000, height=700)

## Which responses have least perspective taking?

Using the 'word embeddings' method, can we identify the staff response with the least perspective taking?

Method: compare the word vectors for the staff responses with the original patient story and find the biggest difference.


In [None]:
# position in the sorted frame: 0 is the row with the lowest similarity, 1 the next one
df_index = 1
# sort ascending so the least similar rows come first
# NOTE(review): assumes 'similarityToOrigin' and 'thread_originText' are columns of the loaded CSV - confirm
df = df.sort_values(by='similarityToOrigin', ascending=True, axis='index').reset_index(drop=True)
selected_row = df.iloc[df_index]
# read each field by its label; indexing the row with [0] relied on the
# positional fallback of Series.__getitem__
print(f"ORIGINAL POST\n{selected_row['thread_originText']}")
print(f"STAFF RESPONSE\n{selected_row['post_body']}")

The distressed cancer patient had the glands that produce calcium damaged during surgery, resulting in paralysis, and was given conflicting information about a blood clot in her brain.

Despite the clinical error, her main concern is the rudeness of staff who have ignored her feelings.

The staff response is generic: 'Thank you for your feedback'

## Which responses have most psychological distancing?
Psychological distancing (not distanciation) refers to defensive routines that hold problematic perspectives at a semantic distance, so as to neutralise their transformative potential. 

One type of psychological distancing is to refer to problematic perspectives as 'beliefs' or 'experiences'.

In [None]:
# findSentence lower-cases token text before comparing, so these entries are lower case
sentence_subject = ['you']
sentence_object = ['experience']
sentence_contains = ['impression', 'subjective']

findSentence(sentence_subject, sentence_object, sentence_contains)

The above excerpt has two types of psychological distancing:

First: The distressed patient is writing about a formal complaint they have submitted to the hospital, which has been ignored, and which they cannot get any feedback on (they have tried ringing and calling). Accordingly, the patient has resorted to a public post. Despite multiple failings by the hospital (known to the hospital) the problem is described in a psychologising manner:
- 'the experience'
- 'you would feel'
- 'your concerns'
- 'this impression'

Second: The staff (as a 'we') psychologise themselves, thus distancing themselves from implications for action
- 'we understand'
- 'we are sorry'


# Conclusion

My substantive interest is to understand the ways that people don't listen. 
- distancing (psychologising problems)
- denial (ignoring topics)
- denigration (undermining the motive, stigma, expertise)
- rationalisation (individualising problems as one-off problem)

These strategies of not-listening guard the threshold between self and other; creating psychological comfort at the cost of transformative dialogue. 

If we want to understand how 'the social' leads to change, we need to understand 'semantic contact' - how alternative perspectives are accepted, rejected or modified.

But, for the purposes of our discussion, I also want to raise methodological questions:
- Is there a meeting between sociocultural psychology and NLP?
- What can NLP do for sociocultural psychology? - extra evidence, new tools, scale-up analyses?
- What can sociocultural psychology do for NLP? - conceptualizing the psychology of 'thought vectors', understanding the inherent dialogicality of language?
