In [27]:
import pandas as pd
import numpy as np
from spacy.lang.en.stop_words import STOP_WORDS
import re
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
def get_ew_multiplied_vector(phrase_list, sem_space):
    """Compose a phrase into one vector by element-wise multiplication.

    Each term of ``phrase_list`` is looked up as a row label of ``sem_space``;
    the rows that are found are multiplied together element by element.

    Parameters
    ----------
    phrase_list : iterable of str
        Terms of the phrase.
    sem_space : pandas.DataFrame
        Semantic space with terms as row labels and one column per dimension.

    Returns
    -------
    numpy.ndarray or float
        Element-wise product of the vectors of all terms that were found, or
        ``np.nan`` when none of the terms is present in ``sem_space``.
    """
    vectors_list = []
    for term in phrase_list:
        try:
            vectors_list.append(np.array(sem_space.loc[term].values.tolist()))
        except KeyError:
            # Only an out-of-vocabulary term is skipped. Any other failure
            # (e.g. an invalid semantic space) must surface instead of being
            # reported as a missing word.
            print('cannot find '+term)

    # no term of the phrase is known: signal it with NaN
    if len(vectors_list)==0: return np.nan

    # start from the multiplicative identity and fold every term vector in
    element_wise_multiplied_vector = np.ones(len(sem_space.columns))
    for vector in vectors_list:
        element_wise_multiplied_vector = element_wise_multiplied_vector * vector

    return element_wise_multiplied_vector
def lemmatize(text, stopwords_list):
    """Turn a raw response string into a list of lemmatized tokens.

    Steps, in order: "/" and "-" become spaces, all remaining punctuation is
    removed, the text is tokenized and lower-cased, tokens contained in
    ``stopwords_list`` are dropped, and every surviving token is passed to
    ``WordNetLemmatizer.lemmatize`` (called without a part-of-speech argument).

    Note that stopwords are matched against the lower-cased surface form,
    i.e. before lemmatization.
    """
    # slashes and hyphens separate words, so they become spaces rather than
    # being deleted together with the rest of the punctuation
    spaced = re.sub("/|-", " ", text)
    stripped = spaced.translate(str.maketrans('', '', string.punctuation))

    wordnet = WordNetLemmatizer()
    lemmas = []
    for raw_token in word_tokenize(stripped):
        token = raw_token.lower()
        if token in stopwords_list:
            continue
        lemmas.append(wordnet.lemmatize(token))

    return lemmas


def calc_centroid(vector_list):
    """Compute the centroid of a list of vectors as the average of the normalized vectors.

    Each vector is divided by its ``np.linalg.norm`` before being summed; the
    sum is then divided by the number of vectors.
    """
    dim = len(vector_list[0])  # the length of the embedded vector
    # built-in sum adds the terms left to right onto the zero vector
    normalized_total = sum((vec/np.linalg.norm(vec) for vec in vector_list), np.zeros(dim))
    return normalized_total/len(vector_list)
def get_cosine_distance(feature_vec_1, feature_vec_2):
    """Return the cosine distance (1 - cosine similarity) between two vectors.

    Both inputs are reshaped to a single row before being handed to
    ``cosine_similarity``; the only entry of the resulting matrix is used.
    """
    row_1 = feature_vec_1.reshape(1, -1)
    row_2 = feature_vec_2.reshape(1, -1)
    similarity = cosine_similarity(row_1, row_2)[0][0]
    return (1 - similarity)


def calc_diversity(vector_list):
    """Main function to compute the diversity metric.

    ``vector_list`` holds the responses from the same participant for a single
    prompt; each response is already converted to an embedded vector using a
    certain composition.

    Returns a tuple ``(max_distance, rms_distance)``: the maximum and the root
    mean square of the cosine distances between every vector and the centroid.
    The two are equivalent in some sense; the root mean square might behave
    slightly better as the diversity metric.
    """
    center = calc_centroid(vector_list)
    distances = [get_cosine_distance(center, vec) for vec in vector_list]
    root_mean_square = np.sqrt(np.mean(np.array(distances)**2))
    return np.max(distances),root_mean_square


def dispersion_vectors(responses,sp):
    """Score the dispersion of a set of tokenized responses.

    Parameters
    ----------
    responses : list of list of str
        One token list per response.
    sp : pandas.DataFrame
        Semantic space handed to ``get_ew_multiplied_vector``.

    Returns
    -------
    tuple
        The pair returned by ``calc_diversity`` for the response vectors, or
        ``(np.nan, np.nan)`` when fewer than two responses are given or fewer
        than two of them can be converted to a vector.
    """
    undefined = np.nan,np.nan
    if len(responses)<2:
        return undefined
    response_vector = [get_ew_multiplied_vector(x, sp) for x in responses]
    # get_ew_multiplied_vector returns a float NaN when no term of a response
    # is in the semantic space; such responses carry no vector and are dropped
    response_vector = [x for x in response_vector if not isinstance(x, float)]
    # Re-apply the minimum-size rule after filtering: an empty list would crash
    # calc_centroid and a single vector has no meaningful dispersion.
    if len(response_vector)<2:
        return undefined
    return calc_diversity(response_vector)


# Load semantic space to a dictionary
# Load stopwords

In [4]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
folder='/Users/yyu/Box Sync/ORG-SCHOOL-WCAS-PSYCHOLOGY-BEEMAN-LAB/COAT/scoring/'
sp='cbow_6_ukwac_subtitle'
# The file is read as space-delimited text without a header row; column 0
# becomes the row index, so rows can later be looked up by label.
semspace_dict={
    sp: pd.read_csv(
        folder+'/semantic_spaces/'+sp+'.txt', delimiter = " ", header = None
    ).set_index(0),
}

In [8]:
# spaCy's English stop words followed by three extra words that are also
# treated as stop words
stopwords_edited = [*STOP_WORDS, "thing", "use", "things"]

# Dispersion Example

In [12]:
# Example input: a prompt word and a set of free-text responses to it
prompt = "table"
responses = [
    "stand on top to dance",
    "block door",
    "burn it",
]

In [23]:
# Tokenize/lemmatize every response, then drop tokens equal to the prompt itself
responses_vec = [
    [token for token in lemmatize(response, stopwords_edited) if token != prompt]
    for response in responses
]
responses_vec

[['stand', 'dance'], ['block', 'door'], ['burn']]

In [29]:
# Two outputs here (see calc_diversity): the maximum and the root mean square of the
# cosine distances to the centroid. They are mostly equivalent; the second is better.
dispersion_vectors(responses_vec,semspace_dict[sp])

(0.4248608392207669, 0.38370818514983984)