# Flexibility Analysis

## Algorithm to Automate Flexibility Scoring

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np

import os
from os.path import expanduser
import glob

from scipy import stats
from scipy.stats import pearsonr

import shared_functions as sf
from shared_functions import *

## Read in Data to Analyze

In [2]:
# Working directory at execution time; the result-file glob patterns in the
# loading cells are built by prefixing this path.
cwd = os.getcwd()

In [3]:
# Load every official novelty result CSV into a dict of DataFrames.
# NOTE(review): the key is sliced out of the globbed path at the fixed offset
# 122, and that path starts with cwd, so the offset only fits one absolute
# checkout location -- confirm before running from a different directory.
novelty_pattern = cwd + '/..//results/results_methods/novelty_results/novelty_071321/*.csv'
novelty_dict = {}
for filename in glob.glob(novelty_pattern):
    # [-4] strips the '.csv' extension that every match of the pattern ends with
    result_name = filename[122:-4]
    novelty_dict[result_name] = pd.read_csv(filename)

In [4]:
# Load every official flexibility result CSV into a dict of DataFrames.
# NOTE(review): the key is sliced out of the globbed path at the fixed offset
# 157, and that path starts with cwd, so the offset only fits one absolute
# checkout location -- confirm before running from a different directory.
flexibility_pattern = cwd + '/..//results/results_methods/flexibility_results/flexibility_071521/*.csv'
flexibility_dict = {}
for filename in glob.glob(flexibility_pattern):
    # [-4] strips the '.csv' extension that every match of the pattern ends with
    result_name = filename[157:-4]
    flexibility_dict[result_name] = pd.read_csv(filename)

In [5]:
# Load every official originality frequency CSV into a dict of DataFrames.
# NOTE(review): the key is sliced out of the globbed path at the fixed offset
# 169, and that path starts with cwd, so the offset only fits one absolute
# checkout location -- confirm before running from a different directory.
originality_pattern = cwd + '/..//results/results_methods/originality_results/originality_071521/freqs/*.csv'
originality_dict = {}
for filename in glob.glob(originality_pattern):
    # [-4] strips the '.csv' extension that every match of the pattern ends with
    result_name = filename[169:-4]
    originality_dict[result_name] = pd.read_csv(filename)

In [6]:
# Load every official fluency result CSV into a dict of DataFrames.
# NOTE(review): the key is sliced out of the globbed path at the fixed offset
# 145, and that path starts with cwd, so the offset only fits one absolute
# checkout location -- confirm before running from a different directory.
fluency_pattern = cwd + '/..//results/results_methods/fluency_results/fluency_071521/*.csv'
fluency_dict = {}
for filename in glob.glob(fluency_pattern):
    # [-4] strips the '.csv' extension that every match of the pattern ends with
    result_name = filename[145:-4]
    fluency_dict[result_name] = pd.read_csv(filename)

## Comparing Flexibility Results with Human Ratings

In [7]:
# create pvalues matrix
def calculate_pvalues(df):
    """Return the matrix of Pearson p-values for every pair of numeric columns.

    Rows containing any missing value are dropped first (listwise deletion,
    applied before the non-numeric columns are removed), then only the
    numeric/boolean columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by the numeric column names,
        in their original order, holding two-sided p-values rounded to 4
        decimal places.
    """
    numeric_df = df.dropna().select_dtypes(include=['number', 'bool'])
    columns = numeric_df.columns
    n_cols = len(columns)

    # Fill a plain array by position and wrap it once at the end. Writing
    # through chained indexing (pvalues[r][c] = ...) targets a temporary
    # object under pandas Copy-on-Write and would leave the matrix empty.
    matrix = np.full((n_cols, n_cols), np.nan)
    for i in range(n_cols):
        # The p-value is symmetric in its two arguments, so compute the upper
        # triangle (diagonal included) and mirror it.
        for j in range(i, n_cols):
            pvalue = round(pearsonr(numeric_df.iloc[:, i], numeric_df.iloc[:, j])[1], 4)
            matrix[i, j] = pvalue
            matrix[j, i] = pvalue

    return pd.DataFrame(matrix, index=columns, columns=columns)

### Correlations for Each Prompt
#### Between Human Rater and Method

In [8]:
# calculate the correlation once per prompt
def flexibility_corr(flexibility_dict):
    """Correlate 'flex_1' with 'flex_method_avg' separately for each prompt.

    Parameters
    ----------
    flexibility_dict : dict
        Maps a prompt name to a DataFrame that has 'flex_1' and
        'flex_method_avg' columns.

    Returns
    -------
    pandas.DataFrame
        One row per prompt (index = the dict keys, in dict order) with the
        Pearson coefficient in 'corrs' and its p-value in 'pval'.
    """
    prompts = list(flexibility_dict.keys())

    # one (coefficient, p-value) result per prompt, in key order
    pearson_rows = [
        stats.pearsonr(flexibility_dict[prompt]['flex_1'],
                       flexibility_dict[prompt]['flex_method_avg'])
        for prompt in prompts
    ]

    return pd.DataFrame(pearson_rows, index=prompts, columns=['corrs', 'pval'])

In [9]:
# Per-prompt Pearson correlation (and p-value) between 'flex_1' and 'flex_method_avg'.
flexibility_corr(flexibility_dict)

Unnamed: 0,corrs,pval
box,0.477839,2e-06
brick,0.225879,0.033304
chair,0.302757,0.003726
cup,0.344469,0.000948
key,0.379652,0.000224
pencil,0.446145,1e-05
rope,0.162325,0.126369
shoe,0.465522,4e-06


## Comparing Flexibility Results with Novelty Results

In [10]:
# Inspect which columns one of the loaded novelty result tables provides.
novelty_dict['box_TASA_results'].columns

Index(['id', 'response', 'response_nofill', 'response_processed',
       'response_processed_phrase', 'item', 'item_nofill', 'SemDis_factor',
       'SemDis_cbowukwacsubtitle_nf_m', 'SemDis_cbowsubtitle_nf_m',
       'SemDis_cbowBNCwikiukwac_nf_m', 'SemDis_TASA_nf_m', 'SemDis_glove_nf_m',
       'SemDis_MEAN', 'ewm_vector_cosine_dis',
       'ewm_vector_cosine_dis_clus_avg', 'minima_vector_cosine_dis',
       'minima_vector_cosine_dis_clus_avg',
       'minima_vector_cosine_dis_clus_min', 'novelty_1', 'novelty_2',
       'novelty_m'],
      dtype='object')

In [11]:
def get_novelty_avg_semdis(collapse_df):
    """Average the vector-distance, SemDis and novelty_* columns per participant.

    Parameters
    ----------
    collapse_df : pandas.DataFrame
        Table with an 'id' column plus every source column named in
        ``column_map`` below.

    Returns
    -------
    pandas.DataFrame
        One row per id returned by ``sf.get_id_list(collapse_df)``: the id
        followed by the mean of each source column over that id's rows,
        stored under the shortened output name.
    """
    # (source column in collapse_df, column name in the returned frame)
    column_map = [
        ("ewm_vector_cosine_dis", 'ewm'),
        ("ewm_vector_cosine_dis_clus_avg", 'ewm_clust'),
        ("minima_vector_cosine_dis", 'minvec'),
        ("minima_vector_cosine_dis_clus_avg", 'minvec_clust'),
        ("minima_vector_cosine_dis_clus_min", "minvec_min"),
        ("SemDis_cbowukwacsubtitle_nf_m", 'cbowukwacsubtitle'),
        ("SemDis_cbowsubtitle_nf_m", 'cbowsubtitle'),
        ("SemDis_cbowBNCwikiukwac_nf_m", 'cbowBNCwikiukwac'),
        ("SemDis_TASA_nf_m", 'TASA'),
        ("SemDis_glove_nf_m", 'glove'),
        ("SemDis_MEAN", "SemDis_MEAN"),
        ("novelty_1", 'novelty_1'),
        ("novelty_2", 'novelty_2'),
        ("novelty_m", 'novelty_m'),
    ]

    rows = []
    for participant in sf.get_id_list(collapse_df):
        own_rows = collapse_df.loc[collapse_df['id'] == participant]
        means = [own_rows[source].mean() for source, _ in column_map]
        rows.append((participant, *means))

    output_names = ['id'] + [name for _, name in column_map]
    return pd.DataFrame(rows, columns=output_names)

In [12]:
def get_novelty_avg(collapse_df):
    """Average the five vector-distance novelty columns per participant.

    Parameters
    ----------
    collapse_df : pandas.DataFrame
        Table with an 'id' column plus the ewm_*/minima_* cosine-distance
        columns named in ``column_map`` below.

    Returns
    -------
    pandas.DataFrame
        One row per id returned by ``sf.get_id_list(collapse_df)`` with
        columns 'id', 'ewm', 'ewm_clust', 'minvec', 'minvec_clust' and
        'minvec_min'.
    """
    # (source column in collapse_df, column name in the returned frame)
    column_map = [
        ("ewm_vector_cosine_dis", 'ewm'),
        ("ewm_vector_cosine_dis_clus_avg", 'ewm_clust'),
        ("minima_vector_cosine_dis", 'minvec'),
        ("minima_vector_cosine_dis_clus_avg", 'minvec_clust'),
        ("minima_vector_cosine_dis_clus_min", "minvec_min"),
    ]

    rows = []
    for participant in sf.get_id_list(collapse_df):
        own_rows = collapse_df.loc[collapse_df['id'] == participant]
        means = [own_rows[source].mean() for source, _ in column_map]
        rows.append((participant, *means))

    output_names = ['id'] + [name for _, name in column_map]
    return pd.DataFrame(rows, columns=output_names)

In [13]:
def get_flexibility_avg(collapse_df):
    """Average the flexibility columns per participant.

    Parameters
    ----------
    collapse_df : pandas.DataFrame
        Table with 'id', 'flex_1', 'flex_2' and 'flex_method_avg' columns.

    Returns
    -------
    pandas.DataFrame
        One row per id returned by ``sf.get_id_list(collapse_df)`` with
        columns 'id', 'flex_1', 'flex_2' and 'flex_method_avg', each value
        being the mean over that id's rows.
    """
    score_columns = ["flex_1", "flex_2", "flex_method_avg"]

    rows = []
    for participant in sf.get_id_list(collapse_df):
        own_rows = collapse_df.loc[collapse_df['id'] == participant]
        rows.append((participant, *(own_rows[column].mean() for column in score_columns)))

    return pd.DataFrame(rows, columns=['id', 'flex_1', 'flex_2', 'flex_method_avg'])

In [27]:
def get_originality_avg(collapse_df):
    """Average the 't_freq' column per participant.

    Parameters
    ----------
    collapse_df : pandas.DataFrame
        Table with 'id' and 't_freq' columns.

    Returns
    -------
    pandas.DataFrame
        One row per id returned by ``sf.get_id_list(collapse_df)`` with
        columns 'id' and 'originality' (the mean 't_freq' of that id's rows).
    """
    rows = []
    for participant in sf.get_id_list(collapse_df):
        is_participant = collapse_df['id'] == participant
        mean_t_freq = collapse_df.loc[is_participant]["t_freq"].mean()
        rows.append((participant, mean_t_freq))

    return pd.DataFrame(rows, columns=['id', 'originality'])

In [15]:
def get_fluency_avg(collapse_df):
    """Average the 'fluency' column per participant.

    Parameters
    ----------
    collapse_df : pandas.DataFrame
        Table with 'id' and 'fluency' columns.

    Returns
    -------
    pandas.DataFrame
        One row per id returned by ``sf.get_id_list(collapse_df)`` with
        columns 'id' and 'fluency' (the mean over that id's rows).
    """
    rows = []
    for participant in sf.get_id_list(collapse_df):
        is_participant = collapse_df['id'] == participant
        mean_fluency = collapse_df.loc[is_participant]["fluency"].mean()
        rows.append((participant, mean_fluency))

    return pd.DataFrame(rows, columns=['id', 'fluency'])

In [18]:
def calculate_corrs_novelty_flexibility(novelty_dict_values, flexibility_dict_values):
    """Display participant-level correlations between novelty metrics and flexibility scores.

    All novelty frames and all flexibility frames are concatenated, averaged
    per participant, inner-joined on 'id', and then correlated.

    Parameters
    ----------
    novelty_dict_values : iterable of pandas.DataFrame
        Novelty result tables accepted by ``get_novelty_avg``.
    flexibility_dict_values : iterable of pandas.DataFrame
        Flexibility result tables accepted by ``get_flexibility_avg``.

    Returns
    -------
    None
        Three tables are shown with ``display``: the coefficients, the
        p-values, and both interleaved column by column.
    """
    collapse_novelty_df = pd.concat(novelty_dict_values)
    collapse_flexibility_df = pd.concat(flexibility_dict_values)
    participant_avg_novelty_df = get_novelty_avg(collapse_novelty_df)
    participant_avg_flexibility_df = get_flexibility_avg(collapse_flexibility_df)

    # Pick the novelty-by-flexibility sub-matrix by label rather than with a
    # positional slice: this stays correct if the number of novelty metrics
    # changes or if the p-value matrix rows come back in a different order.
    novelty_metrics = participant_avg_novelty_df.columns.drop('id')
    flexibility_scores = participant_avg_flexibility_df.columns.drop('id')

    combined_df = pd.merge(participant_avg_novelty_df, participant_avg_flexibility_df, how='inner', on='id')
    combined_df = combined_df.drop('id', axis=1)

    # NOTE(review): calculate_pvalues drops incomplete rows before testing,
    # whereas DataFrame.corr excludes missing values pair by pair; if the
    # averages can contain NaN the two tables use different samples -- confirm.
    pval_df = calculate_pvalues(combined_df).loc[novelty_metrics, flexibility_scores].copy()
    corrs_df = combined_df.corr().loc[novelty_metrics, flexibility_scores].copy()
    pval_df.columns = ['flex_1_pval', 'flex_2_pval', 'flex_method_pval']
    display(corrs_df)
    display(pval_df)

    # place each coefficient next to its p-value
    corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
    corr_pval_df = corr_pval_df[['flex_1', 'flex_1_pval', 'flex_2', 'flex_2_pval', 'flex_method_avg', 'flex_method_pval']]
    corr_pval_df = corr_pval_df.rename_axis('metrics')
    display(corr_pval_df)

In [19]:
# Show the novelty-vs-flexibility correlation and p-value tables over all loaded prompts.
calculate_corrs_novelty_flexibility(novelty_dict.values(), flexibility_dict.values())

Unnamed: 0,flex_1,flex_2,flex_method_avg
ewm,0.079815,0.079815,0.190157
ewm_clust,0.127786,0.127786,0.147896
minvec,0.289169,0.289169,0.263412
minvec_clust,0.371233,0.371233,0.291475
minvec_min,-0.432635,-0.432635,0.011132


Unnamed: 0,flex_1_pval,flex_2_pval,flex_method_pval
ewm,0.4546,0.4546,0.0726
ewm_clust,0.23,0.23,0.1642
minvec,0.0057,0.0057,0.0121
minvec_clust,0.0003,0.0003,0.0053
minvec_min,0.0,0.0,0.9171


Unnamed: 0_level_0,flex_1,flex_1_pval,flex_2,flex_2_pval,flex_method_avg,flex_method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.079815,0.4546,0.079815,0.4546,0.190157,0.0726
ewm_clust,0.127786,0.23,0.127786,0.23,0.147896,0.1642
minvec,0.289169,0.0057,0.289169,0.0057,0.263412,0.0121
minvec_clust,0.371233,0.0003,0.371233,0.0003,0.291475,0.0053
minvec_min,-0.432635,0.0,-0.432635,0.0,0.011132,0.9171


## Big Correlation Matrix

In [28]:
def calculate_mega_matrix(novelty_dict_values, flexibility_dict_values, originality_dict_value, fluency_dict_value):
    """Correlate every participant-level novelty, flexibility, originality and fluency measure.

    Each group of frames is concatenated and averaged per participant, the
    four per-participant tables are inner-joined on 'id', and the 'id'
    column is dropped before correlating.

    Parameters
    ----------
    novelty_dict_values, flexibility_dict_values, originality_dict_value, fluency_dict_value
        Iterables of DataFrames, passed to ``pd.concat`` and then to
        ``get_novelty_avg_semdis``, ``get_flexibility_avg``,
        ``get_originality_avg`` and ``get_fluency_avg`` respectively.

    Returns
    -------
    tuple
        ``(corrs_df, pval_df)``: the ``DataFrame.corr()`` matrix and the
        matrix returned by ``calculate_pvalues`` for the joined table.
    """
    inputs = (novelty_dict_values, flexibility_dict_values, originality_dict_value, fluency_dict_value)
    averagers = (get_novelty_avg_semdis, get_flexibility_avg, get_originality_avg, get_fluency_avg)

    # stack each group first, then reduce every stacked table to one row per participant
    collapsed_frames = [pd.concat(frames) for frames in inputs]
    participant_tables = [average(frame) for average, frame in zip(averagers, collapsed_frames)]

    # chain the inner joins left to right: novelty, flexibility, originality, fluency
    combined_df = participant_tables[0]
    for table in participant_tables[1:]:
        combined_df = pd.merge(combined_df, table, how='inner', on='id')

    measures_df = combined_df.drop('id', axis=1)
    pval_df = calculate_pvalues(measures_df)
    corrs_df = measures_df.corr()
    return (corrs_df, pval_df)

In [29]:
# mega_matrix[0] is the correlation matrix, mega_matrix[1] the matching p-value matrix.
mega_matrix = calculate_mega_matrix(novelty_dict.values(), flexibility_dict.values(), originality_dict.values(), fluency_dict.values())

In [30]:
# mega_matrix[0].to_csv("mega_matrix_corrs.csv")

In [31]:
# mega_matrix[1].to_csv("mega_matrix_pvals.csv")