# Novelty Analysis

## Comparing Novelty Algorithms to SemDis 
## Comparing Novelty Algorithms to Human Raters

### Import Packages

In [2]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from nltk.cluster.kmeans import KMeansClusterer

from scipy import stats
from scipy.stats import pearsonr

import pingouin as pg

import glob

### Put Data from Excel Sheet into Dataframes

In [2]:
# individual df's for each sheet

# when on pc
data_official_cup = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_cup_semdis.csv")
data_official_key = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_key_semdis.csv")
data_official_rope = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_rope_semdis.csv")
data_official_brick = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_brick_semdis.csv")
data_official_chair = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_chair_semdis.csv")
data_official_pencil = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_pencil_semdis.csv")
data_official_shoe = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_shoe_semdis.csv")
data_official_box = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_box_semdis.csv")

# when on mac
# data_official_cup = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_cup_semdis.csv")
# data_official_key = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_key_semdis.csv")
# data_official_rope = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_rope_semdis.csv")
# data_official_brick = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_brick_semdis.csv")
# data_official_chair = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_chair_semdis.csv")
# data_official_pencil = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_pencil_semdis.csv")
# data_official_shoe = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_shoe_semdis.csv")
# data_official_box = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_box_semdis.csv")

In [47]:
# read in the official novelty results
results_dict = {}
for filename in glob.glob('C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/novelty_results/novelty_071321/*.csv'):
    results_dict[filename[110:-4]] = pd.read_csv(filename)

### Preprocessing

In [4]:
# spacy stop words
stopwords_spacy = STOP_WORDS

In [5]:
stopwords_edited = list(stopwords_spacy)
stopwords_edited.append("thing")
stopwords_edited.append("use")
stopwords_edited.append("things")

In [6]:
# method to clean the responses
def process_text(text, stopwords_list, remove_sw, join_list):
    # tokenize text, lemmanize words, removing punctuation, remove stop words, lowercase all words

    # hardcorded for special situations
    text = re.sub(r"doorstoppper","doorstop", text)
    
    text = re.sub("/|-"," ", text)
    text = text.translate(str.maketrans('','',string.punctuation))
    tokens = word_tokenize(text)

    tokens = [w.lower() for w in tokens]
    
    if remove_sw:
        tokens = [word for word in tokens if word not in stopwords_list]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
#         stemmer = PorterStemmer()
#         tokens = [stemmer.stem(t) for t in tokens]

    if join_list:
        tokens = ' '.join(tokens)
 
    return tokens

### General Functions

In [7]:
# method to get a list of participants
def get_id_list(df):
    id_list = df['id'].unique()
    id_list = sorted(id_list)
    return id_list

In [8]:
# method to add a new column
# new column are cleaned responses
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    # id_df = df[df.id == id]
    df_processed = df.copy(deep=True)
    responses = df['response'].tolist()

    # make list of processed responses
    for response in range(len(responses)):
        responses[response] = process_text(responses[response], stopwords_list, remove_sw, join_list)

    # add list as column in df
    df_processed['response_processed'] = responses

    return df_processed

## Semantic Spaces


### ukwac subtitles Semantic Space

In [9]:
# when on pc
ukwac_subtitles = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/cbow_6_ukwac_subtitle.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/cbow_6_ukwac_subtitle.txt', delimiter = " ", header = None, encoding='latin-1')

ukwac_subtitles = ukwac_subtitles.set_index(0)


In [10]:
ukwac_subtitles

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
biennials,0.018570,-0.115743,0.052710,-0.103825,-0.003274,-0.111418,-0.178500,-0.168650,-0.059751,-0.139651,...,0.121335,-0.021174,-0.113822,-0.110219,0.002543,-0.079831,0.103625,0.188465,0.117148,0.035641
fawn,0.071241,0.080455,0.216394,-0.282252,-0.172025,-0.186543,0.040009,0.044497,-0.045530,-0.079626,...,0.134902,-0.007096,-0.046929,0.022484,-0.042331,0.082441,0.077913,0.417162,-0.313959,-0.040423
gai,-0.183570,-0.039461,0.036953,-0.330616,-0.290676,0.095041,0.187567,0.019878,-0.115750,0.078831,...,0.042250,-0.009011,-0.311596,-0.131843,0.078453,0.048432,-0.005434,-0.001165,-0.072534,-0.151484
nunnery,-0.037334,0.180116,0.046011,-0.454054,-0.102627,-0.037614,-0.081269,-0.144102,0.018661,-0.312565,...,0.099364,-0.011798,-0.315447,0.160066,-0.179072,0.085825,-0.018394,0.055811,0.460388,0.107053
icici,0.010815,-0.050479,0.164478,-0.035702,-0.192204,0.003913,-0.063542,-0.095691,-0.131409,-0.090370,...,-0.040970,-0.021843,-0.071962,-0.120538,-0.018662,-0.263689,-0.119164,-0.013069,0.030275,-0.072135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
proteasome,0.065590,0.063595,-0.068297,-0.341265,-0.363577,-0.028856,0.121471,-0.163342,0.164599,-0.033824,...,0.092268,-0.019238,0.032392,0.024969,0.011309,0.061474,0.066890,-0.148575,-0.067886,0.027040
jawbone,-0.198237,-0.221238,0.034535,-0.125550,0.261507,-0.176464,0.154252,-0.001421,-0.012786,0.101746,...,0.173170,-0.018633,-0.150698,-0.278048,-0.083264,-0.138730,0.011749,0.292462,-0.002925,0.116922
expands,-0.204973,-0.104741,0.371615,-0.127140,-0.024127,-0.217444,-0.088636,0.147861,-0.060367,-0.200520,...,0.093103,0.040245,-0.231791,-0.101625,0.162721,-0.040362,0.097418,-0.007231,0.107961,-0.058140
hagia,-0.111121,0.126795,0.060413,-0.201407,-0.235132,0.056504,0.211543,0.019023,-0.160508,-0.190792,...,0.129399,0.013232,0.052301,-0.174858,0.192407,0.104436,-0.107214,-0.032931,-0.016617,0.252581


### CBOW subtitles Semantic Space

In [11]:
# when on pc
cbow_subtitles = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/cbow_subtitle.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/cbow_subtitle.txt', delimiter = " ", header = None, encoding='latin-1')

cbow_subtitles = cbow_subtitles.set_index(0)


In [12]:
cbow_subtitles

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fawn,-0.092878,0.096702,0.078285,0.141875,-0.048455,0.022562,-0.116627,-0.133138,-0.006373,0.065410,...,-0.141192,0.033917,-0.091994,-0.198885,0.086601,-0.084499,0.128204,0.130868,0.034637,-0.011805
gai,-0.073987,-0.135876,0.071925,0.050427,-0.117428,0.119181,0.000579,0.058539,-0.164071,-0.039671,...,0.011994,0.046088,0.127052,0.037256,-0.099584,-0.093139,-0.116162,0.049216,-0.186436,0.006694
impotents,-0.002924,-0.016872,0.020181,0.015595,-0.016781,-0.014786,-0.007104,-0.016849,-0.026617,0.000086,...,0.045714,-0.037038,0.003953,-0.012306,0.007597,0.014559,-0.013585,-0.008614,0.016856,0.032169
sonja,0.046202,0.124720,0.111161,0.241513,-0.163221,0.119522,-0.011830,0.045682,0.213233,0.141021,...,-0.117941,0.133043,-0.038438,-0.089926,0.061131,-0.143988,-0.058695,0.110329,-0.069613,-0.142646
dionysian,0.032223,-0.007911,-0.008691,-0.015038,-0.037716,0.028007,-0.026402,0.023947,-0.036913,0.033985,...,-0.022568,-0.009473,0.001342,-0.056279,-0.001985,-0.037703,-0.027222,0.015461,-0.016171,0.015194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jawbone,-0.155908,-0.022339,0.029578,0.061183,-0.016605,-0.026134,0.004525,0.002993,-0.120240,0.077789,...,0.005585,-0.052877,0.068886,-0.118713,-0.161840,0.027309,-0.031582,0.072750,-0.000247,-0.041733
bunches,-0.032415,0.140536,0.071025,0.022989,0.031107,0.122717,-0.029894,0.092296,-0.071899,-0.097607,...,-0.010892,-0.053674,-0.071419,-0.081352,-0.032544,-0.096401,-0.006610,0.108215,0.034687,0.185569
zinka,-0.006705,0.002355,0.012616,0.021504,-0.002360,0.017963,-0.008720,-0.000567,-0.014686,-0.000533,...,-0.003436,0.006517,-0.006647,-0.009388,0.003580,-0.006105,0.006303,0.012804,0.005335,0.014729
nurnies,-0.015835,0.007069,0.023102,-0.016696,-0.024264,0.002132,-0.006475,0.008502,-0.021481,0.011426,...,-0.006696,0.023507,-0.011131,-0.000455,-0.001402,-0.020491,-0.021894,0.017392,-0.012267,0.015760


### Banori Semantic Space

In [13]:
# when on pc
banori = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/banori.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/banori.txt', delimiter = " ", header = None, encoding='latin-1')

banori = banori.set_index(0)

In [14]:
banori

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,391,392,393,394,395,396,397,398,399,400
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,-0.060292,0.067630,-0.036891,0.066684,0.024045,0.099091,0.009682,-0.099609,0.081138,-0.153635,...,0.011333,-0.072486,-0.116943,0.071367,0.002056,0.041920,0.017046,0.038706,0.033797,0.016488
",",0.026625,0.073101,-0.027073,-0.019504,0.041730,0.038811,0.094878,-0.031282,0.093890,-0.105021,...,-0.002796,0.029393,-0.069047,0.083530,-0.033128,-0.035646,0.027183,0.003230,0.000811,0.022505
.,-0.005893,0.093791,0.015333,0.046226,0.032791,0.110069,0.055551,-0.080625,0.150372,-0.121523,...,0.076293,-0.048817,-0.203812,0.012218,0.039033,-0.043925,0.087199,0.089214,0.073069,0.075056
of,-0.050371,0.031452,0.040910,0.033255,-0.009195,0.061086,0.085859,-0.122968,0.068290,-0.108840,...,-0.021496,-0.112018,-0.116068,0.071437,-0.041942,0.061624,0.030890,0.013635,-0.104569,0.005263
and,0.005456,0.063237,-0.075793,-0.000819,0.003407,0.053554,0.070145,-0.088482,0.129797,-0.094823,...,0.100012,-0.090594,-0.090813,0.028959,0.024793,-0.040878,-0.024795,0.083654,-0.010910,0.027067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scientifica,-0.121726,0.094667,0.025527,0.001748,0.087804,-0.004830,-0.011430,-0.068062,0.016917,0.031084,...,0.082360,-0.025217,0.008511,-0.109005,-0.034504,0.027565,0.115459,0.000142,-0.022383,0.042615
schoolly,0.015603,0.100056,-0.042075,-0.032139,0.024201,0.051871,-0.069512,-0.144775,0.110104,-0.106074,...,0.014019,-0.040083,0.088833,-0.015218,-0.063183,0.021593,-0.101472,0.023395,0.022891,0.036009
schnapf,-0.057437,0.181676,-0.063820,0.017294,0.002916,0.019541,0.080525,-0.054740,-0.071306,-0.007485,...,0.079774,-0.068784,0.087408,-0.044859,-0.046738,-0.022576,0.023053,0.026027,0.010763,0.063920
scheme-,0.070691,0.026944,-0.084396,-0.063403,-0.030977,0.125771,0.084698,-0.060336,0.040576,-0.030460,...,0.027721,0.001727,0.070864,-0.004891,-0.019738,0.014031,-0.008417,0.064234,0.003573,0.004069


### GloVe 6B Semantic Space

In [15]:
# when on pc
glove_6B = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/glove_6B.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/glove_6B.txt', delimiter = " ", header = None, encoding='latin-1')

glove_6B = glove_6B.set_index(0)


In [16]:
glove_6B

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.046560,0.213180,-0.007436,-0.458540,-0.035639,0.236430,-0.288360,0.215210,-0.134860,-1.641300,...,-0.013064,-0.296860,-0.079913,0.195000,0.031549,0.285060,-0.087461,0.009061,-0.209890,0.053913
",",-0.255390,-0.257230,0.131690,-0.042688,0.218170,-0.022702,-0.178540,0.107560,0.058936,-1.385400,...,0.075968,-0.014359,-0.073794,0.221760,0.146520,0.566860,0.053307,-0.232900,-0.122260,0.354990
.,-0.125590,0.013630,0.103060,-0.101230,0.098128,0.136270,-0.107210,0.236970,0.328700,-1.678500,...,0.060148,-0.156190,-0.119490,0.234450,0.081367,0.246180,-0.152420,-0.342240,-0.022394,0.136840
of,-0.076947,-0.021211,0.212710,-0.722320,-0.139880,-0.122340,-0.175210,0.121370,-0.070866,-1.572100,...,-0.366730,-0.386030,0.302900,0.015747,0.340360,0.478410,0.068617,0.183510,-0.291830,-0.046533
to,-0.257560,-0.057132,-0.671900,-0.380820,-0.364210,-0.082155,-0.010955,-0.082047,0.460560,-1.847700,...,-0.012806,-0.597070,0.317340,-0.252670,0.543840,0.063007,-0.049795,-0.160430,0.046744,-0.070621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chanty,0.392700,-0.022505,0.304580,0.187990,0.141180,0.724030,-0.257810,-0.137290,-0.016521,0.595960,...,-0.182950,0.406630,-0.343630,-0.270400,-0.593680,0.016447,0.140740,0.463940,-0.369570,-0.287180
kronik,0.136790,-0.139090,-0.360890,0.079864,0.321490,0.263870,-0.109900,0.044420,0.083869,0.791330,...,0.036419,-0.036845,-0.348150,0.064732,-0.000577,-0.133790,0.428960,-0.023320,0.410210,-0.393080
rolonda,0.075713,-0.040502,0.183450,0.512300,-0.228560,0.839110,0.178780,-0.713010,0.326900,0.695350,...,-0.388530,0.545850,-0.035050,-0.184360,-0.197000,-0.350030,0.160650,0.218380,0.309670,0.437610
zsombor,0.814510,-0.362210,0.311860,0.813810,0.188520,-0.313600,0.827840,0.296560,-0.085519,0.475970,...,0.130880,0.106120,-0.408110,0.313380,-0.430250,0.069798,-0.207690,0.075486,0.284080,-0.175590


### TASA Semantic Space

In [17]:
# when on pc
TASA = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/TASA.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/TASA.txt', delimiter = " ", header = None, encoding='latin-1')

TASA = TASA.set_index(0)


In [18]:
TASA

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
who,0.058924,-0.003555,0.053929,0.037707,-0.028096,-0.025736,0.008540,-0.041124,-0.000275,0.002969,...,0.002788,-0.005665,0.008141,0.035108,-0.005556,0.001101,0.022807,-0.001179,-0.004761,0.000947
were,0.076660,0.007832,0.102716,-0.046048,0.006808,-0.012557,0.012856,-0.043569,0.004773,0.027316,...,0.000811,0.003786,-0.011847,0.010002,0.027371,0.010266,-0.000217,0.005904,-0.005155,-0.017014
the,0.037283,-0.011585,0.003933,-0.006798,0.012415,0.000927,0.004378,-0.003837,0.006263,-0.002052,...,0.000049,0.000939,-0.001295,0.000544,0.000071,-0.000817,0.001123,-0.000980,-0.000354,0.000634
first,0.049274,-0.012220,0.023342,0.002829,0.008936,0.004214,0.041426,-0.014812,0.020341,0.018361,...,0.027615,0.016552,0.052426,-0.001099,-0.024695,-0.008158,-0.026864,-0.000865,-0.030933,-0.035347
americans,0.017895,-0.024743,0.068235,-0.003476,-0.022697,-0.002701,-0.013814,0.041591,0.008424,0.019061,...,0.009282,0.029840,-0.002187,-0.029926,-0.036506,0.008290,0.003097,0.028012,-0.002667,0.043664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
phosphagens,0.000049,-0.000144,-0.000252,-0.000134,0.000194,-0.000358,-0.000433,0.000109,0.000242,0.000393,...,-0.000659,-0.000874,-0.000879,-0.000478,-0.001883,0.000523,-0.001007,-0.000753,0.000633,-0.000265
phosphated,0.000025,-0.000072,-0.000126,-0.000067,0.000097,-0.000179,-0.000216,0.000054,0.000121,0.000197,...,-0.000330,-0.000437,-0.000439,-0.000239,-0.000941,0.000262,-0.000504,-0.000376,0.000317,-0.000132
adp,0.000025,-0.000072,-0.000126,-0.000067,0.000097,-0.000179,-0.000216,0.000054,0.000121,0.000197,...,-0.000330,-0.000437,-0.000439,-0.000239,-0.000941,0.000262,-0.000504,-0.000376,0.000317,-0.000132
plummage,0.000023,-0.000049,-0.000023,-0.000043,-0.000016,-0.000036,0.000008,-0.000149,-0.000044,-0.000012,...,-0.000057,0.000473,-0.000311,-0.000031,0.000284,0.000240,-0.000090,-0.000660,-0.000446,0.000211


## Novelty Algo 3
### sem_space + element wise multiplication + cosine distance
### Greater Cos Distance, Greater Novelty
### Most similar to SemDis

In [19]:
# method to calculate cosine similarity
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

In [20]:
# method to get the element wise multiplied vector
# multiply vectors in phrase
def get_ew_multiplied_vector(phrase_list, sem_space):
    vectors_list = []
    # add vectors to list
    # change to numpy array
    for term in phrase_list:
        vectors_list.append(np.array(sem_space.loc[term].values.tolist()))
    
    # get element wise multiplied vector
    element_wise_multiplied_vector = np.ones(len(sem_space.columns))

    for vector in vectors_list:
        element_wise_multiplied_vector = element_wise_multiplied_vector * vector

    return element_wise_multiplied_vector

In [21]:
# get cosine sim from prompt and ewm
def get_cosine_sim_ewm(prompt, response, sem_space):
    prompt_vector = np.array(sem_space.loc[prompt].values.tolist())
    ewm_vector = get_ew_multiplied_vector(response, sem_space)

    return (1 - get_cosine_similarity(prompt_vector, ewm_vector))

In [22]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_ewm_cosinedist(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    # list to store cosine sims
    cosine_sim_list = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_cosine_sim_ewm(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['ewm_vector_cosine_dis'] = cosine_sim_list
    
    # new column with novelty rating
    return novel_rating_df

In [23]:
# brick_novelty_ewm_cosinedist = get_novelty_ewm_cosinedist(data_official_shoe, 'shoe', stopwords_edited, TASA, True, False)
# brick_novelty_ewm_cosinedist

## Novelty Algo 4
### sem_space + local minina + cosine distance

In [24]:
# get word in phrase that has the least distance from the prompt
def get_minima_vector_cos_distance(prompt, phrase_list, sem_space):
    distances_list = []
    # get prompt vector
    prompt_vector = np.array(sem_space.loc[prompt].values.tolist())
    
    # create list of cosine distances
    for term in phrase_list:
        distances_list.append((1 - get_cosine_similarity(prompt_vector, np.array(sem_space.loc[term].values.tolist()))))
        
    # return the max cosine distance
    return max(distances_list, default=0)

In [25]:
# get df with results of the cosine distance from prompt using the minima vector in the response
def get_novelty_minimavec_cosinedist(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    # list to store cosine sims
    cosine_sim_list = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_minima_vector_cos_distance(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['minima_vector_cosine_dis'] = cosine_sim_list
    
    # new column with novelty rating
    return novel_rating_df

In [26]:
# brick_novelty_ukwac_minimavec_cosinedist = get_novelty_minimavec_cosinedist(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# brick_novelty_ukwac_minimavec_cosinedist

## Novelty Algo 5
### sem_space + element wise multiplication + cosine distance + clustering
### average responses cosine distance in the same cluster
### idea is that phrases with same alternate task will group
### variation in phrase in the same cluster will  be averaged out

In [27]:
# clusters the responses
# get a df of the clusters and their respective phrases
def get_counts_vector(num_clusters, responses):
    # initialize CountVectorizer object
    count_vectorizer = CountVectorizer()
    # vectorize the phrases
    word_count = count_vectorizer.fit_transform(responses)
    
    # elbow method to visualize and find out how many clusters to use
#     visualizer = KElbowVisualizer(KMeans(), k=(10,35), timings=False)
#     visualizer.fit(word_count.toarray())       
#     visualizer.show()

    # nltk kmeans cosine distance implementation
    number_of_clusters = num_clusters
    kmeans = KMeansClusterer(number_of_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25, avoid_empty_clusters=True)
    assigned_clusters = kmeans.cluster(word_count.toarray(), assign_clusters=True)

    # scikit-learn euclidean distance implementation
#     kmeans = KMeans(n_clusters = num_clusters).fit(word_count)
        
    # cluster results scikit-learn
    results = pd.DataFrame()
    results['text'] = responses
#     results['category'] = kmeans.labels_
    results['category'] = assigned_clusters
    
    # create dictionary to organize the clusters with their respective phrases
    results_dict = {k: g["text"].tolist() for k,g in results.groupby("category")}
    
    # df of the clusters and the 
    clusters_df = pd.DataFrame(list(results_dict.items()),columns = ['category','responses']) 
    
    return clusters_df

In [28]:
# averages the distance of the phrases in each cluster
# gives each phrase in cluster the average distance 
def get_clustered_novelty_score(novel_rating_df, column):
    # get cluster df
    clusters_df = get_counts_vector(22, novel_rating_df['response_processed_phrase'].tolist())
    # get cleaned phrases and their current novelty rating
    novelty_scores = dict(zip(novel_rating_df.response_processed_phrase, novel_rating_df[column]))

    # create dictionary out of cluster df
    clusters = dict(zip(clusters_df.category, clusters_df.responses))
        
    # initialize empty dictionary to store the score for a category
    clusters_scores = dict.fromkeys(clusters)
    
    # get the average cosine distance for a cluster
    for key in clusters:
        score = 0
        for phrase in clusters[key]:
            score = score + novelty_scores[phrase]
        score = score/len(clusters[key])
        clusters_scores[key] = score
        
    # create dictionary to store a phrase and its new novelty score 
    # new score is the average of the responses in one cluster
    phrase_scores_dict = {}
    for key in clusters:
        for phrase in clusters[key]:
            phrase_scores_dict[phrase] = clusters_scores[key]
            
    # make a list that matches the one in the current dataframe
    # return list to be added to dataframe
    df_phrases_scores_list = [] 
    for phrase in novel_rating_df['response_processed_phrase'].tolist():
        df_phrases_scores_list.append(phrase_scores_dict[phrase])
    
    # uncomment to show clusters df
#     display(clusters_df)
            
    return list(df_phrases_scores_list)

In [29]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_ewm_cosinedist_cluster(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    novel_rating_df['response_processed_phrase'] = [' '.join(x) for x in cleaned_responses]
    # list to store cosine sims
    cosine_sim_list = []
    
    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_cosine_sim_ewm(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['ewm_vector_cosine_dis'] = cosine_sim_list
    
    novel_rating_df['ewm_vector_cosine_dis_clus_avg'] = get_clustered_novelty_score(novel_rating_df, 'ewm_vector_cosine_dis')
    
    # new column with novelty rating
    return novel_rating_df

In [30]:
# brick_novelty_ewm_cosinedist_cluster = get_novelty_ewm_cosinedist_cluster(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# brick_novelty_ewm_cosinedist_cluster

## Novelty Algo 6
### sem_space + local minima + cosine distance + clustering
### average responses cosine distance in the same cluster
### idea is that phrases with same alternate task will group
### variation in phrase in the same cluster will  be averaged out
### differs from algo 5, does local minima not ewm

In [31]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_minimavec_cosinedist_cluster(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    novel_rating_df['response_processed_phrase'] = [' '.join(x) for x in cleaned_responses]
    # list to store cosine sims
    cosine_sim_list = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_minima_vector_cos_distance(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['minima_vector_cosine_dis'] = cosine_sim_list
        
    novel_rating_df['minima_vector_cosine_dis_clus_avg'] = get_clustered_novelty_score(novel_rating_df, 'minima_vector_cosine_dis')
    
    # new column with novelty rating
    return novel_rating_df

In [32]:
# brick_noveltyc_minimavec_cosinedist_cluster = get_novelty_minimavec_cosinedist_cluster(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# brick_novelty_minimavec_cosinedist_cluster

## Novelty Algo 7
### uses the cosine + minima + clustering methods
### uses the same clusterr for all algos
### two different scoring systems, average or minimum

In [33]:
# averages the distance of the phrases in each cluster
# gives each phrase in cluster the average distance 
# generalized to avg or min, both ewm and minima
def get_clustered_novelty_score_generalized(clusters_df, novel_rating_df, average, column):
    # get cluster df
    clusters_df = get_counts_vector(22, novel_rating_df['response_processed_phrase'].tolist())
    # get cleaned phrases and their current novelty rating
    novelty_scores = dict(zip(novel_rating_df.response_processed_phrase, novel_rating_df[column]))

    # create dictionary out of cluster df
    clusters = dict(zip(clusters_df.category, clusters_df.responses))
        
    # initialize empty dictionary to store the score for a category
    clusters_scores = dict.fromkeys(clusters)
    
    # get the average or min cosine distance for a cluster
    if average:
        for key in clusters:
            score = 0
            for phrase in clusters[key]:
                score = score + novelty_scores[phrase]
            score = score/len(clusters[key])
            clusters_scores[key] = score
    else:
        for key in clusters:
            scores_list = []
            for phrase in clusters[key]:
                scores_list.append(novelty_scores[phrase])
            clusters_scores[key] = min(scores_list)
        
    # create dictionary to store a phrase and its new novelty score 
    # new score is the average of the responses in one cluster
    phrase_scores_dict = {}
    for key in clusters:
        for phrase in clusters[key]:
            phrase_scores_dict[phrase] = clusters_scores[key]
            
    # make a list that matches the one in the current dataframe
    # return list to be added to dataframe
    df_phrases_scores_list = [] 
    for phrase in novel_rating_df['response_processed_phrase'].tolist():
        df_phrases_scores_list.append(phrase_scores_dict[phrase])
    
    # uncomment to show clusters df
#     display(clusters_df)
            
    return list(df_phrases_scores_list)

In [34]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_combined(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    novel_rating_df['response_processed_phrase'] = [' '.join(x) for x in cleaned_responses]
    # list to store cosine sims for ewm
    cosine_sim_list_ewm = []
    # list to store cosine sims for minima
    cosine_sim_list_minima = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list_ewm.append(get_cosine_sim_ewm(prompt, response, sem_space))
        cosine_sim_list_minima.append(get_minima_vector_cos_distance(prompt, response, sem_space))
        
     # get clusters for the dataset
    # idea is to use the same clusters for each analysis
    clusters_df = get_counts_vector(22, novel_rating_df['response_processed_phrase'].tolist())

    # add novelty rating list to dataframe for ewm
    novel_rating_df['ewm_vector_cosine_dis'] = cosine_sim_list_ewm
    
    # add the columns for the novelty scores
    novel_rating_df['ewm_vector_cosine_dis_clus_avg'] = get_clustered_novelty_score_generalized(clusters_df, novel_rating_df, True, 'ewm_vector_cosine_dis')

    # add novelty rating list to dataframe for minima
    novel_rating_df['minima_vector_cosine_dis'] = cosine_sim_list_minima
    novel_rating_df['minima_vector_cosine_dis_clus_avg'] = get_clustered_novelty_score_generalized(clusters_df, novel_rating_df, True, 'minima_vector_cosine_dis')
    novel_rating_df['minima_vector_cosine_dis_clus_min'] = get_clustered_novelty_score_generalized(clusters_df, novel_rating_df, False, 'minima_vector_cosine_dis')

    
    # new column with novelty rating
    return novel_rating_df

In [35]:
# data_test_brick_novelty_combined = get_novelty_combined(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# data_test_brick_novelty_combined

## ICC Test 

In [48]:
prompts_list = ['box', 'brick', 'chair', 'cup', 'key', 'pencil', 'rope', 'shoe']
data_list = [data_official_box, data_official_brick, data_official_chair, data_official_cup, data_official_key, data_official_pencil, data_official_rope, data_official_shoe]
sem_space_list = [ukwac_subtitles, cbow_subtitles, banori, TASA, glove_6B]
sem_space_list_str = ['ukwac_subtitles', 'cbow_subtitles', 'banori', 'TASA', 'glove_6B']
sem_space_col_list = [8,9,10,11,12]
underscore = "_"

In [62]:
def get_id_list_overlaps():
    id_lists = []
    for i in range(len(data_list)):
        id_lists.append(get_id_list(data_list[i]))
    return list(reduce(set.intersection, [set(item) for item in id_lists]))

In [63]:
def icc_calc_one_semspace_df(sem_space):
    icc_list_ewm = []
    icc_list_ewm_clust = []
    icc_list_min = []
    icc_list_min_clust = []
    icc_list_min_clust_min = []
    id_list = get_id_list_overlaps()
    for i in range(len(prompts_list)):
        results_df = results_dict[prompts_list[i] + underscore + sem_space + underscore + "results"]
        for participant in id_list:
            id_df = results_df.loc[results_df['id'] == participant]
            
            avg_ewm = id_df["ewm_vector_cosine_dis"].mean()
            list_ewm = [participant, prompts_list[i], avg_ewm]
            icc_list_ewm.append(list_ewm)
            
            avg_ewm_clust = id_df["ewm_vector_cosine_dis"].mean()
            list_ewm_clust = [participant, prompts_list[i], avg_ewm_clust]
            icc_list_ewm_clust.append(list_ewm_clust)
            
            avg_min = id_df["minima_vector_cosine_dis"].mean()
            list_min = [participant, prompts_list[i], avg_min]
            icc_list_min.append(list_min)
            
            avg_min_clust = id_df["minima_vector_cosine_dis_clus_avg"].mean()
            list_min_clust = [participant, prompts_list[i], avg_min_clust]
            icc_list_min_clust.append(list_min_clust)
            
            avg_min_clust_min = id_df["minima_vector_cosine_dis_clus_min"].mean()
            list_min_clust_min = [participant, prompts_list[i], avg_min_clust]
            icc_list_min_clust_min.append(list_min_clust_min)
        
    icc_df_ewm = pd.DataFrame(icc_list_ewm, columns=['id', 'prompt', 'rating'])
    icc_df_ewm_clust = pd.DataFrame(icc_list_ewm_clust, columns=['id', 'prompt', 'rating'])
    icc_df_min = pd.DataFrame(icc_list_min, columns=['id', 'prompt', 'rating'])
    icc_df_min_clust = pd.DataFrame(icc_list_min_clust, columns=['id', 'prompt', 'rating'])
    icc_df_min_clust_min = pd.DataFrame(icc_list_min_clust_min, columns=['id', 'prompt', 'rating'])
    
    icc_df_ewm_results = pg.intraclass_corr(data=icc_df_ewm, targets='id', raters='prompt', ratings='rating')
    icc_df_ewm_clust_results = pg.intraclass_corr(data=icc_df_ewm_clust, targets='id', raters='prompt', ratings='rating')
    icc_df_min_results = pg.intraclass_corr(data=icc_df_min, targets='id', raters='prompt', ratings='rating')
    icc_df_min_clust_results = pg.intraclass_corr(data=icc_df_min_clust, targets='id', raters='prompt', ratings='rating')
    icc_df_min_clust_min_results = pg.intraclass_corr(data=icc_df_min_clust_min, targets='id', raters='prompt', ratings='rating')
    
    results_df_list = [icc_df_ewm_results.loc[icc_df_ewm_results['Type'] == 'ICC3k'], 
                      icc_df_ewm_clust_results.loc[icc_df_ewm_clust_results['Type'] == 'ICC3k'],
                      icc_df_min_results.loc[icc_df_min_results['Type'] == 'ICC3k'],
                      icc_df_min_clust_results.loc[icc_df_min_clust_results['Type'] == 'ICC3k'],
                      icc_df_min_clust_min_results.loc[icc_df_min_clust_min_results['Type'] == 'ICC3k']]
    
    icc_results = pd.concat(results_df_list, ignore_index=True)
    icc_results.index = ['ewm','ewm_clust','minvec','minvec_avg_clust', 'minvec_min_clust']
    icc_results.rename_axis('metrics')
    del icc_results['Description']
    del icc_results['Type']
 
    return icc_results

In [64]:
def icc_calc_all_semspace_df():
    writer = pd.ExcelWriter('icc_methods_results_071321.xlsx', engine='xlsxwriter')
    workbook = writer.book
    worksheet = workbook.add_worksheet('Result')
    sheets = 'ICC Results'
    writer.sheets[sheets] = worksheet
    row = 0
    
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        worksheet.write_string(row, 0, sem_space_list_str[y])
        icc_results = icc_calc_one_semspace_df(sem_space_list_str[y])
        display(icc_results)
        icc_results.to_excel(writer,sheet_name=sheets, startrow=row + 1, startcol=0)
        row = row + len(icc_results.index) + 3
        
    writer.save()

In [None]:
# icc_calc_all_semspace_df()

In [None]:
# icc_df_ukwac_subtitles = icc_calc_one_semspace_df('ukwac_subtitles')
# icc_df_ukwac_subtitles

## Correlation Test with SemDis and Human Ratings

In [49]:
# create pvalues matrix
def calculate_pvalues(df):
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 4)
    return pvalues

### Comparing the Performance of the algos to SemDis

In [50]:
# print the correlations with SemDis
def print_semdis_corrs():
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        for i in range(len(prompts_list)):
            print(prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [sem_space_col_list[y],14,15,16,17,18]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[1:, 0])
            pval_df = pd.DataFrame(pval_df.iloc[1:, 0])
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df.columns = [sem_space_list_str[y], 'pval']
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust','minvec_clust_min']
            corr_pval_df.index.rename('metrics', inplace=True)
            display(corr_pval_df)

In [51]:
print_semdis_corrs()

ukwac_subtitles
box


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.628624,0.0
ewm_clust,0.277609,0.0
minvec,0.487944,0.0
minvec_clust,0.251786,0.0
minvec_clust_min,0.113257,0.0173


brick


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.602685,0.0
ewm_clust,0.402432,0.0
minvec,0.4302,0.0
minvec_clust,0.296243,0.0
minvec_clust_min,0.258004,0.0


chair


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.828039,0.0
ewm_clust,0.799978,0.0
minvec,0.765139,0.0
minvec_clust,0.738287,0.0
minvec_clust_min,0.357808,0.0


cup


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.826093,0.0
ewm_clust,0.58274,0.0
minvec,0.68045,0.0
minvec_clust,0.598166,0.0
minvec_clust_min,0.217666,0.0


key


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.65171,0.0
ewm_clust,0.219188,0.0
minvec,0.283053,0.0
minvec_clust,0.2075,0.0
minvec_clust_min,0.163124,0.0014


pencil


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.799558,0.0
ewm_clust,0.6058,0.0
minvec,0.638247,0.0
minvec_clust,0.562357,0.0
minvec_clust_min,0.113009,0.0204


rope


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.514819,0.0
ewm_clust,0.119338,0.009
minvec,0.276212,0.0
minvec_clust,0.087242,0.0566
minvec_clust_min,0.07861,0.086


shoe


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.638159,0.0
ewm_clust,0.356217,0.0
minvec,0.334016,0.0
minvec_clust,0.176627,0.0004
minvec_clust_min,0.180678,0.0003


cbow_subtitles
box


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.61773,0.0
ewm_clust,0.408152,0.0
minvec,0.502072,0.0
minvec_clust,0.29535,0.0
minvec_clust_min,0.209934,0.0


brick


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.760215,0.0
ewm_clust,0.478987,0.0
minvec,0.578779,0.0
minvec_clust,0.359592,0.0
minvec_clust_min,0.254607,0.0


chair


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.89281,0.0
ewm_clust,0.836078,0.0
minvec,0.861893,0.0
minvec_clust,0.821972,0.0
minvec_clust_min,0.278632,0.0


cup


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.779796,0.0
ewm_clust,0.597546,0.0
minvec,0.606894,0.0
minvec_clust,0.543381,0.0
minvec_clust_min,-0.061828,0.1945


key


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.740719,0.0
ewm_clust,0.32756,0.0
minvec,0.321748,0.0
minvec_clust,0.147323,0.004
minvec_clust_min,0.187936,0.0002


pencil


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.717261,0.0
ewm_clust,0.478677,0.0
minvec,0.627684,0.0
minvec_clust,0.388681,0.0
minvec_clust_min,0.111492,0.0221


rope


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.5622,0.0
ewm_clust,0.137179,0.0027
minvec,0.418361,0.0
minvec_clust,0.12153,0.0078
minvec_clust_min,0.071226,0.1199


shoe


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.712994,0.0
ewm_clust,0.316788,0.0
minvec,0.445344,0.0
minvec_clust,0.118941,0.0168
minvec_clust_min,0.150338,0.0024


banori
box


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.698511,0.0
ewm_clust,0.236529,0.0
minvec,0.504538,0.0
minvec_clust,0.136789,0.004
minvec_clust_min,-0.039233,0.4112


brick


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.802398,0.0
ewm_clust,0.551996,0.0
minvec,0.673975,0.0
minvec_clust,0.581003,0.0
minvec_clust_min,0.282758,0.0


chair


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.876511,0.0
ewm_clust,0.791578,0.0
minvec,0.832297,0.0
minvec_clust,0.772758,0.0
minvec_clust_min,0.145808,0.0024


cup


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.711498,0.0
ewm_clust,0.4827,0.0
minvec,0.526481,0.0
minvec_clust,0.295895,0.0
minvec_clust_min,0.050896,0.2857


key


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.530581,0.0
ewm_clust,0.129862,0.0112
minvec,0.087272,0.0889
minvec_clust,-0.023751,0.644
minvec_clust_min,0.008732,0.8651


pencil


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.759897,0.0
ewm_clust,0.646183,0.0
minvec,0.683359,0.0
minvec_clust,0.513686,0.0
minvec_clust_min,-0.028615,0.5582


rope


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.680766,0.0
ewm_clust,0.318377,0.0
minvec,0.606839,0.0
minvec_clust,0.259348,0.0
minvec_clust_min,-0.002996,0.9479


shoe


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.770743,0.0
ewm_clust,0.458254,0.0
minvec,0.635155,0.0
minvec_clust,0.362785,0.0
minvec_clust_min,0.000825,0.9868


TASA
box


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.610802,0.0
ewm_clust,0.292825,0.0
minvec,0.541444,0.0
minvec_clust,0.369811,0.0
minvec_clust_min,0.080787,0.0902


brick


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.872058,0.0
ewm_clust,0.565716,0.0
minvec,0.77676,0.0
minvec_clust,0.670793,0.0
minvec_clust_min,0.290142,0.0


chair


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.92876,0.0
ewm_clust,0.872443,0.0
minvec,0.892746,0.0
minvec_clust,0.868403,0.0
minvec_clust_min,0.284785,0.0


cup


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.846104,0.0
ewm_clust,0.652584,0.0
minvec,0.707048,0.0
minvec_clust,0.315569,0.0
minvec_clust_min,-0.140304,0.0031


key


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.726129,0.0
ewm_clust,0.226389,0.0
minvec,0.378046,0.0
minvec_clust,0.079017,0.1236
minvec_clust_min,0.14125,0.0057


pencil


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.874289,0.0
ewm_clust,0.550667,0.0
minvec,0.817958,0.0
minvec_clust,0.561044,0.0
minvec_clust_min,0.102557,0.0354


rope


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.657938,0.0
ewm_clust,0.340771,0.0
minvec,0.490249,0.0
minvec_clust,0.358788,0.0
minvec_clust_min,0.088926,0.052


shoe


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.779325,0.0
ewm_clust,0.372838,0.0
minvec,0.56687,0.0
minvec_clust,0.310671,0.0
minvec_clust_min,0.221706,0.0


glove_6B
box


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.355974,0.0
ewm_clust,0.219824,0.0
minvec,0.199429,0.0
minvec_clust,-0.055909,0.2413
minvec_clust_min,0.031267,0.5125


brick


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.661865,0.0
ewm_clust,0.299915,0.0
minvec,0.435554,0.0
minvec_clust,0.294639,0.0
minvec_clust_min,0.239535,0.0


chair


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.725834,0.0
ewm_clust,0.682569,0.0
minvec,0.663958,0.0
minvec_clust,0.67936,0.0
minvec_clust_min,0.289255,0.0


cup


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.535849,0.0
ewm_clust,0.335433,0.0
minvec,0.237803,0.0
minvec_clust,0.222487,0.0
minvec_clust_min,0.089418,0.0603


key


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.53582,0.0
ewm_clust,0.226804,0.0
minvec,-0.213295,0.0
minvec_clust,-0.197579,0.0001
minvec_clust_min,-0.099754,0.0517


pencil


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.719797,0.0
ewm_clust,0.512636,0.0
minvec,0.551117,0.0
minvec_clust,0.360006,0.0
minvec_clust_min,-0.047632,0.3296


rope


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.600812,0.0
ewm_clust,0.289494,0.0
minvec,0.410349,0.0
minvec_clust,0.223728,0.0
minvec_clust_min,0.028252,0.5378


shoe


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.690989,0.0
ewm_clust,0.433168,0.0
minvec,0.554079,0.0
minvec_clust,0.384637,0.0
minvec_clust_min,0.053397,0.2843


In [52]:
def write_semdis_corrs():
    writer = pd.ExcelWriter('novelty_algos_semdis_corrs_results_071321.xlsx', engine='xlsxwriter')
    workbook = writer.book

    for y in range(len(sem_space_list)):
        row = 0
        worksheet = workbook.add_worksheet(sem_space_list_str[y])
        writer.sheets[sem_space_list_str[y]] = worksheet
        for i in range(len(prompts_list)):
            worksheet.write_string(row, 0, prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [sem_space_col_list[y],14,15,16,17,18]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[1:, 0])
            pval_df = pd.DataFrame(pval_df.iloc[1:, 0])
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df.columns = [sem_space_list_str[y], 'pval']
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust','minvec_clust_min']
            corr_pval_df.index.rename('metrics', inplace=True)
            corr_pval_df.to_excel(writer, sheet_name = sem_space_list_str[y], startrow=row + 1, startcol=0, index = True)
            row = row + len(corr_pval_df.index) + 3
            
    writer.save()

In [53]:
# write_semdis_corrs()

### Comparing the performance of algos with Human Raters

In [54]:
# print the correlations with SemDis
def print_human_corrs():
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        for i in range(len(prompts_list)):
            print(prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [19,20,21,14,15,16,17,18]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust','minvec_clust_min']
            corr_pval_df.index.rename('metrics', inplace=True)
            display(corr_pval_df)

In [55]:
print_human_corrs()

ukwac_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.198285,0.0,0.223406,0.0,0.227607,0.0
ewm_clust,0.166847,0.0004,0.211012,0.0,0.204188,0.0
minvec,0.246645,0.0,0.275977,0.0,0.282065,0.0
minvec_clust,0.220691,0.0,0.222808,0.0,0.239109,0.0
minvec_clust_min,0.102437,0.0315,0.088914,0.0621,0.103006,0.0306


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.219305,0.0,0.21627,0.0,0.232813,0.0
ewm_clust,0.300777,0.0,0.335625,0.0,0.340279,0.0
minvec,0.483684,0.0,0.529606,0.0,0.541767,0.0
minvec_clust,0.469584,0.0,0.541265,0.0,0.540544,0.0
minvec_clust_min,0.231408,0.0,0.192175,0.0001,0.226287,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.499952,0.0,0.628264,0.0,0.59416,0.0
ewm_clust,0.618629,0.0,0.742256,0.0,0.717791,0.0
minvec,0.589226,0.0,0.695067,0.0,0.677776,0.0
minvec_clust,0.639989,0.0,0.748992,0.0,0.733217,0.0
minvec_clust_min,0.270543,0.0,0.371467,0.0,0.337122,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.38329,0.0,0.425956,0.0,0.42638,0.0
ewm_clust,0.419393,0.0,0.459668,0.0,0.46316,0.0
minvec,0.593761,0.0,0.572779,0.0,0.614568,0.0
minvec_clust,0.59783,0.0,0.602618,0.0,0.632453,0.0
minvec_clust_min,-0.096665,0.0422,-0.053324,0.2633,-0.078987,0.0972


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.070547,0.1694,0.120169,0.019,0.101227,0.0483
ewm_clust,-0.035328,0.4918,0.021496,0.6758,-0.006279,0.9028
minvec,0.560314,0.0,0.587851,0.0,0.604649,0.0
minvec_clust,0.346704,0.0,0.357176,0.0,0.370566,0.0
minvec_clust_min,0.004356,0.9325,0.012106,0.8138,0.008798,0.8641


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.513172,0.0,0.541416,0.0,0.553862,0.0
ewm_clust,0.655652,0.0,0.645872,0.0,0.684376,0.0
minvec,0.677766,0.0,0.743591,0.0,0.745972,0.0
minvec_clust,0.645646,0.0,0.733876,0.0,0.723567,0.0
minvec_clust_min,0.060769,0.2134,0.200966,0.0,0.134999,0.0055


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.268463,0.0,0.28255,0.0,0.304928,0.0
ewm_clust,0.342429,0.0,0.31891,0.0,0.36804,0.0
minvec,0.544325,0.0,0.443704,0.0,0.553178,0.0
minvec_clust,0.310492,0.0,0.200925,0.0,0.289258,0.0
minvec_clust_min,0.080116,0.0802,0.056523,0.2174,0.076994,0.0927


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.095649,0.0547,0.112679,0.0235,0.109087,0.0284
ewm_clust,-0.012605,0.8006,0.001504,0.976,-0.005677,0.9094
minvec,0.377059,0.0,0.31816,0.0,0.362994,0.0
minvec_clust,0.442943,0.0,0.388255,0.0,0.434134,0.0
minvec_clust_min,-0.050442,0.3118,-0.05145,0.3023,-0.053287,0.2853


cbow_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.286212,0.0,0.299711,0.0,0.316014,0.0
ewm_clust,0.33279,0.0,0.346052,0.0,0.366104,0.0
minvec,0.376063,0.0,0.401132,0.0,0.419255,0.0
minvec_clust,0.321919,0.0,0.385256,0.0,0.381931,0.0
minvec_clust_min,0.037703,0.4296,0.012544,0.7928,0.026809,0.5745


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.267462,0.0,0.211571,0.0,0.255873,0.0
ewm_clust,0.289817,0.0,0.2672,0.0,0.297663,0.0
minvec,0.423515,0.0,0.4323,0.0,0.457476,0.0
minvec_clust,0.550552,0.0,0.59229,0.0,0.611002,0.0
minvec_clust_min,-0.015907,0.7533,-0.084021,0.0963,-0.05363,0.2889


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.551085,0.0,0.669385,0.0,0.643468,0.0
ewm_clust,0.60552,0.0,0.710384,0.0,0.694584,0.0
minvec,0.612139,0.0,0.723872,0.0,0.705012,0.0
minvec_clust,0.663332,0.0,0.728468,0.0,0.73626,0.0
minvec_clust_min,0.005909,0.9028,0.218761,0.0,0.111712,0.0205


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.371139,0.0,0.406647,0.0,0.4098,0.0
ewm_clust,0.472476,0.0,0.511782,0.0,0.518582,0.0
minvec,0.518403,0.0,0.528759,0.0,0.551699,0.0
minvec_clust,0.449113,0.0,0.474107,0.0,0.486413,0.0
minvec_clust_min,-0.260716,0.0,-0.208648,0.0,-0.24724,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.071801,0.1619,0.097013,0.0585,0.089273,0.0818
ewm_clust,0.158031,0.002,0.143299,0.0051,0.158301,0.0019
minvec,0.544813,0.0,0.591558,0.0,0.598781,0.0
minvec_clust,0.57161,0.0,0.622659,0.0,0.629323,0.0
minvec_clust_min,-0.013389,0.7945,0.014405,0.7793,0.001023,0.9841


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.54103,0.0,0.574602,0.0,0.585853,0.0
ewm_clust,0.44175,0.0,0.495632,0.0,0.491774,0.0
minvec,0.655576,0.0,0.688132,0.0,0.705769,0.0
minvec_clust,0.513636,0.0,0.564686,0.0,0.565917,0.0
minvec_clust_min,-0.144158,0.003,-0.022046,0.6519,-0.089629,0.0662


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.126623,0.0056,0.227395,0.0,0.191244,0.0
ewm_clust,0.403195,0.0,0.285535,0.0,0.388025,0.0
minvec,0.277503,0.0,0.29327,0.0,0.315804,0.0
minvec_clust,0.528597,0.0,0.316701,0.0,0.479669,0.0
minvec_clust_min,-0.187584,0.0,-0.218997,0.0,-0.223931,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.059989,0.2289,0.068466,0.1696,0.067245,0.1774
ewm_clust,-0.125182,0.0118,-0.113362,0.0227,-0.124626,0.0122
minvec,0.209444,0.0,0.144587,0.0036,0.184535,0.0002
minvec_clust,0.059393,0.2336,0.050113,0.315,0.057176,0.2515
minvec_clust_min,-0.18207,0.0002,-0.139275,0.005,-0.167643,0.0007


banori
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.165181,0.0005,0.146938,0.002,0.168059,0.0004
ewm_clust,0.01068,0.823,-0.011314,0.8127,-0.000585,0.9902
minvec,0.180866,0.0001,0.193644,0.0,0.202036,0.0
minvec_clust,0.103312,0.0301,0.132783,0.0052,0.127602,0.0073
minvec_clust_min,-0.30806,0.0,-0.275485,0.0,-0.314223,0.0


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.262296,0.0,0.195837,0.0001,0.244668,0.0
ewm_clust,0.322474,0.0,0.282814,0.0,0.32341,0.0
minvec,0.473735,0.0,0.425204,0.0,0.480343,0.0
minvec_clust,0.322764,0.0,0.328829,0.0,0.348307,0.0
minvec_clust_min,0.028301,0.5759,-0.015378,0.7612,0.006768,0.8936


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.550756,0.0,0.681719,0.0,0.649392,0.0
ewm_clust,0.641444,0.0,0.770426,0.0,0.744658,0.0
minvec,0.648121,0.0,0.754915,0.0,0.740753,0.0
minvec_clust,0.694817,0.0,0.800385,0.0,0.789703,0.0
minvec_clust_min,0.026789,0.5796,0.084816,0.0789,0.057176,0.2368


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.165164,0.0005,0.183287,0.0001,0.183594,0.0001
ewm_clust,0.327388,0.0,0.429576,0.0,0.398883,0.0
minvec,0.266258,0.0,0.259333,0.0,0.276899,0.0
minvec_clust,0.252999,0.0,0.331464,0.0,0.307983,0.0
minvec_clust_min,-0.335239,0.0,-0.230306,0.0,-0.297871,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,-0.008641,0.8665,0.037303,0.4679,0.01589,0.7572
ewm_clust,-0.107326,0.0363,-0.10319,0.0441,-0.110701,0.0307
minvec,0.48144,0.0,0.481696,0.0,0.506807,0.0
minvec_clust,0.596436,0.0,0.594347,0.0,0.626554,0.0
minvec_clust_min,0.173283,0.0007,0.193861,0.0001,0.193553,0.0001


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.541558,0.0,0.560541,0.0,0.579009,0.0
ewm_clust,0.542888,0.0,0.640294,0.0,0.620184,0.0
minvec,0.612295,0.0,0.64709,0.0,0.6614,0.0
minvec_clust,0.692355,0.0,0.729915,0.0,0.746975,0.0
minvec_clust_min,-0.301988,0.0,-0.186141,0.0001,-0.258744,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.186292,0.0,0.281966,0.0,0.254872,0.0
ewm_clust,0.166537,0.0003,0.331711,0.0,0.26797,0.0
minvec,0.315293,0.0,0.354952,0.0,0.369764,0.0
minvec_clust,0.482968,0.0,0.441057,0.0,0.514688,0.0
minvec_clust_min,-0.245187,0.0,-0.209562,0.0,-0.254062,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.302717,0.0,0.215963,0.0,0.270431,0.0
ewm_clust,0.241216,0.0,0.202701,0.0,0.231773,0.0
minvec,0.265645,0.0,0.18671,0.0002,0.235821,0.0
minvec_clust,0.241976,0.0,0.165364,0.0008,0.212304,0.0
minvec_clust_min,-0.252717,0.0,-0.183811,0.0002,-0.227635,0.0


TASA
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.078913,0.0979,0.119032,0.0124,0.107153,0.0244
ewm_clust,-0.034388,0.4713,-0.002152,0.9641,-0.019342,0.6854
minvec,0.062562,0.1897,0.136903,0.004,0.108352,0.0229
minvec_clust,-0.052776,0.2688,0.044743,0.3485,-0.003252,0.9457
minvec_clust_min,0.007612,0.8733,0.037219,0.4356,0.024496,0.6079


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.396307,0.0,0.362756,0.0,0.405625,0.0
ewm_clust,0.490002,0.0,0.471823,0.0,0.514054,0.0
minvec,0.537388,0.0,0.507712,0.0,0.558529,0.0
minvec_clust,0.529448,0.0,0.518694,0.0,0.560215,0.0
minvec_clust_min,0.23232,0.0,0.125408,0.0128,0.19087,0.0001


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.551117,0.0,0.680828,0.0,0.649155,0.0
ewm_clust,0.638468,0.0,0.752332,0.0,0.73401,0.0
minvec,0.57853,0.0,0.722439,0.0,0.685282,0.0
minvec_clust,0.599623,0.0,0.725718,0.0,0.698843,0.0
minvec_clust_min,-0.086926,0.0718,0.042896,0.3749,-0.027944,0.5633


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.367854,0.0,0.382416,0.0,0.395287,0.0
ewm_clust,0.486581,0.0,0.521348,0.0,0.531049,0.0
minvec,0.479505,0.0,0.486682,0.0,0.509036,0.0
minvec_clust,0.235956,0.0,0.273292,0.0,0.268324,0.0
minvec_clust_min,-0.407202,0.0,-0.415546,0.0,-0.433466,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.124653,0.0149,0.135453,0.0081,0.137058,0.0074
ewm_clust,0.274942,0.0,0.32224,0.0,0.315069,0.0
minvec,0.469444,0.0,0.538891,0.0,0.531808,0.0
minvec_clust,0.477455,0.0,0.522948,0.0,0.527213,0.0
minvec_clust_min,-0.241026,0.0,-0.168003,0.001,-0.213948,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.420013,0.0,0.547691,0.0,0.50635,0.0
ewm_clust,0.384374,0.0,0.513122,0.0,0.469423,0.0
minvec,0.46404,0.0,0.589185,0.0,0.551354,0.0
minvec_clust,0.416554,0.0,0.538489,0.0,0.4998,0.0
minvec_clust_min,-0.380547,0.0,-0.214503,0.0,-0.31588,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.3966,0.0,0.394666,0.0,0.439011,0.0
ewm_clust,0.533636,0.0,0.483022,0.0,0.566513,0.0
minvec,0.449782,0.0,0.420917,0.0,0.484444,0.0
minvec_clust,0.532782,0.0,0.406143,0.0,0.527265,0.0
minvec_clust_min,-0.179623,0.0001,-0.174473,0.0001,-0.196678,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.30838,0.0,0.286354,0.0,0.310784,0.0
ewm_clust,0.2843,0.0,0.271288,0.0,0.290396,0.0
minvec,0.479374,0.0,0.477693,0.0,0.500429,0.0
minvec_clust,0.44974,0.0,0.443709,0.0,0.467124,0.0
minvec_clust_min,-0.105365,0.0342,-0.103125,0.0383,-0.108998,0.0285


glove_6B
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.164957,0.0005,0.184313,0.0001,0.188502,0.0001
ewm_clust,0.113512,0.0171,0.138712,0.0035,0.13625,0.0042
minvec,0.367363,0.0,0.370438,0.0,0.397775,0.0
minvec_clust,0.412439,0.0,0.390608,0.0,0.432673,0.0
minvec_clust_min,-0.049551,0.2992,-0.206885,0.0,-0.139982,0.0032


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.144087,0.0042,0.120275,0.0171,0.14123,0.005
ewm_clust,-0.129548,0.0101,-0.136314,0.0068,-0.142129,0.0048
minvec,0.519243,0.0,0.50254,0.0,0.546107,0.0
minvec_clust,0.554606,0.0,0.54684,0.0,0.588717,0.0
minvec_clust_min,-0.095376,0.0589,-0.106736,0.0344,-0.108068,0.0322


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.471251,0.0,0.57644,0.0,0.552246,0.0
ewm_clust,0.585325,0.0,0.731768,0.0,0.693748,0.0
minvec,0.653349,0.0,0.73902,0.0,0.735838,0.0
minvec_clust,0.673659,0.0,0.786271,0.0,0.770738,0.0
minvec_clust_min,0.128414,0.0077,0.249746,0.0,0.19639,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.139296,0.0033,0.151077,0.0014,0.152991,0.0013
ewm_clust,0.313218,0.0,0.393344,0.0,0.372312,0.0
minvec,0.394362,0.0,0.37524,0.0,0.405445,0.0
minvec_clust,0.519839,0.0,0.522214,0.0,0.549002,0.0
minvec_clust_min,-0.40372,0.0,-0.377855,0.0,-0.411748,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,-0.182468,0.0003,-0.124313,0.0152,-0.160406,0.0017
ewm_clust,-0.252863,0.0,-0.274984,0.0,-0.278143,0.0
minvec,0.555455,0.0,0.51199,0.0,0.560926,0.0
minvec_clust,0.586438,0.0,0.513935,0.0,0.577742,0.0
minvec_clust_min,0.146078,0.0043,0.076115,0.1381,0.115688,0.0239


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.427104,0.0,0.442396,0.0,0.456802,0.0
ewm_clust,0.568521,0.0,0.648653,0.0,0.638372,0.0
minvec,0.534145,0.0,0.567845,0.0,0.578679,0.0
minvec_clust,0.60885,0.0,0.644926,0.0,0.658427,0.0
minvec_clust_min,-0.457398,0.0,-0.253513,0.0,-0.377486,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.258127,0.0,0.295079,0.0,0.304981,0.0
ewm_clust,0.230857,0.0,0.39216,0.0,0.337376,0.0
minvec,0.457398,0.0,0.405663,0.0,0.481371,0.0
minvec_clust,0.579105,0.0,0.411168,0.0,0.557849,0.0
minvec_clust_min,-0.24215,0.0,-0.193215,0.0,-0.243986,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.194688,0.0001,0.182852,0.0002,0.197306,0.0001
ewm_clust,0.158191,0.0014,0.114959,0.0208,0.142438,0.0041
minvec,0.376357,0.0,0.346184,0.0,0.377539,0.0
minvec_clust,0.459967,0.0,0.432727,0.0,0.466538,0.0
minvec_clust_min,-0.292181,0.0,-0.29343,0.0,-0.306223,0.0


In [56]:
def write_human_corrs():
    writer = pd.ExcelWriter('novelty_algos_humans_corrs_results_071321.xlsx', engine='xlsxwriter')
    workbook = writer.book

    for y in range(len(sem_space_list)):
        row = 0
        worksheet = workbook.add_worksheet(sem_space_list_str[y])
        writer.sheets[sem_space_list_str[y]] = worksheet
        for i in range(len(prompts_list)):
            worksheet.write_string(row, 0, prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [19,20,21,14,15,16,17,18]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust','minvec_clust_min']
            corr_pval_df.index.rename('metrics', inplace=True)
            corr_pval_df.to_excel(writer, sheet_name = sem_space_list_str[y], startrow=row + 1, startcol=0, index = True)
            row = row + len(corr_pval_df.index) + 3
            
    writer.save()

In [57]:
# write_human_corrs()

### Comparing the performance of algos and SemDis with Human Raters

In [58]:
# print the correlations with SemDis
def print_compiled_corrs():
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        for i in range(len(prompts_list)):
            print(prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [19,20,21,8,9,10,11,12,14,15,16,17,18]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index.rename('metrics', inplace=True)
            display(corr_pval_df)

In [59]:
print_compiled_corrs()

ukwac_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.198285,0.0,0.223406,0.0,0.227607,0.0
ewm_vector_cosine_dis_clus_avg,0.166847,0.0004,0.211012,0.0,0.204188,0.0
minima_vector_cosine_dis,0.246645,0.0,0.275977,0.0,0.282065,0.0
minima_vector_cosine_dis_clus_avg,0.220691,0.0,0.222808,0.0,0.239109,0.0
minima_vector_cosine_dis_clus_min,0.102437,0.0315,0.088914,0.0621,0.103006,0.0306


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.219305,0.0,0.21627,0.0,0.232813,0.0
ewm_vector_cosine_dis_clus_avg,0.300777,0.0,0.335625,0.0,0.340279,0.0
minima_vector_cosine_dis,0.483684,0.0,0.529606,0.0,0.541767,0.0
minima_vector_cosine_dis_clus_avg,0.469584,0.0,0.541265,0.0,0.540544,0.0
minima_vector_cosine_dis_clus_min,0.231408,0.0,0.192175,0.0001,0.226287,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.499952,0.0,0.628264,0.0,0.59416,0.0
ewm_vector_cosine_dis_clus_avg,0.618629,0.0,0.742256,0.0,0.717791,0.0
minima_vector_cosine_dis,0.589226,0.0,0.695067,0.0,0.677776,0.0
minima_vector_cosine_dis_clus_avg,0.639989,0.0,0.748992,0.0,0.733217,0.0
minima_vector_cosine_dis_clus_min,0.270543,0.0,0.371467,0.0,0.337122,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.38329,0.0,0.425956,0.0,0.42638,0.0
ewm_vector_cosine_dis_clus_avg,0.419393,0.0,0.459668,0.0,0.46316,0.0
minima_vector_cosine_dis,0.593761,0.0,0.572779,0.0,0.614568,0.0
minima_vector_cosine_dis_clus_avg,0.59783,0.0,0.602618,0.0,0.632453,0.0
minima_vector_cosine_dis_clus_min,-0.096665,0.0422,-0.053324,0.2633,-0.078987,0.0972


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,0.070547,0.1694,0.120169,0.019,0.101227,0.0483
ewm_vector_cosine_dis_clus_avg,-0.035328,0.4918,0.021496,0.6758,-0.006279,0.9028
minima_vector_cosine_dis,0.560314,0.0,0.587851,0.0,0.604649,0.0
minima_vector_cosine_dis_clus_avg,0.346704,0.0,0.357176,0.0,0.370566,0.0
minima_vector_cosine_dis_clus_min,0.004356,0.9325,0.012106,0.8138,0.008798,0.8641


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.513172,0.0,0.541416,0.0,0.553862,0.0
ewm_vector_cosine_dis_clus_avg,0.655652,0.0,0.645872,0.0,0.684376,0.0
minima_vector_cosine_dis,0.677766,0.0,0.743591,0.0,0.745972,0.0
minima_vector_cosine_dis_clus_avg,0.645646,0.0,0.733876,0.0,0.723567,0.0
minima_vector_cosine_dis_clus_min,0.060769,0.2134,0.200966,0.0,0.134999,0.0055


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.268463,0.0,0.28255,0.0,0.304928,0.0
ewm_vector_cosine_dis_clus_avg,0.342429,0.0,0.31891,0.0,0.36804,0.0
minima_vector_cosine_dis,0.544325,0.0,0.443704,0.0,0.553178,0.0
minima_vector_cosine_dis_clus_avg,0.310492,0.0,0.200925,0.0,0.289258,0.0
minima_vector_cosine_dis_clus_min,0.080116,0.0802,0.056523,0.2174,0.076994,0.0927


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.095649,0.0547,0.112679,0.0235,0.109087,0.0284
ewm_vector_cosine_dis_clus_avg,-0.012605,0.8006,0.001504,0.976,-0.005677,0.9094
minima_vector_cosine_dis,0.377059,0.0,0.31816,0.0,0.362994,0.0
minima_vector_cosine_dis_clus_avg,0.442943,0.0,0.388255,0.0,0.434134,0.0
minima_vector_cosine_dis_clus_min,-0.050442,0.3118,-0.05145,0.3023,-0.053287,0.2853


cbow_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.286212,0.0,0.299711,0.0,0.316014,0.0
ewm_vector_cosine_dis_clus_avg,0.33279,0.0,0.346052,0.0,0.366104,0.0
minima_vector_cosine_dis,0.376063,0.0,0.401132,0.0,0.419255,0.0
minima_vector_cosine_dis_clus_avg,0.321919,0.0,0.385256,0.0,0.381931,0.0
minima_vector_cosine_dis_clus_min,0.037703,0.4296,0.012544,0.7928,0.026809,0.5745


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.267462,0.0,0.211571,0.0,0.255873,0.0
ewm_vector_cosine_dis_clus_avg,0.289817,0.0,0.2672,0.0,0.297663,0.0
minima_vector_cosine_dis,0.423515,0.0,0.4323,0.0,0.457476,0.0
minima_vector_cosine_dis_clus_avg,0.550552,0.0,0.59229,0.0,0.611002,0.0
minima_vector_cosine_dis_clus_min,-0.015907,0.7533,-0.084021,0.0963,-0.05363,0.2889


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.551085,0.0,0.669385,0.0,0.643468,0.0
ewm_vector_cosine_dis_clus_avg,0.60552,0.0,0.710384,0.0,0.694584,0.0
minima_vector_cosine_dis,0.612139,0.0,0.723872,0.0,0.705012,0.0
minima_vector_cosine_dis_clus_avg,0.663332,0.0,0.728468,0.0,0.73626,0.0
minima_vector_cosine_dis_clus_min,0.005909,0.9028,0.218761,0.0,0.111712,0.0205


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.371139,0.0,0.406647,0.0,0.4098,0.0
ewm_vector_cosine_dis_clus_avg,0.472476,0.0,0.511782,0.0,0.518582,0.0
minima_vector_cosine_dis,0.518403,0.0,0.528759,0.0,0.551699,0.0
minima_vector_cosine_dis_clus_avg,0.449113,0.0,0.474107,0.0,0.486413,0.0
minima_vector_cosine_dis_clus_min,-0.260716,0.0,-0.208648,0.0,-0.24724,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,0.071801,0.1619,0.097013,0.0585,0.089273,0.0818
ewm_vector_cosine_dis_clus_avg,0.158031,0.002,0.143299,0.0051,0.158301,0.0019
minima_vector_cosine_dis,0.544813,0.0,0.591558,0.0,0.598781,0.0
minima_vector_cosine_dis_clus_avg,0.57161,0.0,0.622659,0.0,0.629323,0.0
minima_vector_cosine_dis_clus_min,-0.013389,0.7945,0.014405,0.7793,0.001023,0.9841


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.54103,0.0,0.574602,0.0,0.585853,0.0
ewm_vector_cosine_dis_clus_avg,0.44175,0.0,0.495632,0.0,0.491774,0.0
minima_vector_cosine_dis,0.655576,0.0,0.688132,0.0,0.705769,0.0
minima_vector_cosine_dis_clus_avg,0.513636,0.0,0.564686,0.0,0.565917,0.0
minima_vector_cosine_dis_clus_min,-0.144158,0.003,-0.022046,0.6519,-0.089629,0.0662


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.126623,0.0056,0.227395,0.0,0.191244,0.0
ewm_vector_cosine_dis_clus_avg,0.403195,0.0,0.285535,0.0,0.388025,0.0
minima_vector_cosine_dis,0.277503,0.0,0.29327,0.0,0.315804,0.0
minima_vector_cosine_dis_clus_avg,0.528597,0.0,0.316701,0.0,0.479669,0.0
minima_vector_cosine_dis_clus_min,-0.187584,0.0,-0.218997,0.0,-0.223931,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.059989,0.2289,0.068466,0.1696,0.067245,0.1774
ewm_vector_cosine_dis_clus_avg,-0.125182,0.0118,-0.113362,0.0227,-0.124626,0.0122
minima_vector_cosine_dis,0.209444,0.0,0.144587,0.0036,0.184535,0.0002
minima_vector_cosine_dis_clus_avg,0.059393,0.2336,0.050113,0.315,0.057176,0.2515
minima_vector_cosine_dis_clus_min,-0.18207,0.0002,-0.139275,0.005,-0.167643,0.0007


banori
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.165181,0.0005,0.146938,0.002,0.168059,0.0004
ewm_vector_cosine_dis_clus_avg,0.01068,0.823,-0.011314,0.8127,-0.000585,0.9902
minima_vector_cosine_dis,0.180866,0.0001,0.193644,0.0,0.202036,0.0
minima_vector_cosine_dis_clus_avg,0.103312,0.0301,0.132783,0.0052,0.127602,0.0073
minima_vector_cosine_dis_clus_min,-0.30806,0.0,-0.275485,0.0,-0.314223,0.0


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.262296,0.0,0.195837,0.0001,0.244668,0.0
ewm_vector_cosine_dis_clus_avg,0.322474,0.0,0.282814,0.0,0.32341,0.0
minima_vector_cosine_dis,0.473735,0.0,0.425204,0.0,0.480343,0.0
minima_vector_cosine_dis_clus_avg,0.322764,0.0,0.328829,0.0,0.348307,0.0
minima_vector_cosine_dis_clus_min,0.028301,0.5759,-0.015378,0.7612,0.006768,0.8936


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.550756,0.0,0.681719,0.0,0.649392,0.0
ewm_vector_cosine_dis_clus_avg,0.641444,0.0,0.770426,0.0,0.744658,0.0
minima_vector_cosine_dis,0.648121,0.0,0.754915,0.0,0.740753,0.0
minima_vector_cosine_dis_clus_avg,0.694817,0.0,0.800385,0.0,0.789703,0.0
minima_vector_cosine_dis_clus_min,0.026789,0.5796,0.084816,0.0789,0.057176,0.2368


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.165164,0.0005,0.183287,0.0001,0.183594,0.0001
ewm_vector_cosine_dis_clus_avg,0.327388,0.0,0.429576,0.0,0.398883,0.0
minima_vector_cosine_dis,0.266258,0.0,0.259333,0.0,0.276899,0.0
minima_vector_cosine_dis_clus_avg,0.252999,0.0,0.331464,0.0,0.307983,0.0
minima_vector_cosine_dis_clus_min,-0.335239,0.0,-0.230306,0.0,-0.297871,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,-0.008641,0.8665,0.037303,0.4679,0.01589,0.7572
ewm_vector_cosine_dis_clus_avg,-0.107326,0.0363,-0.10319,0.0441,-0.110701,0.0307
minima_vector_cosine_dis,0.48144,0.0,0.481696,0.0,0.506807,0.0
minima_vector_cosine_dis_clus_avg,0.596436,0.0,0.594347,0.0,0.626554,0.0
minima_vector_cosine_dis_clus_min,0.173283,0.0007,0.193861,0.0001,0.193553,0.0001


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.541558,0.0,0.560541,0.0,0.579009,0.0
ewm_vector_cosine_dis_clus_avg,0.542888,0.0,0.640294,0.0,0.620184,0.0
minima_vector_cosine_dis,0.612295,0.0,0.64709,0.0,0.6614,0.0
minima_vector_cosine_dis_clus_avg,0.692355,0.0,0.729915,0.0,0.746975,0.0
minima_vector_cosine_dis_clus_min,-0.301988,0.0,-0.186141,0.0001,-0.258744,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.186292,0.0,0.281966,0.0,0.254872,0.0
ewm_vector_cosine_dis_clus_avg,0.166537,0.0003,0.331711,0.0,0.26797,0.0
minima_vector_cosine_dis,0.315293,0.0,0.354952,0.0,0.369764,0.0
minima_vector_cosine_dis_clus_avg,0.482968,0.0,0.441057,0.0,0.514688,0.0
minima_vector_cosine_dis_clus_min,-0.245187,0.0,-0.209562,0.0,-0.254062,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.302717,0.0,0.215963,0.0,0.270431,0.0
ewm_vector_cosine_dis_clus_avg,0.241216,0.0,0.202701,0.0,0.231773,0.0
minima_vector_cosine_dis,0.265645,0.0,0.18671,0.0002,0.235821,0.0
minima_vector_cosine_dis_clus_avg,0.241976,0.0,0.165364,0.0008,0.212304,0.0
minima_vector_cosine_dis_clus_min,-0.252717,0.0,-0.183811,0.0002,-0.227635,0.0


TASA
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.078913,0.0979,0.119032,0.0124,0.107153,0.0244
ewm_vector_cosine_dis_clus_avg,-0.034388,0.4713,-0.002152,0.9641,-0.019342,0.6854
minima_vector_cosine_dis,0.062562,0.1897,0.136903,0.004,0.108352,0.0229
minima_vector_cosine_dis_clus_avg,-0.052776,0.2688,0.044743,0.3485,-0.003252,0.9457
minima_vector_cosine_dis_clus_min,0.007612,0.8733,0.037219,0.4356,0.024496,0.6079


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.396307,0.0,0.362756,0.0,0.405625,0.0
ewm_vector_cosine_dis_clus_avg,0.490002,0.0,0.471823,0.0,0.514054,0.0
minima_vector_cosine_dis,0.537388,0.0,0.507712,0.0,0.558529,0.0
minima_vector_cosine_dis_clus_avg,0.529448,0.0,0.518694,0.0,0.560215,0.0
minima_vector_cosine_dis_clus_min,0.23232,0.0,0.125408,0.0128,0.19087,0.0001


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.551117,0.0,0.680828,0.0,0.649155,0.0
ewm_vector_cosine_dis_clus_avg,0.638468,0.0,0.752332,0.0,0.73401,0.0
minima_vector_cosine_dis,0.57853,0.0,0.722439,0.0,0.685282,0.0
minima_vector_cosine_dis_clus_avg,0.599623,0.0,0.725718,0.0,0.698843,0.0
minima_vector_cosine_dis_clus_min,-0.086926,0.0718,0.042896,0.3749,-0.027944,0.5633


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.367854,0.0,0.382416,0.0,0.395287,0.0
ewm_vector_cosine_dis_clus_avg,0.486581,0.0,0.521348,0.0,0.531049,0.0
minima_vector_cosine_dis,0.479505,0.0,0.486682,0.0,0.509036,0.0
minima_vector_cosine_dis_clus_avg,0.235956,0.0,0.273292,0.0,0.268324,0.0
minima_vector_cosine_dis_clus_min,-0.407202,0.0,-0.415546,0.0,-0.433466,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,0.124653,0.0149,0.135453,0.0081,0.137058,0.0074
ewm_vector_cosine_dis_clus_avg,0.274942,0.0,0.32224,0.0,0.315069,0.0
minima_vector_cosine_dis,0.469444,0.0,0.538891,0.0,0.531808,0.0
minima_vector_cosine_dis_clus_avg,0.477455,0.0,0.522948,0.0,0.527213,0.0
minima_vector_cosine_dis_clus_min,-0.241026,0.0,-0.168003,0.001,-0.213948,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.420013,0.0,0.547691,0.0,0.50635,0.0
ewm_vector_cosine_dis_clus_avg,0.384374,0.0,0.513122,0.0,0.469423,0.0
minima_vector_cosine_dis,0.46404,0.0,0.589185,0.0,0.551354,0.0
minima_vector_cosine_dis_clus_avg,0.416554,0.0,0.538489,0.0,0.4998,0.0
minima_vector_cosine_dis_clus_min,-0.380547,0.0,-0.214503,0.0,-0.31588,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.3966,0.0,0.394666,0.0,0.439011,0.0
ewm_vector_cosine_dis_clus_avg,0.533636,0.0,0.483022,0.0,0.566513,0.0
minima_vector_cosine_dis,0.449782,0.0,0.420917,0.0,0.484444,0.0
minima_vector_cosine_dis_clus_avg,0.532782,0.0,0.406143,0.0,0.527265,0.0
minima_vector_cosine_dis_clus_min,-0.179623,0.0001,-0.174473,0.0001,-0.196678,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.30838,0.0,0.286354,0.0,0.310784,0.0
ewm_vector_cosine_dis_clus_avg,0.2843,0.0,0.271288,0.0,0.290396,0.0
minima_vector_cosine_dis,0.479374,0.0,0.477693,0.0,0.500429,0.0
minima_vector_cosine_dis_clus_avg,0.44974,0.0,0.443709,0.0,0.467124,0.0
minima_vector_cosine_dis_clus_min,-0.105365,0.0342,-0.103125,0.0383,-0.108998,0.0285


glove_6B
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.164957,0.0005,0.184313,0.0001,0.188502,0.0001
ewm_vector_cosine_dis_clus_avg,0.113512,0.0171,0.138712,0.0035,0.13625,0.0042
minima_vector_cosine_dis,0.367363,0.0,0.370438,0.0,0.397775,0.0
minima_vector_cosine_dis_clus_avg,0.412439,0.0,0.390608,0.0,0.432673,0.0
minima_vector_cosine_dis_clus_min,-0.049551,0.2992,-0.206885,0.0,-0.139982,0.0032


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.144087,0.0042,0.120275,0.0171,0.14123,0.005
ewm_vector_cosine_dis_clus_avg,-0.129548,0.0101,-0.136314,0.0068,-0.142129,0.0048
minima_vector_cosine_dis,0.519243,0.0,0.50254,0.0,0.546107,0.0
minima_vector_cosine_dis_clus_avg,0.554606,0.0,0.54684,0.0,0.588717,0.0
minima_vector_cosine_dis_clus_min,-0.095376,0.0589,-0.106736,0.0344,-0.108068,0.0322


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.471251,0.0,0.57644,0.0,0.552246,0.0
ewm_vector_cosine_dis_clus_avg,0.585325,0.0,0.731768,0.0,0.693748,0.0
minima_vector_cosine_dis,0.653349,0.0,0.73902,0.0,0.735838,0.0
minima_vector_cosine_dis_clus_avg,0.673659,0.0,0.786271,0.0,0.770738,0.0
minima_vector_cosine_dis_clus_min,0.128414,0.0077,0.249746,0.0,0.19639,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.139296,0.0033,0.151077,0.0014,0.152991,0.0013
ewm_vector_cosine_dis_clus_avg,0.313218,0.0,0.393344,0.0,0.372312,0.0
minima_vector_cosine_dis,0.394362,0.0,0.37524,0.0,0.405445,0.0
minima_vector_cosine_dis_clus_avg,0.519839,0.0,0.522214,0.0,0.549002,0.0
minima_vector_cosine_dis_clus_min,-0.40372,0.0,-0.377855,0.0,-0.411748,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,-0.182468,0.0003,-0.124313,0.0152,-0.160406,0.0017
ewm_vector_cosine_dis_clus_avg,-0.252863,0.0,-0.274984,0.0,-0.278143,0.0
minima_vector_cosine_dis,0.555455,0.0,0.51199,0.0,0.560926,0.0
minima_vector_cosine_dis_clus_avg,0.586438,0.0,0.513935,0.0,0.577742,0.0
minima_vector_cosine_dis_clus_min,0.146078,0.0043,0.076115,0.1381,0.115688,0.0239


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.427104,0.0,0.442396,0.0,0.456802,0.0
ewm_vector_cosine_dis_clus_avg,0.568521,0.0,0.648653,0.0,0.638372,0.0
minima_vector_cosine_dis,0.534145,0.0,0.567845,0.0,0.578679,0.0
minima_vector_cosine_dis_clus_avg,0.60885,0.0,0.644926,0.0,0.658427,0.0
minima_vector_cosine_dis_clus_min,-0.457398,0.0,-0.253513,0.0,-0.377486,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.258127,0.0,0.295079,0.0,0.304981,0.0
ewm_vector_cosine_dis_clus_avg,0.230857,0.0,0.39216,0.0,0.337376,0.0
minima_vector_cosine_dis,0.457398,0.0,0.405663,0.0,0.481371,0.0
minima_vector_cosine_dis_clus_avg,0.579105,0.0,0.411168,0.0,0.557849,0.0
minima_vector_cosine_dis_clus_min,-0.24215,0.0,-0.193215,0.0,-0.243986,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.194688,0.0001,0.182852,0.0002,0.197306,0.0001
ewm_vector_cosine_dis_clus_avg,0.158191,0.0014,0.114959,0.0208,0.142438,0.0041
minima_vector_cosine_dis,0.376357,0.0,0.346184,0.0,0.377539,0.0
minima_vector_cosine_dis_clus_avg,0.459967,0.0,0.432727,0.0,0.466538,0.0
minima_vector_cosine_dis_clus_min,-0.292181,0.0,-0.29343,0.0,-0.306223,0.0


In [60]:
def write_compiled_corrs():
    writer = pd.ExcelWriter('novelty_algos_compiled_corrs_results_071321.xlsx', engine='xlsxwriter')
    workbook = writer.book

    for y in range(len(sem_space_list)):
        row = 0
        worksheet = workbook.add_worksheet(sem_space_list_str[y])
        writer.sheets[sem_space_list_str[y]] = worksheet
        for i in range(len(prompts_list)):
            worksheet.write_string(row, 0, prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [19,20,21,8,9,10,11,12,14,15,16,17,18]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index.rename('metrics', inplace=True)
            corr_pval_df.to_excel(writer, sheet_name = sem_space_list_str[y], startrow=row + 1, startcol=0, index = True)
            row = row + len(corr_pval_df.index) + 3
            
    writer.save()

In [61]:
# write_compiled_corrs()

### Comparing Novelty Scores with Flexibility Scores

In [66]:
# individual df's for each sheet

# when on pc
flexibility_official_cup = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='cup')
flexibility_official_key = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='key')
flexibility_official_rope = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='rope')
flexibility_official_brick = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='brick')
flexibility_official_chair = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='chair')
flexibility_official_pencil = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='pencil')
flexibility_official_shoe = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='shoe')
flexibility_official_box = pd.read_excel("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='box')

# when on mac
# flexibility_official_cup = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='cup')
# flexibility_official_key = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='key')
# flexibility_official_rope = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='rope')
# flexibility_official_brick = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='brick')
# flexibility_official_chair = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='chair')
# flexibility_official_pencil = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='pencil')
# flexibility_official_shoe = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='shoe')
# flexibility_official_box = pd.read_excel("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/results/results_methods/flexibility_results/flexibility_methods_results_071421.xlsx", sheet_name='box')

In [102]:
flex_data_list = [flexibility_official_box, flexibility_official_brick, flexibility_official_chair, flexibility_official_cup, flexibility_official_key, flexibility_official_pencil, flexibility_official_rope, flexibility_official_shoe]

In [68]:
prompts_df_list = []
for i in range(len(prompts_list)):
    df_list = []
    for y in range(len(sem_space_list_str)):
        df_list.append(results_dict[prompts_list[i] + underscore + sem_space_list_str[0] + underscore + "results"])
    prompts_df_list.append(df_list)

In [83]:
combined_df_list = []
for i in range(len(prompts_df_list)):
    combined_df = pd.concat(prompts_df_list[i])
    combined_df_list.append(combined_df)

In [99]:
def get_novelty_avg(collapse_prompt_df):
    # get id list
    id_list = get_id_list(collapse_prompt_df)
    participant_values = []
    
    for participant in id_list:
        values_list = []
        values_list.append(participant)
        temp_df = collapse_prompt_df.loc[collapse_prompt_df['id'] == participant]
        values_list.append(temp_df["ewm_vector_cosine_dis"].mean())
        values_list.append(temp_df["ewm_vector_cosine_dis_clus_avg"].mean())
        values_list.append(temp_df["minima_vector_cosine_dis"].mean())
        values_list.append(temp_df["minima_vector_cosine_dis_clus_avg"].mean())
        values_list.append(temp_df["minima_vector_cosine_dis_clus_min"].mean())
        values_tuple = tuple(values_list)
        participant_values.append(values_tuple)
        
    participant_avg_novelty_df = pd.DataFrame(participant_values, columns=['id', 'ewm', 'ewm_clust', 'minvec', 'minvec_clust', "minvec_min"])
    
    return participant_avg_novelty_df

In [108]:
def get_novelty_avg_flexibility_df(participant_avg_novelty_df, index):
    df_cd = pd.merge(flex_data_list[i], participant_avg_novelty_df, how='inner', on = 'id')
    df_cd = df_cd.loc[:, df_cd.columns != 'id']

    return df_cd

In [127]:
def calculate_corrs_novelty_flexibility():
    for i in range(len(combined_df_list)):
        print(prompts_list[i])
        participant_avg_novelty_df = get_novelty_avg(combined_df_list[i])
        combined_df = get_novelty_avg_flexibility_df(participant_avg_novelty_df, i)
        pval_df = calculate_pvalues(combined_df)
        corrs_df = combined_df.corr()
        corrs_df = pd.DataFrame(corrs_df.iloc[2:, :2])
        pval_df = pd.DataFrame(pval_df.iloc[2:, :2])
        pval_df.columns = ['rating_pval', 'method_pval']
        corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
        corr_pval_df = corr_pval_df[['rating','rating_pval', 'method_avg', 'method_pval']]
        corr_pval_df.index.rename('metrics', inplace=True)
        display(corr_pval_df)

In [128]:
calculate_corrs_novelty_flexibility()

box


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,-0.079815,0.4572,0.257462,0.0149
ewm_clust,0.075572,0.4815,0.250704,0.0178
minvec,-0.109651,0.3063,0.200379,0.0597
minvec_clust,0.073616,0.493,0.250914,0.0177
minvec_min,-0.015258,0.8871,0.131829,0.2181


brick


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,-0.005971,0.9557,0.053811,0.6165
ewm_clust,0.092104,0.3906,0.044584,0.6782
minvec,-0.022362,0.8352,0.006657,0.9506
minvec_clust,0.201151,0.0587,0.21909,0.0391
minvec_min,0.099946,0.3514,0.005607,0.9584


chair


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,0.119797,0.2607,0.380768,0.0002
ewm_clust,0.215403,0.0415,0.291424,0.0053
minvec,0.20911,0.0479,0.37374,0.0003
minvec_clust,0.222308,0.0352,0.289083,0.0057
minvec_min,-0.022725,0.8316,0.215778,0.0411


cup


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,-0.076304,0.4773,0.077207,0.4721
ewm_clust,-0.096486,0.3684,0.182157,0.0875
minvec,0.070378,0.5122,0.088289,0.4107
minvec_clust,-0.081755,0.4463,0.09104,0.3962
minvec_min,-0.074825,0.4859,0.010482,0.9223


key


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,-0.154487,0.146,-0.000788,0.9941
ewm_clust,-0.151635,0.1537,-0.057237,0.5921
minvec,-0.127107,0.2325,0.069065,0.5177
minvec_clust,-0.029828,0.7802,0.069781,0.5134
minvec_min,-0.10039,0.3465,-0.112703,0.2902


pencil


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,0.152696,0.1508,0.172735,0.1035
ewm_clust,0.161952,0.1273,0.047569,0.6562
minvec,0.205753,0.0517,0.183515,0.0834
minvec_clust,0.222325,0.0352,0.158873,0.1347
minvec_min,0.043359,0.6849,0.181487,0.0869


rope


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,0.109608,0.3038,0.220619,0.0367
ewm_clust,0.157899,0.1372,0.194632,0.066
minvec,0.18816,0.0757,0.187858,0.0762
minvec_clust,0.131888,0.2153,0.033609,0.7532
minvec_min,0.096709,0.3645,0.140441,0.1867


shoe


Unnamed: 0_level_0,rating,rating_pval,method_avg,method_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ewm,-0.038865,0.7161,0.071254,0.5045
ewm_clust,-0.069245,0.5167,-0.042127,0.6934
minvec,0.031848,0.7657,0.13212,0.2145
minvec_clust,0.108278,0.3097,0.255131,0.0152
minvec_min,-0.04377,0.6821,-0.029578,0.782


Algo Design Brainstorming:
* Word2vec, average similarity between words in two responses
* Problems:
    * phrases vs words
    * compare between just two responses or the whole list of responses
* Algo Idea
    * find similarity between prompt and average of word in response
    * the average is the value that represents novelty
* Things to Look Into:
    * doc2Vec - look into sentence vector
        * follow similar logic to above but no need to average for multiple words
    * Word2Vec + SIF + Cosine Similarity
    * Word2Vec + WMD 
* Course of Actions for Missing Words/Misspellings?
    * hand remove?
* What to do when whole phrase stop words
    * make 0

To Do List
- [x] write preprocessing methods
- [x] write out initial algo
- [x] implement first algo idea
- [x] set up work environment on macbook
- [x] look into Word2Vec + SIF + Cosine Similarity
- [x] set up excel sheet download with different sheets
- [x] use ukwac semantic space for cosine distance
- [x] set up element wise multiplied vectors for cosine distance algo
- [x] set up phrase minima vector for cosine distance algo 
- [x] Compare the ewm and phrase minima algos with SemDis
- [x] Get a sense of the effect of the compositions
- [x] 1 - to get the distance
- [x] Remove the word could 
    - had to lowercase first and then remove stop words
- [x] write clustering method
    - count vectorizer and kmeans
- [x] update clustering method to use cosine distance not euclidean 
    - changed from scikit learn to cosine distance from NLTK 
- [x] write algo to average novelty scores of responses in the same category
    - do for both ewm and minima
- [x] use the elbow method to figure out how many clusters to use
- [x] download and upload all the semantic spaces from SemDis
- [x] update methods to change semantic spaces
- [x] copy semantic spaces to mac
- [x] set up scripts to pass in official data for pc and mac
- [x] add .DS_Store to gitignore
- [x] get SemDis results for all 6 different semantic spaces with the official data
- [x] compare performance on the 6 different semantic spaces
    - correlation tests for coefficient and significance
- [x] update stop words list to include "use" and "thing"
- [ ] figure out how many times to run kmeans
    - cross validation
    - then averaging the results of all the iterations
- [ ] figure out way to automate the number of clusters used in each run
 
Don't look into unless you have time
- [ ] look into doc2vec 
- [ ] look into Word2Vec + WMD 
- [ ] look at https://github.com/PrincetonML/SIF for better SIF 
