# Novelty Analysis

## Comparing Novelty Algorithms to SemDis 
## Comparing Novelty Algorithms to Human Raters

### Import Packages

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from functools import reduce
import openpyxl
import xlsxwriter

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk import ngrams, FreqDist
from nltk.lm import NgramCounter
import string
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from spacy.lang.en.stop_words import STOP_WORDS

from collections import Counter
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from scipy import spatial

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from nltk.cluster.kmeans import KMeansClusterer

from scipy import stats
from scipy.stats import pearsonr

import pingouin as pg

import glob

### Put Data from Excel Sheet into Dataframes

In [2]:
# individual df's for each sheet

# when on pc
data_official_cup = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_cup_semdis.csv")
data_official_key = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_key_semdis.csv")
data_official_rope = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_rope_semdis.csv")
data_official_brick = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_brick_semdis.csv")
data_official_chair = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_chair_semdis.csv")
data_official_pencil = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_pencil_semdis.csv")
data_official_shoe = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_shoe_semdis.csv")
data_official_box = pd.read_csv("C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_box_semdis.csv")

# when on mac
# data_official_cup = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_cup_semdis.csv")
# data_official_key = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_key_semdis.csv")
# data_official_rope = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_rope_semdis.csv")
# data_official_brick = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_brick_semdis.csv")
# data_official_chair = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_chair_semdis.csv")
# data_official_pencil = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_pencil_semdis.csv")
# data_official_shoe = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_shoe_semdis.csv")
# data_official_box = pd.read_csv("/Users/johnhenrycruz/Desktop/Northwestern_SROP/AUT-Scoring/data/novelty/official/semdis/autdata_official_box_semdis.csv")

In [3]:
results_dict = {}
for filename in glob.glob('C:/Users/jhec8/Documents/Northwestern_SROP/AUT-Scoring/results/novelty_070921/*.csv'):
    results_dict[filename[78:-4]] = pd.read_csv(filename)

### Preprocessing

In [4]:
# nltk corpus stop words
stopwords_nltk = stopwords.words('english')
# spacy stop words
stopwords_spacy = STOP_WORDS

In [5]:
stopwords_edited = list(stopwords_spacy)
stopwords_edited.append("thing")
stopwords_edited.append("use")
stopwords_edited.append("things")

In [6]:
# method to clean the responses
def process_text(text, stopwords_list, remove_sw, join_list):
    # tokenize text, lemmanize words, removing punctuation, remove stop words, lowercase all words

    # hardcorded for special situations
    text = re.sub(r"doorstoppper","doorstop", text)
    
    text = re.sub("/|-"," ", text)
    text = text.translate(str.maketrans('','',string.punctuation))
    tokens = word_tokenize(text)

    tokens = [w.lower() for w in tokens]
    
    if remove_sw:
        tokens = [word for word in tokens if word not in stopwords_list]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
#         stemmer = PorterStemmer()
#         tokens = [stemmer.stem(t) for t in tokens]

    if join_list:
        tokens = ' '.join(tokens)
 
    return tokens

### General Functions

In [7]:
# method to get a list of participants
def get_id_list(df):
    id_list = df['id'].unique()
    id_list = sorted(id_list)
    return id_list

In [8]:
# method to add a new column
# new column are cleaned responses
def get_cleaned_responses(df, stopwords_list, remove_sw, join_list):
    # id_df = df[df.id == id]
    df_processed = df.copy(deep=True)
    responses = df['response'].tolist()

    # make list of processed responses
    for response in range(len(responses)):
        responses[response] = process_text(responses[response], stopwords_list, remove_sw, join_list)

    # add list as column in df
    df_processed['response_processed'] = responses

    return df_processed

## Semantic Spaces


### ukwac subtitles Semantic Space

In [9]:
# when on pc
ukwac_subtitles = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/cbow_6_ukwac_subtitle.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/cbow_6_ukwac_subtitle.txt', delimiter = " ", header = None, encoding='latin-1')

ukwac_subtitles = ukwac_subtitles.set_index(0)


In [10]:
ukwac_subtitles

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
biennials,0.018570,-0.115743,0.052710,-0.103825,-0.003274,-0.111418,-0.178500,-0.168650,-0.059751,-0.139651,...,0.121335,-0.021174,-0.113822,-0.110219,0.002543,-0.079831,0.103625,0.188465,0.117148,0.035641
fawn,0.071241,0.080455,0.216394,-0.282252,-0.172025,-0.186543,0.040009,0.044497,-0.045530,-0.079626,...,0.134902,-0.007096,-0.046929,0.022484,-0.042331,0.082441,0.077913,0.417162,-0.313959,-0.040423
gai,-0.183570,-0.039461,0.036953,-0.330616,-0.290676,0.095041,0.187567,0.019878,-0.115750,0.078831,...,0.042250,-0.009011,-0.311596,-0.131843,0.078453,0.048432,-0.005434,-0.001165,-0.072534,-0.151484
nunnery,-0.037334,0.180116,0.046011,-0.454054,-0.102627,-0.037614,-0.081269,-0.144102,0.018661,-0.312565,...,0.099364,-0.011798,-0.315447,0.160066,-0.179072,0.085825,-0.018394,0.055811,0.460388,0.107053
icici,0.010815,-0.050479,0.164478,-0.035702,-0.192204,0.003913,-0.063542,-0.095691,-0.131409,-0.090370,...,-0.040970,-0.021843,-0.071962,-0.120538,-0.018662,-0.263689,-0.119164,-0.013069,0.030275,-0.072135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
proteasome,0.065590,0.063595,-0.068297,-0.341265,-0.363577,-0.028856,0.121471,-0.163342,0.164599,-0.033824,...,0.092268,-0.019238,0.032392,0.024969,0.011309,0.061474,0.066890,-0.148575,-0.067886,0.027040
jawbone,-0.198237,-0.221238,0.034535,-0.125550,0.261507,-0.176464,0.154252,-0.001421,-0.012786,0.101746,...,0.173170,-0.018633,-0.150698,-0.278048,-0.083264,-0.138730,0.011749,0.292462,-0.002925,0.116922
expands,-0.204973,-0.104741,0.371615,-0.127140,-0.024127,-0.217444,-0.088636,0.147861,-0.060367,-0.200520,...,0.093103,0.040245,-0.231791,-0.101625,0.162721,-0.040362,0.097418,-0.007231,0.107961,-0.058140
hagia,-0.111121,0.126795,0.060413,-0.201407,-0.235132,0.056504,0.211543,0.019023,-0.160508,-0.190792,...,0.129399,0.013232,0.052301,-0.174858,0.192407,0.104436,-0.107214,-0.032931,-0.016617,0.252581


### CBOW subtitles Semantic Space

In [11]:
# when on pc
cbow_subtitles = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/cbow_subtitle.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/cbow_subtitle.txt', delimiter = " ", header = None, encoding='latin-1')

cbow_subtitles = cbow_subtitles.set_index(0)


In [12]:
cbow_subtitles

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fawn,-0.092878,0.096702,0.078285,0.141875,-0.048455,0.022562,-0.116627,-0.133138,-0.006373,0.065410,...,-0.141192,0.033917,-0.091994,-0.198885,0.086601,-0.084499,0.128204,0.130868,0.034637,-0.011805
gai,-0.073987,-0.135876,0.071925,0.050427,-0.117428,0.119181,0.000579,0.058539,-0.164071,-0.039671,...,0.011994,0.046088,0.127052,0.037256,-0.099584,-0.093139,-0.116162,0.049216,-0.186436,0.006694
impotents,-0.002924,-0.016872,0.020181,0.015595,-0.016781,-0.014786,-0.007104,-0.016849,-0.026617,0.000086,...,0.045714,-0.037038,0.003953,-0.012306,0.007597,0.014559,-0.013585,-0.008614,0.016856,0.032169
sonja,0.046202,0.124720,0.111161,0.241513,-0.163221,0.119522,-0.011830,0.045682,0.213233,0.141021,...,-0.117941,0.133043,-0.038438,-0.089926,0.061131,-0.143988,-0.058695,0.110329,-0.069613,-0.142646
dionysian,0.032223,-0.007911,-0.008691,-0.015038,-0.037716,0.028007,-0.026402,0.023947,-0.036913,0.033985,...,-0.022568,-0.009473,0.001342,-0.056279,-0.001985,-0.037703,-0.027222,0.015461,-0.016171,0.015194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jawbone,-0.155908,-0.022339,0.029578,0.061183,-0.016605,-0.026134,0.004525,0.002993,-0.120240,0.077789,...,0.005585,-0.052877,0.068886,-0.118713,-0.161840,0.027309,-0.031582,0.072750,-0.000247,-0.041733
bunches,-0.032415,0.140536,0.071025,0.022989,0.031107,0.122717,-0.029894,0.092296,-0.071899,-0.097607,...,-0.010892,-0.053674,-0.071419,-0.081352,-0.032544,-0.096401,-0.006610,0.108215,0.034687,0.185569
zinka,-0.006705,0.002355,0.012616,0.021504,-0.002360,0.017963,-0.008720,-0.000567,-0.014686,-0.000533,...,-0.003436,0.006517,-0.006647,-0.009388,0.003580,-0.006105,0.006303,0.012804,0.005335,0.014729
nurnies,-0.015835,0.007069,0.023102,-0.016696,-0.024264,0.002132,-0.006475,0.008502,-0.021481,0.011426,...,-0.006696,0.023507,-0.011131,-0.000455,-0.001402,-0.020491,-0.021894,0.017392,-0.012267,0.015760


### Banori Semantic Space

In [13]:
# when on pc
banori = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/banori.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/banori.txt', delimiter = " ", header = None, encoding='latin-1')

banori = banori.set_index(0)

In [14]:
banori

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,391,392,393,394,395,396,397,398,399,400
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,-0.060292,0.067630,-0.036891,0.066684,0.024045,0.099091,0.009682,-0.099609,0.081138,-0.153635,...,0.011333,-0.072486,-0.116943,0.071367,0.002056,0.041920,0.017046,0.038706,0.033797,0.016488
",",0.026625,0.073101,-0.027073,-0.019504,0.041730,0.038811,0.094878,-0.031282,0.093890,-0.105021,...,-0.002796,0.029393,-0.069047,0.083530,-0.033128,-0.035646,0.027183,0.003230,0.000811,0.022505
.,-0.005893,0.093791,0.015333,0.046226,0.032791,0.110069,0.055551,-0.080625,0.150372,-0.121523,...,0.076293,-0.048817,-0.203812,0.012218,0.039033,-0.043925,0.087199,0.089214,0.073069,0.075056
of,-0.050371,0.031452,0.040910,0.033255,-0.009195,0.061086,0.085859,-0.122968,0.068290,-0.108840,...,-0.021496,-0.112018,-0.116068,0.071437,-0.041942,0.061624,0.030890,0.013635,-0.104569,0.005263
and,0.005456,0.063237,-0.075793,-0.000819,0.003407,0.053554,0.070145,-0.088482,0.129797,-0.094823,...,0.100012,-0.090594,-0.090813,0.028959,0.024793,-0.040878,-0.024795,0.083654,-0.010910,0.027067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scientifica,-0.121726,0.094667,0.025527,0.001748,0.087804,-0.004830,-0.011430,-0.068062,0.016917,0.031084,...,0.082360,-0.025217,0.008511,-0.109005,-0.034504,0.027565,0.115459,0.000142,-0.022383,0.042615
schoolly,0.015603,0.100056,-0.042075,-0.032139,0.024201,0.051871,-0.069512,-0.144775,0.110104,-0.106074,...,0.014019,-0.040083,0.088833,-0.015218,-0.063183,0.021593,-0.101472,0.023395,0.022891,0.036009
schnapf,-0.057437,0.181676,-0.063820,0.017294,0.002916,0.019541,0.080525,-0.054740,-0.071306,-0.007485,...,0.079774,-0.068784,0.087408,-0.044859,-0.046738,-0.022576,0.023053,0.026027,0.010763,0.063920
scheme-,0.070691,0.026944,-0.084396,-0.063403,-0.030977,0.125771,0.084698,-0.060336,0.040576,-0.030460,...,0.027721,0.001727,0.070864,-0.004891,-0.019738,0.014031,-0.008417,0.064234,0.003573,0.004069


### GloVe 6B Semantic Space

In [15]:
# when on pc
glove_6B = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/glove_6B.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/glove_6B.txt', delimiter = " ", header = None, encoding='latin-1')

glove_6B = glove_6B.set_index(0)


In [16]:
glove_6B

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,0.046560,0.213180,-0.007436,-0.458540,-0.035639,0.236430,-0.288360,0.215210,-0.134860,-1.641300,...,-0.013064,-0.296860,-0.079913,0.195000,0.031549,0.285060,-0.087461,0.009061,-0.209890,0.053913
",",-0.255390,-0.257230,0.131690,-0.042688,0.218170,-0.022702,-0.178540,0.107560,0.058936,-1.385400,...,0.075968,-0.014359,-0.073794,0.221760,0.146520,0.566860,0.053307,-0.232900,-0.122260,0.354990
.,-0.125590,0.013630,0.103060,-0.101230,0.098128,0.136270,-0.107210,0.236970,0.328700,-1.678500,...,0.060148,-0.156190,-0.119490,0.234450,0.081367,0.246180,-0.152420,-0.342240,-0.022394,0.136840
of,-0.076947,-0.021211,0.212710,-0.722320,-0.139880,-0.122340,-0.175210,0.121370,-0.070866,-1.572100,...,-0.366730,-0.386030,0.302900,0.015747,0.340360,0.478410,0.068617,0.183510,-0.291830,-0.046533
to,-0.257560,-0.057132,-0.671900,-0.380820,-0.364210,-0.082155,-0.010955,-0.082047,0.460560,-1.847700,...,-0.012806,-0.597070,0.317340,-0.252670,0.543840,0.063007,-0.049795,-0.160430,0.046744,-0.070621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chanty,0.392700,-0.022505,0.304580,0.187990,0.141180,0.724030,-0.257810,-0.137290,-0.016521,0.595960,...,-0.182950,0.406630,-0.343630,-0.270400,-0.593680,0.016447,0.140740,0.463940,-0.369570,-0.287180
kronik,0.136790,-0.139090,-0.360890,0.079864,0.321490,0.263870,-0.109900,0.044420,0.083869,0.791330,...,0.036419,-0.036845,-0.348150,0.064732,-0.000577,-0.133790,0.428960,-0.023320,0.410210,-0.393080
rolonda,0.075713,-0.040502,0.183450,0.512300,-0.228560,0.839110,0.178780,-0.713010,0.326900,0.695350,...,-0.388530,0.545850,-0.035050,-0.184360,-0.197000,-0.350030,0.160650,0.218380,0.309670,0.437610
zsombor,0.814510,-0.362210,0.311860,0.813810,0.188520,-0.313600,0.827840,0.296560,-0.085519,0.475970,...,0.130880,0.106120,-0.408110,0.313380,-0.430250,0.069798,-0.207690,0.075486,0.284080,-0.175590


### TASA Semantic Space

In [17]:
# when on pc
TASA = pd.read_csv('C:/Users/jhec8/Documents/Northwestern_SROP/TASA.txt', delimiter = " ", header = None)

# when on mac
# ukwac_subtitles_sem_space = pd.read_csv('/Users/johnhenrycruz/Desktop/Northwestern_SROP/TASA.txt', delimiter = " ", header = None, encoding='latin-1')

TASA = TASA.set_index(0)


In [18]:
TASA

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
who,0.058924,-0.003555,0.053929,0.037707,-0.028096,-0.025736,0.008540,-0.041124,-0.000275,0.002969,...,0.002788,-0.005665,0.008141,0.035108,-0.005556,0.001101,0.022807,-0.001179,-0.004761,0.000947
were,0.076660,0.007832,0.102716,-0.046048,0.006808,-0.012557,0.012856,-0.043569,0.004773,0.027316,...,0.000811,0.003786,-0.011847,0.010002,0.027371,0.010266,-0.000217,0.005904,-0.005155,-0.017014
the,0.037283,-0.011585,0.003933,-0.006798,0.012415,0.000927,0.004378,-0.003837,0.006263,-0.002052,...,0.000049,0.000939,-0.001295,0.000544,0.000071,-0.000817,0.001123,-0.000980,-0.000354,0.000634
first,0.049274,-0.012220,0.023342,0.002829,0.008936,0.004214,0.041426,-0.014812,0.020341,0.018361,...,0.027615,0.016552,0.052426,-0.001099,-0.024695,-0.008158,-0.026864,-0.000865,-0.030933,-0.035347
americans,0.017895,-0.024743,0.068235,-0.003476,-0.022697,-0.002701,-0.013814,0.041591,0.008424,0.019061,...,0.009282,0.029840,-0.002187,-0.029926,-0.036506,0.008290,0.003097,0.028012,-0.002667,0.043664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
phosphagens,0.000049,-0.000144,-0.000252,-0.000134,0.000194,-0.000358,-0.000433,0.000109,0.000242,0.000393,...,-0.000659,-0.000874,-0.000879,-0.000478,-0.001883,0.000523,-0.001007,-0.000753,0.000633,-0.000265
phosphated,0.000025,-0.000072,-0.000126,-0.000067,0.000097,-0.000179,-0.000216,0.000054,0.000121,0.000197,...,-0.000330,-0.000437,-0.000439,-0.000239,-0.000941,0.000262,-0.000504,-0.000376,0.000317,-0.000132
adp,0.000025,-0.000072,-0.000126,-0.000067,0.000097,-0.000179,-0.000216,0.000054,0.000121,0.000197,...,-0.000330,-0.000437,-0.000439,-0.000239,-0.000941,0.000262,-0.000504,-0.000376,0.000317,-0.000132
plummage,0.000023,-0.000049,-0.000023,-0.000043,-0.000016,-0.000036,0.000008,-0.000149,-0.000044,-0.000012,...,-0.000057,0.000473,-0.000311,-0.000031,0.000284,0.000240,-0.000090,-0.000660,-0.000446,0.000211


## Novelty Algo 3
### sem_space + element wise multiplication + cosine distance
### Greater Cos Distance, Greater Novelty
### Most similar to SemDis

In [19]:
# method to calculate cosine similarity
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

In [20]:
# method to get the element wise multiplied vector
# multiply vectors in phrase
def get_ew_multiplied_vector(phrase_list, sem_space):
    vectors_list = []
    # add vectors to list
    # change to numpy array
    for term in phrase_list:
        vectors_list.append(np.array(sem_space.loc[term].values.tolist()))
    
    # get element wise multiplied vector
    element_wise_multiplied_vector = np.ones(len(sem_space.columns))

    for vector in vectors_list:
        element_wise_multiplied_vector = element_wise_multiplied_vector * vector

    return element_wise_multiplied_vector

In [21]:
# get cosine sim from prompt and ewm
def get_cosine_sim_ewm(prompt, response, sem_space):
    prompt_vector = np.array(sem_space.loc[prompt].values.tolist())
    ewm_vector = get_ew_multiplied_vector(response, sem_space)

    return (1 - get_cosine_similarity(prompt_vector, ewm_vector))

In [22]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_ewm_cosinedist(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    # list to store cosine sims
    cosine_sim_list = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_cosine_sim_ewm(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['ewm_vector_cosine_dis'] = cosine_sim_list
    
    # new column with novelty rating
    return novel_rating_df

In [23]:
# brick_novelty_ewm_cosinedist = get_novelty_ewm_cosinedist(data_official_shoe, 'shoe', stopwords_edited, TASA, True, False)
# brick_novelty_ewm_cosinedist

## Novelty Algo 4
### sem_space + local minina + cosine distance

In [24]:
# get word in phrase that has the least distance from the prompt
def get_minima_vector_cos_distance(prompt, phrase_list, sem_space):
    distances_list = []
    # get prompt vector
    prompt_vector = np.array(sem_space.loc[prompt].values.tolist())
    
    # create list of cosine distances
    for term in phrase_list:
        distances_list.append((1 - get_cosine_similarity(prompt_vector, np.array(sem_space.loc[term].values.tolist()))))
        
    # return the max cosine distance
    return max(distances_list, default=0)

In [25]:
# get df with results of the cosine distance from prompt using the minima vector in the response
def get_novelty_minimavec_cosinedist(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    # list to store cosine sims
    cosine_sim_list = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_minima_vector_cos_distance(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['minima_vector_cosine_dis'] = cosine_sim_list
    
    # new column with novelty rating
    return novel_rating_df

In [26]:
# brick_novelty_ukwac_minimavec_cosinedist = get_novelty_minimavec_cosinedist(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# brick_novelty_ukwac_minimavec_cosinedist

## Novelty Algo 5
### sem_space + element wise multiplication + cosine distance + clustering
### average responses cosine distance in the same cluster
### idea is that phrases with same alternate task will group
### variation in phrase in the same cluster will  be averaged out

In [27]:
# clusters the responses
# get a df of the clusters and their respective phrases
def get_counts_vector(num_clusters, responses):
    # initialize CountVectorizer object
    count_vectorizer = CountVectorizer()
    # vectorize the phrases
    word_count = count_vectorizer.fit_transform(responses)
    
    # elbow method to visualize and find out how many clusters to use
#     visualizer = KElbowVisualizer(KMeans(), k=(10,35), timings=False)
#     visualizer.fit(word_count.toarray())       
#     visualizer.show()

    # nltk kmeans cosine distance implementation
    number_of_clusters = num_clusters
    kmeans = KMeansClusterer(number_of_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25, avoid_empty_clusters=True)
    assigned_clusters = kmeans.cluster(word_count.toarray(), assign_clusters=True)

    # scikit-learn euclidean distance implementation
#     kmeans = KMeans(n_clusters = num_clusters).fit(word_count)
        
    # cluster results scikit-learn
    results = pd.DataFrame()
    results['text'] = responses
#     results['category'] = kmeans.labels_
    results['category'] = assigned_clusters
    
    # create dictionary to organize the clusters with their respective phrases
    results_dict = {k: g["text"].tolist() for k,g in results.groupby("category")}
    
    # df of the clusters and the 
    clusters_df = pd.DataFrame(list(results_dict.items()),columns = ['category','responses']) 
    
    return clusters_df

In [28]:
# averages the distance of the phrases in each cluster
# gives each phrase in cluster the average distance 
def get_clustered_novelty_score(novel_rating_df, column):
    # get cluster df
    clusters_df = get_counts_vector(22, novel_rating_df['response_processed_phrase'].tolist())
    # get cleaned phrases and their current novelty rating
    novelty_scores = dict(zip(novel_rating_df.response_processed_phrase, novel_rating_df[column]))

    # create dictionary out of cluster df
    clusters = dict(zip(clusters_df.category, clusters_df.responses))
        
    # initialize empty dictionary to store the score for a category
    clusters_scores = dict.fromkeys(clusters)
    
    # get the average cosine distance for a cluster
    for key in clusters:
        score = 0
        for phrase in clusters[key]:
            score = score + novelty_scores[phrase]
        score = score/len(clusters[key])
        clusters_scores[key] = score
        
    # create dictionary to store a phrase and its new novelty score 
    # new score is the average of the responses in one cluster
    phrase_scores_dict = {}
    for key in clusters:
        for phrase in clusters[key]:
            phrase_scores_dict[phrase] = clusters_scores[key]
            
    # make a list that matches the one in the current dataframe
    # return list to be added to dataframe
    df_phrases_scores_list = [] 
    for phrase in novel_rating_df['response_processed_phrase'].tolist():
        df_phrases_scores_list.append(phrase_scores_dict[phrase])
    
    # uncomment to show clusters df
#     display(clusters_df)
            
    return list(df_phrases_scores_list)

In [29]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_ewm_cosinedist_cluster(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    novel_rating_df['response_processed_phrase'] = [' '.join(x) for x in cleaned_responses]
    # list to store cosine sims
    cosine_sim_list = []
    
    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_cosine_sim_ewm(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['ewm_vector_cosine_dis'] = cosine_sim_list
    
    novel_rating_df['ewm_vector_cosine_dis_clus_avg'] = get_clustered_novelty_score(novel_rating_df, 'ewm_vector_cosine_dis')
    
    # new column with novelty rating
    return novel_rating_df

In [30]:
# brick_novelty_ewm_cosinedist_cluster = get_novelty_ewm_cosinedist_cluster(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# brick_novelty_ewm_cosinedist_cluster

## Novelty Algo 6
### sem_space + local minima + cosine distance + clustering
### average responses cosine distance in the same cluster
### idea is that phrases with same alternate task will group
### variation in phrase in the same cluster will  be averaged out
### differs from algo 5, does local minima not ewm

In [31]:
# get df with results of the cosine distance from prompt using the elementwise multiplied vectors in the response
def get_novelty_minimavec_cosinedist_cluster(df, prompt, stopwords_list, sem_space, remove_sw, join_list):
    # clean the responses
    novel_rating_df = get_cleaned_responses(df, stopwords_list, remove_sw, join_list)
    novel_rating_df = novel_rating_df[novel_rating_df.astype(str)['response_processed'] != '[]']
    cleaned_responses = novel_rating_df['response_processed'].tolist()
    novel_rating_df['response_processed_phrase'] = [' '.join(x) for x in cleaned_responses]
    # list to store cosine sims
    cosine_sim_list = []

    # implement algo
    # pass in clean responses
    for response in cleaned_responses:
        # add novelty rating to list 
        cosine_sim_list.append(get_minima_vector_cos_distance(prompt, response, sem_space))

    # add novelty rating list to dataframe
    novel_rating_df['minima_vector_cosine_dis'] = cosine_sim_list
        
    novel_rating_df['minima_vector_cosine_dis_clus_avg'] = get_clustered_novelty_score(novel_rating_df, 'minima_vector_cosine_dis')
    
    # new column with novelty rating
    return novel_rating_df

In [32]:
# brick_noveltyc_minimavec_cosinedist_cluster = get_novelty_minimavec_cosinedist_cluster(data_official_brick, 'brick', stopwords_edited, ukwac_subtitles, True, False)
# brick_novelty_minimavec_cosinedist_cluster

In [33]:
prompts_list = ['box', 'brick', 'chair', 'cup', 'key', 'pencil', 'rope', 'shoe']
data_list = [data_official_box, data_official_brick, data_official_chair, data_official_cup, data_official_key, data_official_pencil, data_official_rope, data_official_shoe]
sem_space_list = [ukwac_subtitles, cbow_subtitles, banori, TASA, glove_6B]
sem_space_list_str = ['ukwac_subtitles', 'cbow_subtitles', 'banori', 'TASA', 'glove_6B']
sem_space_col_list = [8,9,10,11,12]
underscore = "_"

## Correlation Test with SemDis and Human Ratings

In [34]:
# create pvalues matrix
def calculate_pvalues(df):
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 4)
    return pvalues

### Comparing the Performance of the algos to SemDis

In [35]:
# print the correlations with SemDis
def print_semdis_corrs():
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        for i in range(len(prompts_list)):
            print(prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [sem_space_col_list[y],14,15,16,17]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[1:, 0])
            pval_df = pd.DataFrame(pval_df.iloc[1:, 0])
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df.columns = [sem_space_list_str[y], 'pval']
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust']
            corr_pval_df.index.rename('metrics', inplace=True)
            display(corr_pval_df)

In [36]:
print_semdis_corrs()

ukwac_subtitles
box


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.628624,0.0
ewm_clust,0.168943,0.0004
minvec,0.487944,0.0
minvec_clust,0.190339,0.0001


brick


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.602996,0.0
ewm_clust,0.44692,0.0
minvec,0.431333,0.0
minvec_clust,0.249562,0.0


chair


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.827713,0.0
ewm_clust,0.78094,0.0
minvec,0.765201,0.0
minvec_clust,0.579824,0.0


cup


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.835245,0.0
ewm_clust,0.729305,0.0
minvec,0.688375,0.0
minvec_clust,0.589373,0.0


key


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.654186,0.0
ewm_clust,0.23959,0.0
minvec,0.285606,0.0
minvec_clust,0.088993,0.0828


pencil


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.796148,0.0
ewm_clust,0.583356,0.0
minvec,0.633005,0.0
minvec_clust,0.53446,0.0


rope


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.514802,0.0
ewm_clust,0.131883,0.0039
minvec,0.275024,0.0
minvec_clust,0.108317,0.0178


shoe


Unnamed: 0_level_0,ukwac_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.648501,0.0
ewm_clust,0.511536,0.0
minvec,0.335716,0.0
minvec_clust,0.215719,0.0


cbow_subtitles
box


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.61773,0.0
ewm_clust,0.46662,0.0
minvec,0.502072,0.0
minvec_clust,0.0901,0.0587


brick


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.760466,0.0
ewm_clust,0.374293,0.0
minvec,0.583408,0.0
minvec_clust,0.343581,0.0


chair


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.892649,0.0
ewm_clust,0.84912,0.0
minvec,0.861011,0.0
minvec_clust,0.835879,0.0


cup


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.818162,0.0
ewm_clust,0.59897,0.0
minvec,0.6384,0.0
minvec_clust,0.573205,0.0


key


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.740503,0.0
ewm_clust,0.230954,0.0
minvec,0.320927,0.0
minvec_clust,0.124964,0.0147


pencil


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.72198,0.0
ewm_clust,0.613975,0.0
minvec,0.633888,0.0
minvec_clust,0.541137,0.0


rope


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.562246,0.0
ewm_clust,0.286867,0.0
minvec,0.41784,0.0
minvec_clust,0.083346,0.0687


shoe


Unnamed: 0_level_0,cbow_subtitles,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.721026,0.0
ewm_clust,0.326935,0.0
minvec,0.455136,0.0
minvec_clust,0.228894,0.0


banori
box


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.698511,0.0
ewm_clust,0.225379,0.0
minvec,0.504538,0.0
minvec_clust,0.101329,0.0334


brick


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.800546,0.0
ewm_clust,0.499409,0.0
minvec,0.673372,0.0
minvec_clust,0.427257,0.0


chair


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.876424,0.0
ewm_clust,0.795493,0.0
minvec,0.832358,0.0
minvec_clust,0.782103,0.0


cup


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.790208,0.0
ewm_clust,0.370099,0.0
minvec,0.599525,0.0
minvec_clust,0.461689,0.0


key


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.53206,0.0
ewm_clust,0.215307,0.0
minvec,0.088354,0.085
minvec_clust,-0.039485,0.4422


pencil


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.750287,0.0
ewm_clust,0.485524,0.0
minvec,0.669895,0.0
minvec_clust,0.319698,0.0


rope


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.681241,0.0
ewm_clust,0.447275,0.0
minvec,0.605779,0.0
minvec_clust,0.263578,0.0


shoe


Unnamed: 0_level_0,banori,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.800472,0.0
ewm_clust,0.47207,0.0
minvec,0.645295,0.0
minvec_clust,0.347641,0.0


TASA
box


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.610802,0.0
ewm_clust,0.210058,0.0
minvec,0.541444,0.0
minvec_clust,0.306478,0.0


brick


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.877071,0.0
ewm_clust,0.676301,0.0
minvec,0.782293,0.0
minvec_clust,0.348942,0.0


chair


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.928228,0.0
ewm_clust,0.879397,0.0
minvec,0.892954,0.0
minvec_clust,0.863637,0.0


cup


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.877639,0.0
ewm_clust,0.545081,0.0
minvec,0.725667,0.0
minvec_clust,0.558465,0.0


key


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.728262,0.0
ewm_clust,0.214788,0.0
minvec,0.376685,0.0
minvec_clust,0.15938,0.0018


pencil


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.874151,0.0
ewm_clust,0.533531,0.0
minvec,0.817355,0.0
minvec_clust,0.736354,0.0


rope


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.657921,0.0
ewm_clust,0.368795,0.0
minvec,0.490656,0.0
minvec_clust,0.337405,0.0


shoe


Unnamed: 0_level_0,TASA,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.785925,0.0
ewm_clust,0.510398,0.0
minvec,0.566796,0.0
minvec_clust,0.295228,0.0


glove_6B
box


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.355974,0.0
ewm_clust,0.050266,0.2922
minvec,0.199429,0.0
minvec_clust,-0.001843,0.9692


brick


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.664411,0.0
ewm_clust,0.523284,0.0
minvec,0.435784,0.0
minvec_clust,0.022948,0.6502


chair


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.725254,0.0
ewm_clust,0.552838,0.0
minvec,0.662959,0.0
minvec_clust,0.680075,0.0


cup


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.559136,0.0
ewm_clust,0.331235,0.0
minvec,0.260218,0.0
minvec_clust,0.16654,0.0004


key


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.535779,0.0
ewm_clust,0.243385,0.0
minvec,-0.21372,0.0
minvec_clust,-0.220515,0.0


pencil


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.723635,0.0
ewm_clust,0.316304,0.0
minvec,0.561862,0.0
minvec_clust,0.401523,0.0


rope


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.601145,0.0
ewm_clust,0.38693,0.0
minvec,0.411058,0.0
minvec_clust,0.219971,0.0


shoe


Unnamed: 0_level_0,glove_6B,pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
ewm,0.694004,0.0
ewm_clust,0.495413,0.0
minvec,0.551488,0.0
minvec_clust,0.373746,0.0


In [37]:
def write_semdis_corrs():
    writer = pd.ExcelWriter('novelty_algos_semdis_corrs_results_070921.xlsx', engine='xlsxwriter')
    workbook = writer.book

    for y in range(len(sem_space_list)):
        row = 0
        worksheet = workbook.add_worksheet(sem_space_list_str[y])
        writer.sheets[sem_space_list_str[y]] = worksheet
        for i in range(len(prompts_list)):
            worksheet.write_string(row, 0, prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [sem_space_col_list[y],14,15,16,17]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[1:, 0])
            pval_df = pd.DataFrame(pval_df.iloc[1:, 0])
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df.columns = [sem_space_list_str[y], 'pval']
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust']
            corr_pval_df.index.rename('metrics', inplace=True)
            corr_pval_df.to_excel(writer, sheet_name = sem_space_list_str[y], startrow=row + 1, startcol=0, index = True)
            row = row + len(corr_pval_df.index) + 3
            
    writer.save()

In [38]:
# write_semdis_corrs()

### Comparing the performance of algos with Human Raters

In [39]:
# print the correlations with SemDis
def print_human_corrs():
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        for i in range(len(prompts_list)):
            print(prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [18,19,20,14,15,16,17]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust']
            corr_pval_df.index.rename('metrics', inplace=True)
            display(corr_pval_df)

In [40]:
print_human_corrs()

ukwac_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.198285,0.0,0.223406,0.0,0.227607,0.0
ewm_clust,0.222457,0.0,0.141674,0.0029,0.195406,0.0
minvec,0.246645,0.0,0.275977,0.0,0.282065,0.0
minvec_clust,0.128836,0.0067,0.173949,0.0002,0.163728,0.0006


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.222237,0.0,0.214917,0.0,0.233643,0.0
ewm_clust,0.296875,0.0,0.349461,0.0,0.345646,0.0
minvec,0.48624,0.0,0.530499,0.0,0.543606,0.0
minvec_clust,0.598277,0.0,0.650887,0.0,0.667867,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.499581,0.0,0.627807,0.0,0.593724,0.0
ewm_clust,0.592509,0.0,0.745064,0.0,0.7044,0.0
minvec,0.58933,0.0,0.69519,0.0,0.677895,0.0
minvec_clust,0.480773,0.0,0.564375,0.0,0.551658,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.400958,0.0,0.452852,0.0,0.449866,0.0
ewm_clust,0.486804,0.0,0.507135,0.0,0.523667,0.0
minvec,0.624547,0.0,0.609444,0.0,0.650109,0.0
minvec_clust,0.675944,0.0,0.651335,0.0,0.699249,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.07221,0.1595,0.121037,0.0181,0.102545,0.0455
ewm_clust,0.113632,0.0266,0.147331,0.004,0.137911,0.007
minvec,0.560721,0.0,0.587315,0.0,0.604565,0.0
minvec_clust,0.635747,0.0,0.644149,0.0,0.67363,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.514071,0.0,0.538147,0.0,0.552693,0.0
ewm_clust,0.595945,0.0,0.671769,0.0,0.66502,0.0
minvec,0.670702,0.0,0.727372,0.0,0.733902,0.0
minvec_clust,0.62239,0.0,0.710728,0.0,0.699171,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.268577,0.0,0.282599,0.0,0.305022,0.0
ewm_clust,0.463733,0.0,0.334945,0.0,0.449579,0.0
minvec,0.545713,0.0,0.443946,0.0,0.55414,0.0
minvec_clust,0.608196,0.0,0.433446,0.0,0.58669,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.090874,0.0681,0.107671,0.0305,0.103969,0.0367
ewm_clust,0.097003,0.0514,0.073659,0.1394,0.089027,0.0739
minvec,0.375478,0.0,0.316616,0.0,0.36136,0.0
minvec_clust,0.407179,0.0,0.370607,0.0,0.406369,0.0


cbow_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.286212,0.0,0.299711,0.0,0.316014,0.0
ewm_clust,0.289682,0.0,0.372818,0.0,0.358066,0.0
minvec,0.376063,0.0,0.401132,0.0,0.419255,0.0
minvec_clust,0.27472,0.0,0.366523,0.0,0.346703,0.0


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.271906,0.0,0.212283,0.0,0.258617,0.0
ewm_clust,0.415441,0.0,0.385683,0.0,0.42812,0.0
minvec,0.428251,0.0,0.431931,0.0,0.459794,0.0
minvec_clust,0.499762,0.0,0.492273,0.0,0.530236,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.549628,0.0,0.667627,0.0,0.641773,0.0
ewm_clust,0.636505,0.0,0.766073,0.0,0.739706,0.0
minvec,0.613614,0.0,0.725625,0.0,0.706715,0.0
minvec_clust,0.647242,0.0,0.748091,0.0,0.736875,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.401794,0.0,0.453973,0.0,0.450897,0.0
ewm_clust,0.512556,0.0,0.553522,0.0,0.561689,0.0
minvec,0.564149,0.0,0.590369,0.0,0.608271,0.0
minvec_clust,0.558914,0.0,0.584886,0.0,0.602625,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.072757,0.1564,0.097565,0.0571,0.09006,0.0791
ewm_clust,-0.05688,0.2681,0.01832,0.7215,-0.018968,0.7121
minvec,0.545239,0.0,0.591456,0.0,0.598942,0.0
minvec_clust,0.575936,0.0,0.625328,0.0,0.632974,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.555603,0.0,0.570531,0.0,0.591719,0.0
ewm_clust,0.570867,0.0,0.651705,0.0,0.641196,0.0
minvec,0.665357,0.0,0.676656,0.0,0.705271,0.0
minvec_clust,0.60342,0.0,0.711967,0.0,0.689476,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.126511,0.0056,0.22713,0.0,0.191043,0.0
ewm_clust,0.053011,0.2474,0.188116,0.0,0.126876,0.0055
minvec,0.278197,0.0,0.294575,0.0,0.316881,0.0
minvec_clust,0.517223,0.0,0.299959,0.0,0.464346,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.055691,0.2641,0.063955,0.1996,0.062637,0.209
ewm_clust,-0.117217,0.0184,-0.145686,0.0033,-0.137727,0.0056
minvec,0.204235,0.0,0.139437,0.005,0.179119,0.0003
minvec_clust,0.210153,0.0,0.125665,0.0115,0.174835,0.0004


banori
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.165181,0.0005,0.146938,0.002,0.168059,0.0004
ewm_clust,0.292189,0.0,0.223089,0.0,0.277017,0.0
minvec,0.180866,0.0001,0.193644,0.0,0.202036,0.0
minvec_clust,0.014868,0.7555,0.047069,0.324,0.033746,0.4797


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.263175,0.0,0.191584,0.0001,0.242848,0.0
ewm_clust,0.397018,0.0,0.346514,0.0,0.397269,0.0
minvec,0.475338,0.0,0.421859,0.0,0.479397,0.0
minvec_clust,0.463553,0.0,0.427825,0.0,0.476343,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.550633,0.0,0.681569,0.0,0.649248,0.0
ewm_clust,0.645019,0.0,0.783423,0.0,0.753119,0.0
minvec,0.648311,0.0,0.755139,0.0,0.740971,0.0
minvec_clust,0.664474,0.0,0.768933,0.0,0.756951,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.206793,0.0,0.247999,0.0,0.239637,0.0
ewm_clust,0.183425,0.0001,0.270367,0.0,0.239147,0.0
minvec,0.344838,0.0,0.361712,0.0,0.372255,0.0
minvec_clust,0.415584,0.0,0.463463,0.0,0.463159,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,-0.006919,0.8929,0.038218,0.457,0.017263,0.737
ewm_clust,-0.097763,0.0566,-0.137259,0.0073,-0.124363,0.0151
minvec,0.48193,0.0,0.481615,0.0,0.507012,0.0
minvec_clust,0.642312,0.0,0.657875,0.0,0.684433,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.533945,0.0,0.545151,0.0,0.56706,0.0
ewm_clust,0.550911,0.0,0.597652,0.0,0.602921,0.0
minvec,0.598492,0.0,0.621352,0.0,0.640834,0.0
minvec_clust,0.535174,0.0,0.589144,0.0,0.590042,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.186417,0.0,0.280809,0.0,0.254364,0.0
ewm_clust,0.149712,0.001,0.318426,0.0,0.251088,0.0
minvec,0.31587,0.0,0.35596,0.0,0.370622,0.0
minvec_clust,0.13901,0.0023,0.288277,0.0,0.229418,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.292273,0.0,0.205452,0.0,0.259473,0.0
ewm_clust,0.376891,0.0,0.264216,0.0,0.334213,0.0
minvec,0.258946,0.0,0.180474,0.0003,0.229061,0.0
minvec_clust,0.216732,0.0,0.161216,0.0011,0.197125,0.0001


TASA
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.078913,0.0979,0.119032,0.0124,0.107153,0.0244
ewm_clust,0.055622,0.2438,0.124158,0.0091,0.097675,0.0403
minvec,0.062562,0.1897,0.136903,0.004,0.108352,0.0229
minvec_clust,0.004282,0.9285,0.121617,0.0106,0.069169,0.147


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.398248,0.0,0.361618,0.0,0.406044,0.0
ewm_clust,0.343601,0.0,0.332048,0.0,0.361109,0.0
minvec,0.540294,0.0,0.507181,0.0,0.559788,0.0
minvec_clust,0.348041,0.0,0.327644,0.0,0.3611,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.551792,0.0,0.681647,0.0,0.649942,0.0
ewm_clust,0.624937,0.0,0.758934,0.0,0.729623,0.0
minvec,0.578003,0.0,0.721794,0.0,0.684664,0.0
minvec_clust,0.620468,0.0,0.755141,0.0,0.725215,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.400337,0.0,0.423777,0.0,0.434198,0.0
ewm_clust,0.376523,0.0,0.397623,0.0,0.407871,0.0
minvec,0.513132,0.0,0.53201,0.0,0.550642,0.0
minvec_clust,0.627786,0.0,0.638824,0.0,0.667315,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.124679,0.0149,0.135386,0.0081,0.137035,0.0074
ewm_clust,0.083577,0.1033,0.112306,0.0284,0.103579,0.0433
minvec,0.470585,0.0,0.539324,0.0,0.532623,0.0
minvec_clust,0.554499,0.0,0.604499,0.0,0.610745,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.440331,0.0,0.547566,0.0,0.517343,0.0
ewm_clust,0.390444,0.0,0.474435,0.0,0.453104,0.0
minvec,0.483851,0.0,0.587052,0.0,0.561053,0.0
minvec_clust,0.44702,0.0,0.580968,0.0,0.537925,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.396963,0.0,0.395055,0.0,0.439427,0.0
ewm_clust,0.476425,0.0,0.369054,0.0,0.474449,0.0
minvec,0.451105,0.0,0.420399,0.0,0.484984,0.0
minvec_clust,0.551778,0.0,0.419773,0.0,0.545635,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.306015,0.0,0.283635,0.0,0.308122,0.0
ewm_clust,0.310641,0.0,0.241466,0.0,0.288068,0.0
minvec,0.479176,0.0,0.477488,0.0,0.500218,0.0
minvec_clust,0.477978,0.0,0.471968,0.0,0.496666,0.0


glove_6B
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.164957,0.0005,0.184313,0.0001,0.188502,0.0001
ewm_clust,0.101589,0.0329,0.047399,0.3207,0.079719,0.0945
minvec,0.367363,0.0,0.370438,0.0,0.397775,0.0
minvec_clust,0.475336,0.0,0.455076,0.0,0.501352,0.0


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.147614,0.0034,0.119275,0.018,0.142567,0.0046
ewm_clust,0.176431,0.0004,0.172237,0.0006,0.186356,0.0002
minvec,0.519981,0.0,0.502631,0.0,0.546548,0.0
minvec_clust,0.45241,0.0,0.441343,0.0,0.477691,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.47085,0.0,0.575954,0.0,0.551778,0.0
ewm_clust,0.451027,0.0,0.563904,0.0,0.53459,0.0
minvec,0.652504,0.0,0.738036,0.0,0.734872,0.0
minvec_clust,0.634289,0.0,0.748395,0.0,0.729695,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.157397,0.0009,0.187723,0.0001,0.181848,0.0001
ewm_clust,0.020427,0.6684,0.080752,0.0899,0.053354,0.263
minvec,0.438586,0.0,0.438147,0.0,0.461901,0.0
minvec_clust,0.637822,0.0,0.625918,0.0,0.665785,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,-0.182525,0.0003,-0.124344,0.0152,-0.160452,0.0017
ewm_clust,-0.283151,0.0,-0.299433,0.0,-0.306842,0.0
minvec,0.555661,0.0,0.512357,0.0,0.56123,0.0
minvec_clust,0.606108,0.0,0.530259,0.0,0.596623,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.428244,0.0,0.433819,0.0,0.453072,0.0
ewm_clust,0.520993,0.0,0.506664,0.0,0.540491,0.0
minvec,0.523,0.0,0.542419,0.0,0.559718,0.0
minvec_clust,0.512729,0.0,0.586955,0.0,0.576718,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.258818,0.0,0.294487,0.0,0.305101,0.0
ewm_clust,0.328442,0.0,0.387055,0.0,0.393901,0.0
minvec,0.457058,0.0,0.408301,0.0,0.482494,0.0
minvec_clust,0.449109,0.0,0.270867,0.0,0.40844,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ewm,0.193659,0.0001,0.181691,0.0002,0.19616,0.0001
ewm_clust,0.14265,0.0041,0.107872,0.0302,0.130682,0.0085
minvec,0.376204,0.0,0.346269,0.0,0.377506,0.0
minvec_clust,0.36606,0.0,0.346426,0.0,0.372377,0.0


In [41]:
def write_human_corrs():
    writer = pd.ExcelWriter('novelty_algos_humans_corrs_results_070921.xlsx', engine='xlsxwriter')
    workbook = writer.book

    for y in range(len(sem_space_list)):
        row = 0
        worksheet = workbook.add_worksheet(sem_space_list_str[y])
        writer.sheets[sem_space_list_str[y]] = worksheet
        for i in range(len(prompts_list)):
            worksheet.write_string(row, 0, prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [18,19,20,14,15,16,17]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index = ['ewm','ewm_clust','minvec','minvec_clust']
            corr_pval_df.index.rename('metrics', inplace=True)
            corr_pval_df.to_excel(writer, sheet_name = sem_space_list_str[y], startrow=row + 1, startcol=0, index = True)
            row = row + len(corr_pval_df.index) + 3
            
    writer.save()

In [42]:
# write_human_corrs()

### Comparing the performance of algos and SemDis with Human Raters

In [43]:
# print the correlations with SemDis
def print_compiled_corrs():
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        for i in range(len(prompts_list)):
            print(prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [18,19,20,8,9,10,11,12,14,15,16,17]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index.rename('metrics', inplace=True)
            display(corr_pval_df)

In [44]:
print_compiled_corrs()

ukwac_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.198285,0.0,0.223406,0.0,0.227607,0.0
ewm_vector_cosine_dis_clus_avg,0.222457,0.0,0.141674,0.0029,0.195406,0.0
minima_vector_cosine_dis,0.246645,0.0,0.275977,0.0,0.282065,0.0
minima_vector_cosine_dis_clus_avg,0.128836,0.0067,0.173949,0.0002,0.163728,0.0006


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.222237,0.0,0.214917,0.0,0.233643,0.0
ewm_vector_cosine_dis_clus_avg,0.296875,0.0,0.349461,0.0,0.345646,0.0
minima_vector_cosine_dis,0.48624,0.0,0.530499,0.0,0.543606,0.0
minima_vector_cosine_dis_clus_avg,0.598277,0.0,0.650887,0.0,0.667867,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.499581,0.0,0.627807,0.0,0.593724,0.0
ewm_vector_cosine_dis_clus_avg,0.592509,0.0,0.745064,0.0,0.7044,0.0
minima_vector_cosine_dis,0.58933,0.0,0.69519,0.0,0.677895,0.0
minima_vector_cosine_dis_clus_avg,0.480773,0.0,0.564375,0.0,0.551658,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.400958,0.0,0.452852,0.0,0.449866,0.0
ewm_vector_cosine_dis_clus_avg,0.486804,0.0,0.507135,0.0,0.523667,0.0
minima_vector_cosine_dis,0.624547,0.0,0.609444,0.0,0.650109,0.0
minima_vector_cosine_dis_clus_avg,0.675944,0.0,0.651335,0.0,0.699249,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,0.07221,0.1595,0.121037,0.0181,0.102545,0.0455
ewm_vector_cosine_dis_clus_avg,0.113632,0.0266,0.147331,0.004,0.137911,0.007
minima_vector_cosine_dis,0.560721,0.0,0.587315,0.0,0.604565,0.0
minima_vector_cosine_dis_clus_avg,0.635747,0.0,0.644149,0.0,0.67363,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.514071,0.0,0.538147,0.0,0.552693,0.0
ewm_vector_cosine_dis_clus_avg,0.595945,0.0,0.671769,0.0,0.66502,0.0
minima_vector_cosine_dis,0.670702,0.0,0.727372,0.0,0.733902,0.0
minima_vector_cosine_dis_clus_avg,0.62239,0.0,0.710728,0.0,0.699171,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.268577,0.0,0.282599,0.0,0.305022,0.0
ewm_vector_cosine_dis_clus_avg,0.463733,0.0,0.334945,0.0,0.449579,0.0
minima_vector_cosine_dis,0.545713,0.0,0.443946,0.0,0.55414,0.0
minima_vector_cosine_dis_clus_avg,0.608196,0.0,0.433446,0.0,0.58669,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.090874,0.0681,0.107671,0.0305,0.103969,0.0367
ewm_vector_cosine_dis_clus_avg,0.097003,0.0514,0.073659,0.1394,0.089027,0.0739
minima_vector_cosine_dis,0.375478,0.0,0.316616,0.0,0.36136,0.0
minima_vector_cosine_dis_clus_avg,0.407179,0.0,0.370607,0.0,0.406369,0.0


cbow_subtitles
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.286212,0.0,0.299711,0.0,0.316014,0.0
ewm_vector_cosine_dis_clus_avg,0.289682,0.0,0.372818,0.0,0.358066,0.0
minima_vector_cosine_dis,0.376063,0.0,0.401132,0.0,0.419255,0.0
minima_vector_cosine_dis_clus_avg,0.27472,0.0,0.366523,0.0,0.346703,0.0


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.271906,0.0,0.212283,0.0,0.258617,0.0
ewm_vector_cosine_dis_clus_avg,0.415441,0.0,0.385683,0.0,0.42812,0.0
minima_vector_cosine_dis,0.428251,0.0,0.431931,0.0,0.459794,0.0
minima_vector_cosine_dis_clus_avg,0.499762,0.0,0.492273,0.0,0.530236,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.549628,0.0,0.667627,0.0,0.641773,0.0
ewm_vector_cosine_dis_clus_avg,0.636505,0.0,0.766073,0.0,0.739706,0.0
minima_vector_cosine_dis,0.613614,0.0,0.725625,0.0,0.706715,0.0
minima_vector_cosine_dis_clus_avg,0.647242,0.0,0.748091,0.0,0.736875,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.401794,0.0,0.453973,0.0,0.450897,0.0
ewm_vector_cosine_dis_clus_avg,0.512556,0.0,0.553522,0.0,0.561689,0.0
minima_vector_cosine_dis,0.564149,0.0,0.590369,0.0,0.608271,0.0
minima_vector_cosine_dis_clus_avg,0.558914,0.0,0.584886,0.0,0.602625,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,0.072757,0.1564,0.097565,0.0571,0.09006,0.0791
ewm_vector_cosine_dis_clus_avg,-0.05688,0.2681,0.01832,0.7215,-0.018968,0.7121
minima_vector_cosine_dis,0.545239,0.0,0.591456,0.0,0.598942,0.0
minima_vector_cosine_dis_clus_avg,0.575936,0.0,0.625328,0.0,0.632974,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.555603,0.0,0.570531,0.0,0.591719,0.0
ewm_vector_cosine_dis_clus_avg,0.570867,0.0,0.651705,0.0,0.641196,0.0
minima_vector_cosine_dis,0.665357,0.0,0.676656,0.0,0.705271,0.0
minima_vector_cosine_dis_clus_avg,0.60342,0.0,0.711967,0.0,0.689476,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.126511,0.0056,0.22713,0.0,0.191043,0.0
ewm_vector_cosine_dis_clus_avg,0.053011,0.2474,0.188116,0.0,0.126876,0.0055
minima_vector_cosine_dis,0.278197,0.0,0.294575,0.0,0.316881,0.0
minima_vector_cosine_dis_clus_avg,0.517223,0.0,0.299959,0.0,0.464346,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.055691,0.2641,0.063955,0.1996,0.062637,0.209
ewm_vector_cosine_dis_clus_avg,-0.117217,0.0184,-0.145686,0.0033,-0.137727,0.0056
minima_vector_cosine_dis,0.204235,0.0,0.139437,0.005,0.179119,0.0003
minima_vector_cosine_dis_clus_avg,0.210153,0.0,0.125665,0.0115,0.174835,0.0004


banori
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.165181,0.0005,0.146938,0.002,0.168059,0.0004
ewm_vector_cosine_dis_clus_avg,0.292189,0.0,0.223089,0.0,0.277017,0.0
minima_vector_cosine_dis,0.180866,0.0001,0.193644,0.0,0.202036,0.0
minima_vector_cosine_dis_clus_avg,0.014868,0.7555,0.047069,0.324,0.033746,0.4797


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.263175,0.0,0.191584,0.0001,0.242848,0.0
ewm_vector_cosine_dis_clus_avg,0.397018,0.0,0.346514,0.0,0.397269,0.0
minima_vector_cosine_dis,0.475338,0.0,0.421859,0.0,0.479397,0.0
minima_vector_cosine_dis_clus_avg,0.463553,0.0,0.427825,0.0,0.476343,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.550633,0.0,0.681569,0.0,0.649248,0.0
ewm_vector_cosine_dis_clus_avg,0.645019,0.0,0.783423,0.0,0.753119,0.0
minima_vector_cosine_dis,0.648311,0.0,0.755139,0.0,0.740971,0.0
minima_vector_cosine_dis_clus_avg,0.664474,0.0,0.768933,0.0,0.756951,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.206793,0.0,0.247999,0.0,0.239637,0.0
ewm_vector_cosine_dis_clus_avg,0.183425,0.0001,0.270367,0.0,0.239147,0.0
minima_vector_cosine_dis,0.344838,0.0,0.361712,0.0,0.372255,0.0
minima_vector_cosine_dis_clus_avg,0.415584,0.0,0.463463,0.0,0.463159,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,-0.006919,0.8929,0.038218,0.457,0.017263,0.737
ewm_vector_cosine_dis_clus_avg,-0.097763,0.0566,-0.137259,0.0073,-0.124363,0.0151
minima_vector_cosine_dis,0.48193,0.0,0.481615,0.0,0.507012,0.0
minima_vector_cosine_dis_clus_avg,0.642312,0.0,0.657875,0.0,0.684433,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.533945,0.0,0.545151,0.0,0.56706,0.0
ewm_vector_cosine_dis_clus_avg,0.550911,0.0,0.597652,0.0,0.602921,0.0
minima_vector_cosine_dis,0.598492,0.0,0.621352,0.0,0.640834,0.0
minima_vector_cosine_dis_clus_avg,0.535174,0.0,0.589144,0.0,0.590042,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.186417,0.0,0.280809,0.0,0.254364,0.0
ewm_vector_cosine_dis_clus_avg,0.149712,0.001,0.318426,0.0,0.251088,0.0
minima_vector_cosine_dis,0.31587,0.0,0.35596,0.0,0.370622,0.0
minima_vector_cosine_dis_clus_avg,0.13901,0.0023,0.288277,0.0,0.229418,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.292273,0.0,0.205452,0.0,0.259473,0.0
ewm_vector_cosine_dis_clus_avg,0.376891,0.0,0.264216,0.0,0.334213,0.0
minima_vector_cosine_dis,0.258946,0.0,0.180474,0.0003,0.229061,0.0
minima_vector_cosine_dis_clus_avg,0.216732,0.0,0.161216,0.0011,0.197125,0.0001


TASA
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.078913,0.0979,0.119032,0.0124,0.107153,0.0244
ewm_vector_cosine_dis_clus_avg,0.055622,0.2438,0.124158,0.0091,0.097675,0.0403
minima_vector_cosine_dis,0.062562,0.1897,0.136903,0.004,0.108352,0.0229
minima_vector_cosine_dis_clus_avg,0.004282,0.9285,0.121617,0.0106,0.069169,0.147


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.398248,0.0,0.361618,0.0,0.406044,0.0
ewm_vector_cosine_dis_clus_avg,0.343601,0.0,0.332048,0.0,0.361109,0.0
minima_vector_cosine_dis,0.540294,0.0,0.507181,0.0,0.559788,0.0
minima_vector_cosine_dis_clus_avg,0.348041,0.0,0.327644,0.0,0.3611,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.551792,0.0,0.681647,0.0,0.649942,0.0
ewm_vector_cosine_dis_clus_avg,0.624937,0.0,0.758934,0.0,0.729623,0.0
minima_vector_cosine_dis,0.578003,0.0,0.721794,0.0,0.684664,0.0
minima_vector_cosine_dis_clus_avg,0.620468,0.0,0.755141,0.0,0.725215,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.400337,0.0,0.423777,0.0,0.434198,0.0
ewm_vector_cosine_dis_clus_avg,0.376523,0.0,0.397623,0.0,0.407871,0.0
minima_vector_cosine_dis,0.513132,0.0,0.53201,0.0,0.550642,0.0
minima_vector_cosine_dis_clus_avg,0.627786,0.0,0.638824,0.0,0.667315,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,0.124679,0.0149,0.135386,0.0081,0.137035,0.0074
ewm_vector_cosine_dis_clus_avg,0.083577,0.1033,0.112306,0.0284,0.103579,0.0433
minima_vector_cosine_dis,0.470585,0.0,0.539324,0.0,0.532623,0.0
minima_vector_cosine_dis_clus_avg,0.554499,0.0,0.604499,0.0,0.610745,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.440331,0.0,0.547566,0.0,0.517343,0.0
ewm_vector_cosine_dis_clus_avg,0.390444,0.0,0.474435,0.0,0.453104,0.0
minima_vector_cosine_dis,0.483851,0.0,0.587052,0.0,0.561053,0.0
minima_vector_cosine_dis_clus_avg,0.44702,0.0,0.580968,0.0,0.537925,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.396963,0.0,0.395055,0.0,0.439427,0.0
ewm_vector_cosine_dis_clus_avg,0.476425,0.0,0.369054,0.0,0.474449,0.0
minima_vector_cosine_dis,0.451105,0.0,0.420399,0.0,0.484984,0.0
minima_vector_cosine_dis_clus_avg,0.551778,0.0,0.419773,0.0,0.545635,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.306015,0.0,0.283635,0.0,0.308122,0.0
ewm_vector_cosine_dis_clus_avg,0.310641,0.0,0.241466,0.0,0.288068,0.0
minima_vector_cosine_dis,0.479176,0.0,0.477488,0.0,0.500218,0.0
minima_vector_cosine_dis_clus_avg,0.477978,0.0,0.471968,0.0,0.496666,0.0


glove_6B
box


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.068798,0.1492,0.119259,0.0122,0.101937,0.0323
SemDis_cbowsubtitle_nf_m,0.13251,0.0053,0.164203,0.0005,0.160306,0.0007
SemDis_cbowBNCwikiukwac_nf_m,0.145705,0.0022,0.08802,0.0648,0.125361,0.0084
SemDis_TASA_nf_m,0.053272,0.2643,0.121129,0.0109,0.094768,0.0467
SemDis_glove_nf_m,-0.014156,0.7669,-0.010722,0.8223,-0.013374,0.7794
ewm_vector_cosine_dis,0.164957,0.0005,0.184313,0.0001,0.188502,0.0001
ewm_vector_cosine_dis_clus_avg,0.101589,0.0329,0.047399,0.3207,0.079719,0.0945
minima_vector_cosine_dis,0.367363,0.0,0.370438,0.0,0.397775,0.0
minima_vector_cosine_dis_clus_avg,0.475336,0.0,0.455076,0.0,0.501352,0.0


brick


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.146538,0.0036,0.114403,0.0233,0.139375,0.0056
SemDis_cbowsubtitle_nf_m,0.213515,0.0,0.173665,0.0005,0.206827,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.23533,0.0,0.14527,0.0039,0.20315,0.0
SemDis_TASA_nf_m,0.348638,0.0,0.322109,0.0,0.358441,0.0
SemDis_glove_nf_m,0.069156,0.1712,0.029644,0.5579,0.052684,0.2975
ewm_vector_cosine_dis,0.147614,0.0034,0.119275,0.018,0.142567,0.0046
ewm_vector_cosine_dis_clus_avg,0.176431,0.0004,0.172237,0.0006,0.186356,0.0002
minima_vector_cosine_dis,0.519981,0.0,0.502631,0.0,0.546548,0.0
minima_vector_cosine_dis_clus_avg,0.45241,0.0,0.441343,0.0,0.477691,0.0


chair


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.502675,0.0,0.646162,0.0,0.604567,0.0
SemDis_cbowsubtitle_nf_m,0.569665,0.0,0.695433,0.0,0.666886,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.55032,0.0,0.678645,0.0,0.647622,0.0
SemDis_TASA_nf_m,0.563739,0.0,0.695213,0.0,0.663424,0.0
SemDis_glove_nf_m,0.433389,0.0,0.589843,0.0,0.537458,0.0
ewm_vector_cosine_dis,0.47085,0.0,0.575954,0.0,0.551778,0.0
ewm_vector_cosine_dis_clus_avg,0.451027,0.0,0.563904,0.0,0.53459,0.0
minima_vector_cosine_dis,0.652504,0.0,0.738036,0.0,0.734872,0.0
minima_vector_cosine_dis_clus_avg,0.634289,0.0,0.748395,0.0,0.729695,0.0


cup


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.34857,0.0,0.423089,0.0,0.406603,0.0
SemDis_cbowsubtitle_nf_m,0.340794,0.0,0.397972,0.0,0.38926,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.209724,0.0,0.27178,0.0,0.253727,0.0
SemDis_TASA_nf_m,0.383622,0.0,0.420388,0.0,0.423617,0.0
SemDis_glove_nf_m,0.047687,0.3172,0.127391,0.0073,0.092302,0.0525
ewm_vector_cosine_dis,0.157397,0.0009,0.187723,0.0001,0.181848,0.0001
ewm_vector_cosine_dis_clus_avg,0.020427,0.6684,0.080752,0.0899,0.053354,0.263
minima_vector_cosine_dis,0.438586,0.0,0.438147,0.0,0.461901,0.0
minima_vector_cosine_dis_clus_avg,0.637822,0.0,0.625918,0.0,0.665785,0.0


key


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,-0.037084,0.4705,0.002113,0.9672,-0.017713,0.7304
SemDis_cbowsubtitle_nf_m,-0.008231,0.8728,-0.007403,0.8855,-0.008212,0.8731
SemDis_cbowBNCwikiukwac_nf_m,-0.138714,0.0067,-0.141125,0.0058,-0.147294,0.004
SemDis_TASA_nf_m,0.022692,0.6588,0.015431,0.764,0.019933,0.6981
SemDis_glove_nf_m,-0.219126,0.0,-0.221757,0.0,-0.232039,0.0
ewm_vector_cosine_dis,-0.182525,0.0003,-0.124344,0.0152,-0.160452,0.0017
ewm_vector_cosine_dis_clus_avg,-0.283151,0.0,-0.299433,0.0,-0.306842,0.0
minima_vector_cosine_dis,0.555661,0.0,0.512357,0.0,0.56123,0.0
minima_vector_cosine_dis_clus_avg,0.606108,0.0,0.530259,0.0,0.596623,0.0


pencil


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.469966,0.0,0.507451,0.0,0.513123,0.0
SemDis_cbowsubtitle_nf_m,0.480507,0.0,0.505043,0.0,0.517638,0.0
SemDis_cbowBNCwikiukwac_nf_m,0.48607,0.0,0.548597,0.0,0.542755,0.0
SemDis_TASA_nf_m,0.4087,0.0,0.52413,0.0,0.488243,0.0
SemDis_glove_nf_m,0.406042,0.0,0.427526,0.0,0.4378,0.0
ewm_vector_cosine_dis,0.428244,0.0,0.433819,0.0,0.453072,0.0
ewm_vector_cosine_dis_clus_avg,0.520993,0.0,0.506664,0.0,0.540491,0.0
minima_vector_cosine_dis,0.523,0.0,0.542419,0.0,0.559718,0.0
minima_vector_cosine_dis_clus_avg,0.512729,0.0,0.586955,0.0,0.576718,0.0


rope


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.095047,0.0378,0.157097,0.0006,0.136705,0.0027
SemDis_cbowsubtitle_nf_m,0.000216,0.9962,0.107911,0.0183,0.054496,0.2343
SemDis_cbowBNCwikiukwac_nf_m,0.097041,0.0339,0.179498,0.0001,0.149199,0.0011
SemDis_TASA_nf_m,0.277842,0.0,0.306497,0.0,0.322673,0.0
SemDis_glove_nf_m,0.093323,0.0414,0.157785,0.0005,0.136008,0.0029
ewm_vector_cosine_dis,0.258818,0.0,0.294487,0.0,0.305101,0.0
ewm_vector_cosine_dis_clus_avg,0.328442,0.0,0.387055,0.0,0.393901,0.0
minima_vector_cosine_dis,0.457058,0.0,0.408301,0.0,0.482494,0.0
minima_vector_cosine_dis_clus_avg,0.449109,0.0,0.270867,0.0,0.40844,0.0


shoe


Unnamed: 0_level_0,novelty_1,n1_pval,novelty_2,n2_pval,novelty_m,nm_pval
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SemDis_cbowukwacsubtitle_nf_m,0.128026,0.01,0.11479,0.021,0.126847,0.0107
SemDis_cbowsubtitle_nf_m,0.040856,0.4128,0.035679,0.4745,0.039973,0.423
SemDis_cbowBNCwikiukwac_nf_m,0.242609,0.0,0.161295,0.0011,0.210464,0.0
SemDis_TASA_nf_m,0.196974,0.0001,0.185345,0.0002,0.199807,0.0001
SemDis_glove_nf_m,0.25309,0.0,0.233486,0.0,0.25425,0.0
ewm_vector_cosine_dis,0.193659,0.0001,0.181691,0.0002,0.19616,0.0001
ewm_vector_cosine_dis_clus_avg,0.14265,0.0041,0.107872,0.0302,0.130682,0.0085
minima_vector_cosine_dis,0.376204,0.0,0.346269,0.0,0.377506,0.0
minima_vector_cosine_dis_clus_avg,0.36606,0.0,0.346426,0.0,0.372377,0.0


In [45]:
def write_compiled_corrs():
    writer = pd.ExcelWriter('novelty_algos_compiled_corrs_results_070921.xlsx', engine='xlsxwriter')
    workbook = writer.book

    for y in range(len(sem_space_list)):
        row = 0
        worksheet = workbook.add_worksheet(sem_space_list_str[y])
        writer.sheets[sem_space_list_str[y]] = worksheet
        for i in range(len(prompts_list)):
            worksheet.write_string(row, 0, prompts_list[i])
            results_df = results_dict[prompts_list[i] + underscore + sem_space_list_str[y] + underscore + "results"]
            scores_df = results_df.iloc[:, [18,19,20,8,9,10,11,12,14,15,16,17]]
            pval_df = calculate_pvalues(scores_df)
            scores_df = scores_df.corr()
            corrs_df = pd.DataFrame(scores_df.iloc[3:, 0:3])
            pval_df = pd.DataFrame(pval_df.iloc[3:, 0:3])
            pval_df.columns = ['n1_pval', 'n2_pval', 'nm_pval']
            corr_pval_df = pd.concat([corrs_df, pval_df], axis=1)
            corr_pval_df = corr_pval_df[['novelty_1','n1_pval', 'novelty_2', 'n2_pval', 'novelty_m', 'nm_pval']]
            corr_pval_df.index.rename('metrics', inplace=True)
            corr_pval_df.to_excel(writer, sheet_name = sem_space_list_str[y], startrow=row + 1, startcol=0, index = True)
            row = row + len(corr_pval_df.index) + 3
            
    writer.save()

In [46]:
# write_compiled_corrs()

## ICC Test 

In [47]:
def get_id_list_overlaps():
    id_lists = []
    for i in range(len(data_list)):
        id_lists.append(get_id_list(data_list[i]))
    return list(reduce(set.intersection, [set(item) for item in id_lists]))

In [98]:
def icc_calc_one_semspace_df(sem_space):
    icc_list_ewm = []
    icc_list_ewm_clust = []
    icc_list_min = []
    icc_list_min_clust = []
    id_list = get_id_list_overlaps()
    for i in range(len(prompts_list)):
        results_df = results_dict[prompts_list[i] + underscore + sem_space + underscore + "results"]
        for participant in id_list:
            id_df = results_df.loc[results_df['id'] == participant]
            
            avg_ewm = id_df["ewm_vector_cosine_dis"].mean()
            list_ewm = [participant, prompts_list[i], avg_ewm]
            icc_list_ewm.append(list_ewm)
            
            avg_ewm_clust = id_df["ewm_vector_cosine_dis"].mean()
            list_ewm_clust = [participant, prompts_list[i], avg_ewm_clust]
            icc_list_ewm_clust.append(list_ewm_clust)
            
            avg_min = id_df["minima_vector_cosine_dis"].mean()
            list_min = [participant, prompts_list[i], avg_min]
            icc_list_min.append(list_min)
            
            avg_min_clust = id_df["minima_vector_cosine_dis_clus_avg"].mean()
            list_min_clust = [participant, prompts_list[i], avg_min_clust]
            icc_list_min_clust.append(list_min_clust)
        
    icc_df_ewm = pd.DataFrame(icc_list_ewm, columns=['id', 'prompt', 'rating'])
    icc_df_ewm_clust = pd.DataFrame(icc_list_ewm_clust, columns=['id', 'prompt', 'rating'])
    icc_df_min = pd.DataFrame(icc_list_min, columns=['id', 'prompt', 'rating'])
    icc_df_min_clust = pd.DataFrame(icc_list_min_clust, columns=['id', 'prompt', 'rating'])
    
    icc_df_ewm_results = pg.intraclass_corr(data=icc_df_ewm, targets='id', raters='prompt', ratings='rating')
    icc_df_ewm_clust_results = pg.intraclass_corr(data=icc_df_ewm_clust, targets='id', raters='prompt', ratings='rating')
    icc_df_min_results = pg.intraclass_corr(data=icc_df_min, targets='id', raters='prompt', ratings='rating')
    icc_df_min_clust_results = pg.intraclass_corr(data=icc_df_min_clust, targets='id', raters='prompt', ratings='rating')
    
    results_df_list = [icc_df_ewm_results.loc[icc_df_ewm_results['Type'] == 'ICC3k'], 
                      icc_df_ewm_clust_results.loc[icc_df_ewm_clust_results['Type'] == 'ICC3k'],
                      icc_df_min_results.loc[icc_df_min_results['Type'] == 'ICC3k'],
                      icc_df_min_clust_results.loc[icc_df_min_clust_results['Type'] == 'ICC3k']]
    
    icc_results = pd.concat(results_df_list, ignore_index=True)
    icc_results.index = ['ewm','ewm_clust','minvec','minvec_clust']
    icc_results.rename_axis('metrics')
    del icc_results['Description']
    del icc_results['Type']
 
    return icc_results

In [99]:
def icc_calc_all_semspace_df():
    writer = pd.ExcelWriter('icc_methods_results.xlsx', engine='xlsxwriter')
    workbook = writer.book
    worksheet = workbook.add_worksheet('Result')
    sheets = 'ICC Results'
    writer.sheets[sheets] = worksheet
    row = 0
    
    for y in range(len(sem_space_list)):
        print(sem_space_list_str[y])
        worksheet.write_string(row, 0, sem_space_list_str[y])
        icc_results = icc_calc_one_semspace_df(sem_space_list_str[y])
        display(icc_results)
        icc_results.to_excel(writer,sheet_name=sheets, startrow=row + 1, startcol=0)
        row = row + len(icc_results.index) + 3
        
    writer.save()

In [100]:
icc_calc_all_semspace_df()

ukwac_subtitles


Unnamed: 0,ICC,F,df1,df2,pval,CI95%
ewm,0.642997,2.801094,86,602,3.428044e-13,"[0.52, 0.75]"
ewm_clust,0.642997,2.801094,86,602,3.428044e-13,"[0.52, 0.75]"
minvec,0.665717,2.991476,86,602,7.157401e-15,"[0.55, 0.76]"
minvec_clust,0.644877,2.815929,86,602,2.540569e-13,"[0.52, 0.75]"


cbow_subtitles


Unnamed: 0,ICC,F,df1,df2,pval,CI95%
ewm,0.523451,2.098421,86,602,2.685809e-07,"[0.36, 0.66]"
ewm_clust,0.523451,2.098421,86,602,2.685809e-07,"[0.36, 0.66]"
minvec,0.590131,2.439802,86,602,4.416559e-10,"[0.45, 0.71]"
minvec_clust,0.665595,2.990388,86,602,7.318465e-15,"[0.55, 0.76]"


banori


Unnamed: 0,ICC,F,df1,df2,pval,CI95%
ewm,0.551909,2.231691,86,602,2.321309e-08,"[0.39, 0.68]"
ewm_clust,0.551909,2.231691,86,602,2.321309e-08,"[0.39, 0.68]"
minvec,0.485756,1.944601,86,602,4.061356e-06,"[0.3, 0.63]"
minvec_clust,0.539448,2.171306,86,602,7.108536e-08,"[0.38, 0.67]"


TASA


Unnamed: 0,ICC,F,df1,df2,pval,CI95%
ewm,0.49201,1.968544,86,602,2.684526e-06,"[0.31, 0.64]"
ewm_clust,0.49201,1.968544,86,602,2.684526e-06,"[0.31, 0.64]"
minvec,0.55378,2.241046,86,602,1.949188e-08,"[0.4, 0.68]"
minvec_clust,0.479332,1.920611,86,602,6.127414e-06,"[0.3, 0.63]"


glove_6B


Unnamed: 0,ICC,F,df1,df2,pval,CI95%
ewm,0.575678,2.356699,86,602,2.187161e-09,"[0.43, 0.7]"
ewm_clust,0.575678,2.356699,86,602,2.187161e-09,"[0.43, 0.7]"
minvec,0.513065,2.053663,86,602,5.998639e-07,"[0.34, 0.65]"
minvec_clust,0.562481,2.285617,86,602,8.438596e-09,"[0.41, 0.69]"


In [81]:
icc_df_ukwac_subtitles = icc_calc_one_semspace_df('ukwac_subtitles')
icc_df_ukwac_subtitles

Unnamed: 0,Type,ICC,F,df1,df2,pval,CI95%
ewm,ICC3k,0.642997,2.801094,86,602,3.428044e-13,"[0.52, 0.75]"
ewm_clust,ICC3k,0.642997,2.801094,86,602,3.428044e-13,"[0.52, 0.75]"
minvec,ICC3k,0.665717,2.991476,86,602,7.157401e-15,"[0.55, 0.76]"
minvec_clust,ICC3k,0.644877,2.815929,86,602,2.540569e-13,"[0.52, 0.75]"


Algo Design Brainstorming:
* Word2vec, average similarity between words in two responses
* Problems:
    * phrases vs words
    * compare between just two responses or the whole list of responses
* Algo Idea
    * find similarity between prompt and average of word in response
    * the average is the value that represents novelty
* Things to Look Into:
    * doc2Vec - look into sentence vector
        * follow similar logic to above but no need to average for multiple words
    * Word2Vec + SIF + Cosine Similarity
    * Word2Vec + WMD 
* Course of Actions for Missing Words/Misspellings?
    * hand remove?
* What to do when whole phrase stop words
    * make 0

To Do List
- [x] write preprocessing methods
- [x] write out initial algo
- [x] implement first algo idea
- [x] set up work environment on macbook
- [x] look into Word2Vec + SIF + Cosine Similarity
- [x] set up excel sheet download with different sheets
- [x] use ukwac semantic space for cosine distance
- [x] set up element wise multiplied vectors for cosine distance algo
- [x] set up phrase minima vector for cosine distance algo 
- [x] Compare the ewm and phrase minima algos with SemDis
- [x] Get a sense of the effect of the compositions
- [x] 1 - to get the distance
- [x] Remove the word could 
    - had to lowercase first and then remove stop words
- [x] write clustering method
    - count vectorizer and kmeans
- [x] update clustering method to use cosine distance not euclidean 
    - changed from scikit learn to cosine distance from NLTK 
- [x] write algo to average novelty scores of responses in the same category
    - do for both ewm and minima
- [x] use the elbow method to figure out how many clusters to use
- [x] download and upload all the semantic spaces from SemDis
- [x] update methods to change semantic spaces
- [x] copy semantic spaces to mac
- [x] set up scripts to pass in official data for pc and mac
- [x] add .DS_Store to gitignore
- [x] get SemDis results for all 6 different semantic spaces with the official data
- [x] compare performance on the 6 different semantic spaces
    - correlation tests for coefficient and significance
- [x] update stop words list to include "use" and "thing"
- [ ] figure out how many times to run kmeans
    - cross validation
    - then averaging the results of all the iterations
- [ ] figure out way to automate the number of clusters used in each run
 
Don't look into unless you have time
- [ ] look into doc2vec 
- [ ] look into Word2Vec + WMD 
- [ ] look at https://github.com/PrincetonML/SIF for better SIF 
