In [2]:
import csv
import sys
import os
import pickle
import random
import numpy as np
import time
import operator
import seaborn as sns
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

from collections import Counter

import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.api as sm

import scipy.stats
from scipy.stats import entropy
from scipy.stats import spearmanr
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [3]:
sys.path.append('../python')
from permutation import permutation, compute_alignment_strength
from utils.utils_funcs import get_variance, get_distinctness_from_nearest_5
from utils.utils_funcs import load_data_to_dict

In [4]:
# aggregate both visual embeddings and language embeddings
def aggregate_embeddings_visual_and_language(input_struct,n_sample_per_visual,n_sample_per_language):
	words=input_struct['words']
	embed_dict=dict()
	for word in words:
		n_sample_v = min(len(input_struct['embeds'][word]['visual']), n_sample_per_visual)
		visual_temp=np.mean(np.array(random.sample(input_struct['embeds'][word]['visual'],n_sample_v)),axis=0)
		n_sample_l = min(len(input_struct['embeds'][word]['language']), n_sample_per_language)
		language_temp=np.mean(np.array(random.sample(input_struct['embeds'][word]['language'],n_sample_l)),axis=0)
		#embed_dict[word]=dict(visual=np.expand_dims(visual_temp,axis=0),language=language_temp)
		embed_dict[word]=dict(visual=visual_temp,language=np.squeeze(language_temp))
	return dict(embeds=embed_dict,words=words)

In [5]:
def get_concept_level_alignment_strength(struct,concepts,targets):
    visual_embeddings_list=list()
    langauge_embeddings_list=list()
    for concept in concepts:
        embeddings=struct["embeds"][concept]
        visual_embeddings_list.append(embeddings["visual"])
        langauge_embeddings_list.append(embeddings["language"])
    z_0=np.array(visual_embeddings_list)
    z_1=np.array(langauge_embeddings_list)
    sim_z_0=cosine_similarity(z_0)
    sim_z_1=cosine_similarity(z_1)
    rt=list()
    for target in targets:
        idx=concepts.index(target)
        rt.append(spearmanr(sim_z_0[idx],sim_z_1[idx])[0])
    return rt

In [6]:
def get_variance(struct):
    words=struct["words"]
    visual_variability=dict()
    language_variance=dict()
    for word in words:
        visual_embeddings=struct["embeds"][word]["visual"]
        visual_variability[word]=np.mean(np.linalg.norm(visual_embeddings-np.mean(visual_embeddings,axis=0),axis=1))
        language_embeddings=struct["embeds"][word]["language"]
        language_variance[word]=np.mean(np.linalg.norm(language_embeddings-np.mean(language_embeddings,axis=0),axis=1))
    return visual_variability,language_variance

In [7]:
def get_distinctness_from_nearest_5(struct):
    words=struct["words"]
    visual_centers=dict()
    language_centers=dict()
    for word in words:
        visual_embeddings=np.array(struct["embeds"][word]["visual"])
        visual_centers[word]=np.mean(visual_embeddings,axis=0)
        language_embeddings=np.array(struct["embeds"][word]["language"])
        language_centers[word]=np.mean(language_embeddings,axis=0)
    visual_discriminability=dict()
    language_distinctness=dict()
    for word in words:
        visual_distances=[np.linalg.norm(visual_centers[word]-visual_centers[temp]) for temp in words]
        visual_discriminability[word]=np.sum(sorted(visual_distances)[:6])/5
        language_distances=[np.linalg.norm(language_centers[word]-language_centers[temp]) for temp in words]
        language_distinctness[word]=np.sum(sorted(language_distances)[:6])/5
    return visual_discriminability,language_distinctness

In [8]:
def combine_all_data_2(data,aoa_dict,v_var_dict,l_var_dict,v_dist_dict,l_dist_dict):
    aoas=list()
    v_vars=list()
    l_vars=list()
    v_dists=list()
    l_dists=list()
    
    intersect_concepts=list()
    for concept in aoa_dict:
        if concept in data['words']:
            aoas.append(aoa_dict[concept])
            v_vars.append(v_var_dict[concept])
            l_vars.append(l_var_dict[concept])
            v_dists.append(v_dist_dict[concept])
            l_dists.append(l_dist_dict[concept])
            intersect_concepts.append(concept)
    alignments=get_concept_level_alignment_strength(data,data['words'],intersect_concepts)
    return aoas,alignments,v_vars,l_vars,v_dists,l_dists

In [None]:
def combine_all_data(data,aoa_dict,v_var_dict,l_var_dict,v_dist_dict,l_dist_dict,freq_dict):
    aoas=list()
    v_vars=list()
    l_vars=list()
    v_dists=list()
    l_dists=list()
    freqs=list()
    
    intersect_concepts=list()
    for concept in aoa_dict:
        if concept in data['words']:
            aoas.append(aoa_dict[concept])
            v_vars.append(v_var_dict[concept])
            l_vars.append(l_var_dict[concept])
            v_dists.append(v_dist_dict[concept])
            l_dists.append(l_dist_dict[concept])
            intersect_concepts.append(concept)
    alignments=get_concept_level_alignment_strength(data,data['words'],intersect_concepts)
    return aoas,alignments,v_vars,l_vars,v_dists,l_dists

In [9]:
vg_noun_data=pickle.load(open("../data/dumped_embeddings/vg_noun_swav_bert_window5_20.pkl","rb"))
vg_verb_data=pickle.load(open("../data/dumped_embeddings/vg_verb_swav_bert_window5_20.pkl","rb"))

aggregated_vg_noun_data=aggregate_embeddings_visual_and_language(vg_noun_data,20,20)
aggregated_vg_verb_data=aggregate_embeddings_visual_and_language(vg_verb_data,20,20)

In [10]:
# vg_noun_data=pickle.load(open("../data/dumped_embeddings/vg_noun_ll_swav_bert_20.pkl","rb"))
# vg_verb_data=pickle.load(open("../data/dumped_embeddings/vg_verb_ll_swav_bert_20.pkl","rb"))

# aggregated_vg_noun_data=aggregate_embeddings_visual_and_language(vg_noun_data,20,20)
# aggregated_vg_verb_data=aggregate_embeddings_visual_and_language(vg_verb_data,20,20)

In [11]:
# vg_noun_data=pickle.load(open("../data/dumped_embeddings/vg_noun_concept_least20_swav_bert_20.pkl","rb"))
# vg_verb_data=pickle.load(open("../data/dumped_embeddings/vg_verb_concept_least20_swav_bert_20.pkl","rb"))

# aggregated_vg_noun_data=aggregate_embeddings_visual_and_language(vg_noun_data,20,20)
# aggregated_vg_verb_data=aggregate_embeddings_visual_and_language(vg_verb_data,20,20)

In [12]:
# vg_noun_data=pickle.load(open("../data/dumped_embeddings/vg_noun_concept_least20_swav_glove_20.pkl","rb"))
# vg_verb_data=pickle.load(open("../data/dumped_embeddings/vg_verb_concept_least20_swav_glove_20.pkl","rb"))

# aggregated_vg_noun_data=aggregate_embeddings_visual_and_language(vg_noun_data,20,1)
# aggregated_vg_verb_data=aggregate_embeddings_visual_and_language(vg_verb_data,20,1)

----

In [13]:
noun_aoa_dict=load_data_to_dict("../data/processed/aoa/kuperman_noun_aoa.txt",concepts=vg_noun_data["words"])
verb_aoa_dict=load_data_to_dict("../data/processed/aoa/kuperman_verb_aoa.txt",concepts=vg_verb_data["words"])

noun_aoa_sorted = sorted(noun_aoa_dict.items(), key=operator.itemgetter(1))
verb_aoa_sorted = sorted(verb_aoa_dict.items(), key=operator.itemgetter(1))

In [14]:
# noun_aoa_dict=load_data_to_dict("../data/processed/aoa/wordbank_noun_aoa.txt",concepts=vg_noun_data["words"])
# verb_aoa_dict=load_data_to_dict("../data/processed/aoa/wordbank_verb_aoa.txt",concepts=vg_verb_data["words"])

# noun_aoa_sorted = sorted(noun_aoa_dict.items(), key=operator.itemgetter(1))
# verb_aoa_sorted = sorted(verb_aoa_dict.items(), key=operator.itemgetter(1))

In [15]:
# noun_aoa_dict=load_data_to_dict("../data/processed/aoa/glasgow_noun_aoa.txt",concepts=vg_noun_data["words"])
# verb_aoa_dict=load_data_to_dict("../data/processed/aoa/glasgow_verb_aoa.txt",concepts=vg_verb_data["words"])

# noun_aoa_sorted = sorted(noun_aoa_dict.items(), key=operator.itemgetter(1))
# verb_aoa_sorted = sorted(verb_aoa_dict.items(), key=operator.itemgetter(1))

----

In [16]:
noun_visual_variability,noun_language_variance=get_variance(vg_noun_data)
verb_visual_variability,verb_language_variance=get_variance(vg_verb_data)

In [17]:
noun_visual_discriminability,noun_language_distinctness=get_distinctness_from_nearest_5(vg_noun_data)
verb_visual_discriminability,verb_language_distinctness=get_distinctness_from_nearest_5(vg_verb_data)

----

In [18]:
aoas,alignments,v_vars,l_vars,v_dists,l_dists=combine_all_data(aggregated_vg_noun_data,noun_aoa_dict,
                                                              noun_visual_variability,noun_language_variance,
                                                              noun_visual_discriminability,noun_language_distinctness)

In [19]:
df1=pd.DataFrame(data=np.transpose(np.array([aoas,alignments,v_vars,l_vars,v_dists,l_dists])),columns=['aoa','alignment','v_variability','l_variability','v_discriminability','l_discriminability'])
df1['type']='noun'

In [20]:
df1

Unnamed: 0,aoa,alignment,v_variability,l_variability,v_discriminability,l_discriminability,type
0,3.94,0.217737,0.859551,0.295025,0.361048,8.626495,noun
1,3.94,0.235367,0.593261,0.200954,0.627667,8.671186,noun
2,2.89,0.317529,0.870794,0.232416,0.418797,8.459664,noun
3,4.15,0.153757,0.742962,0.238987,0.441333,8.855353,noun
4,3.26,0.357621,0.835725,0.235835,0.296650,7.827719,noun
...,...,...,...,...,...,...,...
202,4.58,0.211939,0.912326,0.244061,0.359449,8.217980,noun
203,4.42,0.196053,0.736928,0.244147,0.322387,8.973331,noun
204,5.94,0.330452,0.603568,0.227072,0.251161,8.387299,noun
205,4.77,0.097514,0.761205,0.228313,0.368581,8.701978,noun


In [21]:
aoas,alignments,v_vars,l_vars,v_dists,l_dists=combine_all_data(aggregated_vg_verb_data,verb_aoa_dict,
                                                              verb_visual_variability,verb_language_variance,
                                                              verb_visual_discriminability,verb_language_distinctness)

In [22]:
df2=pd.DataFrame(data=np.transpose(np.array([aoas,alignments,v_vars,l_vars,v_dists,l_dists])),columns=['aoa','alignment','v_variability','l_variability','v_discriminability','l_discriminability'])
df2['type']='verb'

In [23]:
df2

Unnamed: 0,aoa,alignment,v_variability,l_variability,v_discriminability,l_discriminability,type
0,13.36,0.154660,0.825772,0.153138,0.492119,9.991625,verb
1,8.53,0.088723,0.862767,0.217756,0.319524,8.425980,verb
2,5.10,0.113732,0.772262,0.236029,0.518686,8.071630,verb
3,8.43,0.085468,0.874238,0.196295,0.313468,8.182177,verb
4,9.63,0.265958,0.828123,0.145860,0.360399,10.526436,verb
...,...,...,...,...,...,...,...
202,4.63,0.137376,0.846328,0.258009,0.297241,8.541475,verb
203,7.11,0.183319,0.920983,0.209003,0.316375,7.801135,verb
204,5.86,0.189100,0.822247,0.253786,0.311517,7.794843,verb
205,4.89,0.024849,0.924379,0.206177,0.311869,7.851201,verb


In [24]:
df=pd.concat([df1,df2])

In [25]:
df

Unnamed: 0,aoa,alignment,v_variability,l_variability,v_discriminability,l_discriminability,type
0,3.94,0.217737,0.859551,0.295025,0.361048,8.626495,noun
1,3.94,0.235367,0.593261,0.200954,0.627667,8.671186,noun
2,2.89,0.317529,0.870794,0.232416,0.418797,8.459664,noun
3,4.15,0.153757,0.742962,0.238987,0.441333,8.855353,noun
4,3.26,0.357621,0.835725,0.235835,0.296650,7.827719,noun
...,...,...,...,...,...,...,...
202,4.63,0.137376,0.846328,0.258009,0.297241,8.541475,verb
203,7.11,0.183319,0.920983,0.209003,0.316375,7.801135,verb
204,5.86,0.189100,0.822247,0.253786,0.311517,7.794843,verb
205,4.89,0.024849,0.924379,0.206177,0.311869,7.851201,verb


In [28]:
df['type']=df['type'].astype("category")

In [29]:
df.dtypes

aoa                    float64
alignment              float64
v_variability          float64
l_variability          float64
v_discriminability     float64
l_discriminability     float64
type                  category
dtype: object

In [30]:
x=df.iloc[:,1:]

In [31]:
y=df.iloc[:,:1]

In [32]:
test_size=0.33
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size)

In [36]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, tree_method="hist", enable_categorical=True)

In [37]:
xg_reg

XGBRegressor(alpha=10, base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3, early_stopping_rounds=None,
             enable_categorical=True, eval_metric=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, n_estimators=10, n_jobs=None,
             num_parallel_tree=None, objective='reg:linear', predictor=None,
             random_state=None, ...)

In [38]:
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)



In [39]:
preds

array([3.623076 , 4.0302052, 3.623076 , 3.623076 , 3.623076 , 4.0302052,
       4.0302052, 3.623076 , 3.623076 , 3.623076 , 3.3978748, 4.0302052,
       4.0302052, 4.0302052, 4.0302052, 3.623076 , 3.623076 , 3.623076 ,
       4.0302052, 3.623076 , 3.623076 , 3.623076 , 3.623076 , 3.623076 ,
       3.623076 , 3.623076 , 4.0302052, 3.623076 , 4.0302052, 4.0302052,
       3.623076 , 4.0302052, 3.623076 , 4.0302052, 3.623076 , 4.0302052,
       4.0302052, 4.0302052, 3.3978748, 3.623076 , 3.3978748, 4.0302052,
       3.623076 , 4.0302052, 3.623076 , 3.828522 , 3.623076 , 3.623076 ,
       3.623076 , 4.0302052, 4.0302052, 3.623076 , 3.623076 , 4.0302052,
       3.623076 , 3.623076 , 4.0302052, 4.0302052, 4.0302052, 4.0302052,
       3.623076 , 4.0302052, 4.0302052, 4.0302052, 4.0302052, 4.0302052,
       3.623076 , 4.0302052, 4.0302052, 4.0302052, 3.623076 , 4.0302052,
       4.0302052, 3.623076 , 4.0302052, 3.623076 , 3.623076 , 3.623076 ,
       4.0302052, 4.0302052, 3.3978748, 4.0302052, 

In [40]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 2.934748
