In [None]:
import numpy as np
import torch
import ot
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
import sys
sys.path.append('../code/')
from HierarchicalOT import load_wmd_data,change_embeddings,reduce_vocab,fit_topics,sparseOT
from sinkhorn_iterates import sinkhorn
from linear_solver import UOT_W

In [None]:
# Locations of the WMD benchmark datasets and of the pre-trained GloVe vectors.
data_path = './data/WordMoverDistances/'
embeddings_path = './data/WordMoverDistances/glove.6B/glove.6B.300d.txt'

# Pick a dataset (n_doc,n_vocab)
#data_name = 'bbcsport-emd_tr_te_split.mat' #(737,3657)
data_name = 'twitter-emd_tr_te_split.mat' #(3108, 1205)
#data_name = 'r8-emd_tr_te3.mat' # (7674,5495)
#data_name = 'amazon-emd_tr_te_split.mat' #(8000, 16753)
#data_name = 'classic-emd_tr_te_split.mat' # (7093, 5813)
#data_name = 'ohsumed-emd_tr_te_ix.mat'# (9152, 8261)

# Load the selected dataset, then shift the labels down by one
# (presumably from 1-based to 0-based class ids -- confirm against the data).
dataset_file = data_path + data_name
vocab, embed_vocab, bow_data, y = load_wmd_data(dataset_file)
y = y - 1

# Swap the embeddings shipped with the dataset for the GloVe ones.
vocab, embed_vocab, bow_data = change_embeddings(vocab, bow_data, embeddings_path)

# Reduce the vocabulary; per the original author's note this removes short
# words and stop words and applies stemming (see reduce_vocab for details).
vocab, embed_vocab, bow_data = reduce_vocab(
    bow_data,
    vocab,
    embed_vocab,
    embed_aggregate='mean',
)

# One embedding row per vocabulary word, in the order of `vocab`
# (presumably nb_vocab x 300 for the 300d GloVe file -- confirm).
embeddings = np.array([embed_vocab[word] for word in vocab])

# Ground cost between words: squared Euclidean distance between their
# embeddings, i.e. an nb_vocab x nb_vocab matrix.
cost_embeddings = np.square(euclidean_distances(embeddings, embeddings))

# Word Mover's Distance (WMD)

In [None]:
# Pairwise Word Mover's Distance between all documents, computed with the
# exact OT solver ot.emd2 on the ground cost cost_embeddings.
n_docs = bow_data.shape[0]
WMD = np.zeros((n_docs, n_docs))
for i in range(n_docs):
    print(i,end=" ")  # progress: index of the document currently processed
    # Only the strict upper triangle is filled; it is mirrored afterwards.
    for j in range(i + 1, n_docs):
        weights_i, weights_j, cost_ij = sparseOT(bow_data[i], bow_data[j], cost_embeddings)
        WMD[i, j] = ot.emd2(weights_i, weights_j, cost_ij)

# Mirror the upper triangle to obtain the full symmetric matrix (zero diagonal).
WMD = np.add(WMD, WMD.T)

wmd_out_file = "result/WordMoverDistances/WMD_"+data_name+".txt"
np.savetxt(wmd_out_file, WMD)

# HOTT

In [None]:
# Number of LDA topics requested from fit_topics (passed as K).
n_topics = 70
topics, lda_centers, topic_proportions = fit_topics(bow_data, embeddings, vocab, K=n_topics)

# Reduce topics to top-20 words via threshold: in every topic row, zero all
# entries except the n_words_keep largest ones.
# topics is now sparse (and its rows are no longer normalized).
n_words_keep = 20
if n_words_keep is not None:
    # Iterate over the rows actually returned instead of a hard-coded 70.
    for k in range(topics.shape[0]):
        to_0_idx = np.argsort(-topics[k])[n_words_keep:]
        topics[k][to_0_idx] = 0

# Compute WD for the unnormalized topics[i], topics[j].
# It then defines the cost matrix for distributions on topics.
cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
for i in range(cost_topics.shape[0]):
    for j in range(i + 1, cost_topics.shape[1]):
        # Fix: transport between topics i and j. The previous code passed
        # bow_data[i], bow_data[j], i.e. the first documents, which produced
        # a cost matrix unrelated to the topics.
        a,b,C=sparseOT(topics[i], topics[j], cost_embeddings)
        cost_topics[i,j]=ot.emd2(a,b,C)
cost_topics=cost_topics+cost_topics.T #sparse_ot is symmetric

# HOTT: OT between the topic proportions of two documents, using the
# topic-to-topic cost computed above as ground cost.
n_docs = bow_data.shape[0]
HOTT = np.zeros((n_docs, n_docs))
for i in range(n_docs):
    print(i,end=" ")  # progress: index of the document currently processed
    # Only the strict upper triangle is filled; it is mirrored afterwards.
    for j in range(i + 1, n_docs):
        a,b,C=sparseOT(topic_proportions[i], topic_proportions[j], cost_topics)
        HOTT[i,j]=ot.emd2(a,b,C)
HOTT= HOTT+HOTT.T
np.savetxt("result/WordMoverDistances/HOTT_"+data_name+".txt",HOTT)

# HROT

In [None]:
# Linear Solver
# For every penalization weight in lam, compute two pairwise document
# matrices with the UOT_W solver and save them:
#   WMDuot1: transport cost of the plan only, sum(P * C)
#   WMDuot2: plan cost plus l * (sum(Qx * Cx) + sum(Qy * Cy))
# NOTE(review): the computation loop below had been commented out, so this cell
# was writing all-zero matrices to the result files. It is restored here;
# expect a long run time (one solver call per document pair and per weight).
lam=[.1,.5,1,10]
n_docs = bow_data.shape[0]
for l in lam:
    WMDuot1=np.zeros((n_docs,n_docs))
    WMDuot2=np.zeros((n_docs,n_docs))
    for i in range(n_docs):
        print(i,end=" ")  # progress: index of the document currently processed
        # Cost restricted to document i against itself; it does not depend
        # on j, so it is computed once per outer iteration.
        _,_,Cx=sparseOT(bow_data[i], bow_data[i], cost_embeddings)
        # Only the strict upper triangle is filled; it is mirrored afterwards.
        for j in range(i + 1, n_docs):
            a,b,C=sparseOT(bow_data[i], bow_data[j], cost_embeddings)
            _,_,Cy=sparseOT(bow_data[j], bow_data[j], cost_embeddings)
            P,Qx,Qy=UOT_W(a,b,C,lam=l,Cx=Cx,Cy=Cy,innerplan=True,solver="CLARABEL")
            plan_cost=np.sum(np.multiply(P,C))
            WMDuot1[i,j]=plan_cost
            WMDuot2[i,j]=plan_cost+l*(np.sum(Qx*Cx)+np.sum(Qy*Cy))
    WMDuot1 = WMDuot1 + WMDuot1.T
    WMDuot2 = WMDuot2 + WMDuot2.T
    np.savetxt("result/WordMoverDistances/UOTP_"+data_name+"_"+str(l)+".txt",WMDuot1) #Only with the plan
    np.savetxt("result/WordMoverDistances/UOT_"+data_name+"_"+str(l)+".txt",WMDuot2) #Plan and divergences

In [None]:
# Sinkhorn Algorithm
# Same two pairwise matrices as the linear-solver cell, but obtained with the
# project's `sinkhorn` routine (eps=3, 20 iterations), for every weight in lam:
#   WMDuot1: transport cost of the plan only, sum(P * C)
#   WMDuot2: plan cost plus l * (sum(Qx * Cx) + sum(Qy * Cy))
# Leftover debug prints (full matrices dumped for every document pair) were
# removed; only the per-document progress counter is kept.
lam=[.1,.5,1,10]
n_docs = bow_data.shape[0]
for l in lam:
    WMDuot1=np.zeros((n_docs,n_docs))
    WMDuot2=np.zeros((n_docs,n_docs))
    for i in range(n_docs):
        print(i, end=" ")  # progress: index of the document currently processed
        # Cost restricted to document i against itself; it does not depend
        # on j, so it is computed once per outer iteration.
        _,_,Cx=sparseOT(bow_data[i], bow_data[i], cost_embeddings)
        # Only the strict upper triangle is filled; it is mirrored afterwards.
        for j in range(i + 1, n_docs):
            a,b,C=sparseOT(bow_data[i], bow_data[j], cost_embeddings)
            _,_,Cy=sparseOT(bow_data[j], bow_data[j], cost_embeddings)
            # Fresh tensors are built for every call, so the solver never sees
            # a tensor that a previous call could have modified.
            P,Qx,Qy=sinkhorn(torch.tensor(a),torch.tensor(b),torch.tensor(C),lam=l,
                             eps=3,numiter=20,lam2=None,pen="sinkhorn",Cx=torch.tensor(Cx),
                             Cy=torch.tensor(Cy),numiter2=1,innerplan=True)
            P,Qx,Qy=P.numpy(),Qx.numpy(),Qy.numpy()
            plan_cost=np.sum(np.multiply(P,C))
            WMDuot1[i,j]=plan_cost
            WMDuot2[i,j]=plan_cost+l*(np.sum(Qx*Cx)+np.sum(Qy*Cy))
    WMDuot1 = WMDuot1 + WMDuot1.T
    WMDuot2 = WMDuot2 + WMDuot2.T
    np.savetxt("result/WordMoverDistances/UOTeP_"+data_name+"_"+str(l)+".txt",WMDuot1) #Only with the plan
    np.savetxt("result/WordMoverDistances/UOTe_"+data_name+"_"+str(l)+".txt",WMDuot2) #Plan and divergences