In [1]:
using LinearAlgebra
using DelimitedFiles

In [2]:
"""
    Model(file_name; limit=nothing)
    Model(labels, matrix, shape)

Table of labelled vectors: row `i` of `matrix` is the vector stored for `labels[i]`,
and `shape` is `(rows, columns)`.

The one-argument form reads a whitespace-separated text file. The first line holds
the number of rows and the number of columns; every following line holds a label
followed by `columns` numbers (the numbers are taken from the end of the line).

# Keywords
- `limit`: when given, read at most `limit` rows; `nothing` reads every row
  announced by the header.

# Throws
- `ArgumentError` if a row is missing (the file ends early) or holds fewer than
  `columns` fields.
"""
struct Model
    labels::Vector{String}
    matrix::Matrix{Float64}
    shape::Tuple{Int64, Int64}

    function Model(file_name; limit=nothing)
        # The do-block form closes the file even when parsing throws.
        row_labels, vectors = open(file_name, "r") do file
            rows, columns = parse.(Int64, split(readline(file)))
            rows = isnothing(limit) ? rows : min(rows, limit)

            values = Matrix{Float64}(undef, rows, columns)
            names = Vector{String}(undef, rows)

            for i in 1:rows
                fields = split(readline(file))
                if isempty(fields) || length(fields) < columns
                    throw(ArgumentError(
                        "$(file_name): row $(i) of $(rows) is missing or has fewer " *
                        "than $(columns) values",
                    ))
                end
                names[i] = fields[1]
                # Take the numbers from the end of the line so that anything between
                # the first field and the vector is ignored.
                values[i, :] .= parse.(Float64, fields[end-columns+1:end])
            end

            return names, values
        end

        return new(row_labels, vectors, size(vectors))
    end

    # Build a model directly from data that is already in memory.
    Model(labels, matrix, shape) = new(labels, matrix, shape)
end

In [3]:
"""
    Word(pt, en, es)

One entry of the word list: the same word written in three columns, `pt`, `en`
and `es` (presumably Portuguese, English and Spanish — confirm against the data).
"""
struct Word
    pt::String
    en::String
    es::String
end

"""
    loadWords(file_name)

Read `file_name` and return a `Vector{Word}` with one entry per non-blank line.
Each line must hold exactly three whitespace-separated fields, in the order
`pt en es`.

# Throws
- `ArgumentError` if a non-blank line does not have exactly three fields.
"""
function loadWords(file_name)
    words = Word[]

    for (lineno, line) in enumerate(eachline(file_name))
        fields = split(line)
        # Blank lines (typically a trailing newline) carry no entry.
        isempty(fields) && continue
        if length(fields) != 3
            throw(ArgumentError(
                "$(file_name):$(lineno): expected 3 fields (pt en es), " *
                "got $(length(fields))",
            ))
        end
        push!(words, Word(fields...))
    end

    return words
end

loadWords (generic function with 1 method)

In [4]:
"""
    cossineSimilarity(v1, v2)

Return the cosine similarity of `v1` and `v2`: their dot product divided by the
product of their norms.
"""
function cossineSimilarity(v1, v2)
    inner = dot(v1, v2)
    scale = norm(v1) * norm(v2)
    return inner / scale
end

"""
    getVector(model, word)

Return a copy of the row of `model.matrix` stored for the first label in
`model.labels` equal to `word`.

# Throws
- `KeyError(word)` if `word` is not among `model.labels`.
"""
function getVector(model, word)
    idx = findfirst(==(word), model.labels)
    # Without this check a missing word surfaces as "invalid index: nothing".
    isnothing(idx) && throw(KeyError(word))
    return model.matrix[idx, :]
end

"""
    rankSimilarity(model, v1; top=10)

Score every row of `model.matrix` by its cosine similarity to `v1` and return the
best matches as a vector of `(label, score)` tuples, highest score first.

# Keywords
- `top`: maximum number of results. When the model has fewer rows, all of them
  are returned.

# Throws
- `DimensionMismatch` if `length(v1)` differs from the number of columns of
  `model.matrix`.
"""
function rankSimilarity(model, v1; top=10)
    ncols = size(model.matrix, 2)
    if length(v1) != ncols
        throw(DimensionMismatch(
            "query vector has length $(length(v1)), model rows have length $(ncols)",
        ))
    end

    # Same formula as `cossineSimilarity`, with the norm of the query computed once
    # instead of once per row.
    v1norm = norm(v1)
    scores = [dot(v1, row) / (v1norm * norm(row)) for row in eachrow(model.matrix)]

    order = sortperm(scores; rev=true)
    best = order[1:min(top, length(order))]
    return [(model.labels[i], scores[i]) for i in best]
end

rankSimilarity (generic function with 1 method)

In [5]:
# Load at most `limit` rows from each embedding file, in the order pt-50, en-50, en-100.
limit = 100000
pt_50, en_50, en_100 = map(
    ("glove.pt.050.txt", "glove.en.050.txt", "glove.en.100.txt"),
) do path
    Model(path, limit=limit)
end
print("load model ok")

load model ok

In [6]:
# Look up the embedding of every entry of the "words" file in each loaded model.
words = loadWords("words")
words_pt = [w.pt for w in words]
words_en = [w.en for w in words]

# Stack the looked-up vectors as rows (one row per word). The sizes come from the
# data, so nothing here depends on the number of words or on the embedding dimension.
rows_pt_50 = permutedims(reduce(hcat, [getVector(pt_50, w) for w in words_pt]))
rows_en_50 = permutedims(reduce(hcat, [getVector(en_50, w) for w in words_en]))
# The 100-dimensional English vectors are kept as a plain matrix, not a Model.
wv_en_100 = permutedims(reduce(hcat, [getVector(en_100, w) for w in words_en]))

wv_pt_50 = Model(words_pt, rows_pt_50, size(rows_pt_50))
wv_en_50 = Model(words_en, rows_en_50, size(rows_en_50))
print("load words ok")

rankSimilarity(wv_pt_50, getVector(wv_pt_50, "mulher"))

load words ok

10-element Vector{Tuple{String, Float64}}:
 ("mulher", 0.9999999999999998)
 ("mãe", 0.9205584333066563)
 ("menina", 0.9108908290586678)
 ("irmã", 0.8617577003856278)
 ("homem", 0.8205636455727259)
 ("pai", 0.8042142912258528)
 ("menino", 0.7723450536537506)
 ("rainha", 0.722699940527399)
 ("casa", 0.7108168815910517)
 ("irmão", 0.6850926487026173)

In [8]:
# Generalized SVD of the two 50-dimensional word matrices (words along the columns).
U, V, Q, D1, D2, R0 = svd(copy(wv_pt_50.matrix'), copy(wv_en_50.matrix'))

# Zero the last four diagonal entries of D1.
for k in 47:50
    D1[k, k] = 0
end

# Rebuild a model from the modified factors and compare its ranking for "mulher".
copia = Model(words_pt, (D1*R0*Q')', size(wv_pt_50.matrix))
rankSimilarity(copia, getVector(copia, "mulher")), diag(D1)

([("mulher", 0.9999999999999998), ("mãe", 0.921394849264103), ("menina", 0.9072413361196477), ("irmã", 0.8617308557516151), ("homem", 0.8168369590975835), ("pai", 0.8015913258329228), ("menino", 0.7645058904757502), ("rainha", 0.7143938922961118), ("casa", 0.7104452058882259), ("irmão", 0.6852348958152135)], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0])

In [10]:
# Check whether the order of the two matrices matters, or whether transposing the
# result is enough. `wv_pt_50` and `wv_en_50` are Models, so the product is taken on
# their `matrix` fields.
U, S, V = LinearAlgebra.svd(transpose(wv_pt_50.matrix) * wv_en_50.matrix)
translator1 = V * transpose(U)

U, S, V = LinearAlgebra.svd(transpose(wv_en_50.matrix) * wv_pt_50.matrix)
translator2 = V * transpose(U)

# Result: the same; the difference is at the level of floating-point round-off.
norm(translator1 - transpose(translator2))

1.627173057367424e-13

In [11]:
# Using embeddings of different dimensions. `wv_en_100` is a plain matrix while
# `wv_pt_50` is a Model, hence the `.matrix` on the latter.
U, S, V = LinearAlgebra.svd(transpose(wv_en_100) * wv_pt_50.matrix)
translator3 = V * transpose(U)

# Which model scores best varies little; it seems to favor the 100-dimensional
# English model.
word = "queen"
result1a = rankSimilarity(pt_50, translator2 * getVector(en_50, word))[1]
result1b = rankSimilarity(pt_50, translator3 * getVector(en_100, word))[1]
println(result1a, result1b)

# Seems to favor the 50-dimensional English model.
word = "rainha"
result2a = rankSimilarity(en_50, transpose(translator2) * getVector(pt_50, word))[1]
result2b = rankSimilarity(en_100, transpose(translator3) * getVector(pt_50, word))[1]
println(result2a, result2b)

("rainha", 0.8498238306396301)("rainha", 0.7775425505991816)
("queen", 0.8498238306396297)("daughter", 0.7438290230631964)


In [12]:
# Using the generalized SVD.

# Transposed element-wise reciprocal of M, with zero entries kept as zero.
# Not called in this cell; `pinv` is used below instead.
pseudo_inversa(M) = [ M[i, j] == 0 ? 0 : 1/M[i,j] for j=1:size(M, 2), i=1:size(M, 1)]

# `wv_en_100` is a plain matrix while `wv_pt_50` is a Model, hence the `.matrix`.
U, V, Q, D1, D2, R0 = svd(copy(wv_en_100'), copy(wv_pt_50.matrix'))

rel_pro = U*Matrix(D1)*pinv(Matrix(D2))*V'
#rel_pro = V*Matrix(D2)*pinv(Matrix(D1))*U'

word = "parede"
pt_en = rankSimilarity(en_100, rel_pro * getVector(pt_50, word))

word = "wall"
en_pt = rankSimilarity(pt_50, rel_pro' * getVector(en_100, word))

println(pt_en)
println(en_pt)

[("onto", 0.5425498715449623), ("concrete", 0.5002466238329982), ("glass", 0.4962575911533338), ("visible", 0.48745301280668063), ("ground", 0.48518285156191865), ("window", 0.48458507857736405), ("roof", 0.48107733200523706), ("cross", 0.4757440657205746), ("landmark", 0.4739111107369525), ("walls", 0.47081290183146907)]
[("artificial", 0.31917010595572204), ("ifc", 0.3172014738614571), ("saatchi", 0.30751564465120657), ("facility", 0.29633417589381245), ("barreira", 0.2921342590739936), ("gigante", 0.2919276204870866), ("enorme", 0.29018642419862894), ("buraco", 0.28646906759287405), ("bilateral", 0.2850038254702518), ("uab", 0.27881141908699014)]


In [9]:
# Load two tables in full (no `limit`), one per file.
# Presumably BERT embeddings before and after fine-tuning, going by the file
# names — confirm against how the files were produced.
pre_trained = Model("bert_pre_trained.txt")
fine_tuned = Model("bert_fine_tuned.txt")

Model(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"  …  "13849", "13850", "13851", "13852", "13853", "13854", "13855", "13856", "13857", "13858"], [-0.846839309 0.316527665 … -0.119948894 -0.722243428; -1.33063745 0.0863740444 … -0.362909794 -0.958913624; … ; -1.29426849 -0.0411639214 … -0.531373501 -0.828964531; -1.40153825 -0.00174879329 … -0.506277561 -0.812443912], (13859, 768))

In [10]:
#corte = 1000
# Short aliases for the two matrices (one row per label).
ptr = pre_trained.matrix
ftr = fine_tuned.matrix

# Sanity check: matches the equivalent Python computation.
#U, S, V = LinearAlgebra.svd( ptr' * ftr)
#t = V * U'
#print( norm(t * pre_trained.matrix' - fine_tuned.matrix' ) )

# Trying the generalized SVD: two-argument `svd` on the adjoints of both matrices,
# destructured into its six factors.
U, V, Q, D1, D2, R0 = svd( copy(ptr'), copy(ftr') )

GeneralizedSVD{Float64, Matrix{Float64}, Float64, Vector{Float64}}
U factor:
768×768 Matrix{Float64}:
 -0.0157561   -0.00800074  …   0.0417602   -0.00757467  0.0340669
  0.0403759   -0.0510698      -0.00233342  -0.0256246   0.0327814
  0.0489229    0.0221008      -0.00945335   0.0304271   0.0347773
  0.0454499    0.0666006       0.0226363   -0.018633    0.0337839
  0.00893428  -0.0536798       0.0188663   -0.0290065   0.0325154
 -0.0743343   -0.0189231   …  -0.00621576   0.0423542   0.0339138
 -0.00994869  -0.0181473       0.0176232    0.0315896   0.0346315
 -0.0555386   -0.0177298       0.017741    -0.00894197  0.0343377
  0.0112      -0.0101952      -0.00603635  -0.00985477  0.0354194
  0.0422447   -0.0118826       0.0119449   -0.00617351  0.0337181
 -0.0245246   -0.0377828   …   0.00260209  -0.0240956   0.0344884
  0.00676223  -0.00886053      0.0331352    0.0155149   0.0340115
 -0.0468199    0.0393838       0.0213483   -0.0328872   0.0351048
  ⋮                        ⋱   ⋮        

In [23]:
#H = R0*Q'
#writedlm( "H.csv",  H, ' ')
#writedlm( "D1.csv",  D1, ' ')
# Lazy adjoints of the two matrices.
A = ptr'
B = ftr'

# `n` is assigned twice; the second assignment (column count of B) is the one kept.
m, n = size(A)
p, n = size(B)
# Rank of the two matrices stacked vertically, and rank of B alone.
r = rank([A; B])
l = rank(B)
# NOTE(review): the commented formula below has the opposite sign of the usual
# GSVD convention, where k + l is the rank of [A; B] (so k = r - l) — confirm.
#k = l-r
# Displayed as the cell result.
n, m, p, r, l, m-r+l

(13859, 768, 768, 1536, 768, 0)

In [49]:
#768x768

# Keep the last 1536 columns of R0 (columns 12324 through 13859).
# NOTE(review): both bounds are hard-coded; they match the sizes printed by an
# earlier cell (13859 columns, rank 1536) — confirm before reusing on other data.
R0_linha = R0[:,13859-1535:13859] #upper triangular (K+L x K+L), K+L=1536
rel_pro = D1 * R0_linha' * D2'
#rel_pro = R0_linha
#rel_pro = V * D2 * pinv(D1) * U'
#norm(rel_pro * pre_trained.matrix' - fine_tuned.matrix')
# Write the matrix as space-delimited text.
writedlm( "tradutor_julia.csv",  rel_pro, ' ')
#R0_linha

In [22]:
# NOTE(review): the product below is bound to `l_pro`, which nothing in this cell
# reads; the following lines use `rel_pro` as left by another cell. This looks like
# a truncated `rel_pro = ...` — confirm which assignment was intended.
#for i = 228:760
l_pro = V * D2 * pinv(D1) * U'
#el_pro = V * U'

# Write `rel_pro` as space-delimited text, then report the norm of the residual
# between the mapped pre-trained matrix and the fine-tuned one.
writedlm( "tradutor_julia.csv",  rel_pro, ' ')
norm(rel_pro * pre_trained.matrix' - fine_tuned.matrix' ) 

2988.241832616594

In [25]:
# Gram matrix of `rel_pro`; it is the identity exactly when the columns of
# `rel_pro` are orthonormal.
rel_pro' * rel_pro

768×768 Matrix{Float64}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0