In [1]:
using LinearAlgebra
using DelimitedFiles

In [2]:
"""
    Model(file_name; limit=nothing)

Table of labelled vectors read from a whitespace-separated text file.

The first line of the file holds two integers: the number of rows and the number of
columns. Every following line holds a label followed by the vector components. The
first token of the line is stored as the label and the last `columns` tokens are
parsed as `Float64`.

# Fields
- `labels`: label of each loaded row, in file order.
- `matrix`: matrix with one vector per row.
- `shape`: `(rows, columns)` of the loaded matrix.

# Keywords
- `limit`: when not `nothing`, at most `limit` rows are read.
"""
struct Model
    labels::Vector{String}
    matrix::Matrix{Float64}
    shape::Tuple{Int64, Int64}

    function Model(file_name; limit=nothing)
        # The do-block form closes the file even when parsing throws.
        names, vectors = open(file_name, "r") do file
            rows, columns = parse.(Int64, split(readline(file)))
            rows = limit === nothing ? rows : min(rows, limit)

            values = Array{Float64}(undef, rows, columns)
            tags = Array{String}(undef, rows)

            for i in 1:rows
                line = split(readline(file))
                tags[i] = line[1]
                # Only the last `columns` tokens of the line are parsed as numbers.
                values[i, :] .= parse.(Float64, line[end-columns+1:end])
            end

            tags, values
        end

        # `new` is called outside the closure; it is only valid directly in the
        # inner constructor body.
        return new(names, vectors, size(vectors))
    end

end

In [3]:
"""
    Word(pt, en, es)

One entry of the word list, holding three strings.

Elsewhere in this file `pt` is looked up in the Portuguese model and `en` in the
English models. `es` is presumably the Spanish form; it is not used in the visible
code (TODO confirm).
"""
struct Word
    pt::String
    en::String
    es::String
end

"""
    loadWords(file_name)

Read a word list from `file_name` and return it as a `Vector{Word}`.

Each non-blank line must hold exactly three whitespace-separated tokens, which become
the `pt`, `en` and `es` fields of a `Word`, in that order. Blank lines are skipped.

# Throws
- `ArgumentError` if a non-blank line does not hold exactly three tokens.
"""
function loadWords(file_name)
    words = Word[]

    for (line_number, line) in enumerate(eachline(file_name))
        fields = split(line)
        # A blank (for example trailing) line carries no entry; skip it instead of
        # failing on a zero-argument `Word()` call.
        isempty(fields) && continue
        length(fields) == 3 || throw(ArgumentError(
            "$file_name:$line_number: expected 3 fields, got $(length(fields))"))
        push!(words, Word(fields...))
    end

    return words
end

loadWords (generic function with 1 method)

In [4]:
"""
    cossineSimilarity(v1, v2)

Return the cosine of the angle between `v1` and `v2`: their dot product divided by the
product of their Euclidean norms. The result is `NaN` when either vector has zero norm.
"""
function cossineSimilarity(v1, v2)
    numerator = dot(v1, v2)
    denominator = norm(v1) * norm(v2)
    return numerator / denominator
end

"""
    getVector(model, word)

Return a copy of the row of `model.matrix` whose label in `model.labels` equals `word`.
When the label occurs more than once, the first occurrence is used.

# Throws
- `ArgumentError` if `word` is not present in `model.labels`.
"""
function getVector(model, word)
    idx = findfirst(==(word), model.labels)
    # `findfirst` returns `nothing` on a miss; indexing with it would only raise
    # "invalid index: nothing", which hides which word was missing.
    idx === nothing &&
        throw(ArgumentError("word $(repr(word)) not found in model labels"))
    return model.matrix[idx, :]
end

"""
    rankSimilarity(model, v1; top=10)

Rank the rows of `model.matrix` by cosine similarity to `v1` and return the best `top`
matches as a vector of `(label, similarity)` tuples, most similar first. When the model
has fewer than `top` rows, all rows are returned.

# Throws
- `DimensionMismatch` if `length(v1)` differs from the number of columns of
  `model.matrix`.
"""
function rankSimilarity(model, v1; top=10)
    ncols = size(model.matrix, 2)
    # Fail once, up front, with a clear message instead of once per row.
    length(v1) == ncols || throw(DimensionMismatch(
        "query vector has length $(length(v1)), model vectors have length $ncols"))

    # A comprehension gives a concretely typed vector instead of `Any[]`.
    rank = [
        (idx, cossineSimilarity(v1, v2)) for
        (idx, v2) in enumerate(eachrow(model.matrix))
    ]

    sort!(rank, by=r->r[2], rev=true)
    # Clamp so a small model does not raise a BoundsError.
    return [(model.labels[i], r) for (i, r) in rank[1:min(top, length(rank))]]
end

rankSimilarity (generic function with 1 method)

In [5]:
# Load the word list and the three embedding files, reading at most `limit` rows of each.
limit = 100_000
words = loadWords("words")
pt_50 = Model("glove.pt.050.txt"; limit=limit)
en_50 = Model("glove.en.050.txt"; limit=limit)
en_100 = Model("glove.en.100.txt"; limit=limit)
print("load model ok")

load model ok

In [6]:
# Look up every entry of `words` in each model and stack the vectors so that each
# matrix has one row per word, in the same order as `words`.
# The shapes come from the data itself, so the cell keeps working when the word list
# or the model dimensions change (no hard-coded word count or vector length).
wv_pt_50 = transpose(reduce(hcat, [getVector(pt_50, w.pt) for w in words]))
wv_en_50 = transpose(reduce(hcat, [getVector(en_50, w.en) for w in words]))
wv_en_100 = transpose(reduce(hcat, [getVector(en_100, w.en) for w in words]))

print("load words ok")

load words ok

In [7]:
# Check whether the argument order matters or whether transposing the result is enough.
U, S, V = svd(transpose(wv_pt_50) * wv_en_50)
translator1 = V * transpose(U)

U, S, V = svd(transpose(wv_en_50) * wv_pt_50)
translator2 = V * transpose(U)

# Result: the same; the recorded norm of the difference is about 2e-13, i.e.
# floating-point round-off.
norm(translator1 - transpose(translator2))

2.0846459779133275e-13

In [8]:
# Using models with different dimensions.
U, S, V = svd(transpose(wv_en_100) * wv_pt_50)
translator3 = V * transpose(U)

# Little variation in which one scores best; seems to favor the 100-dimensional
# English model.
word = "queen"
result1a = rankSimilarity(pt_50, translator2 * getVector(en_50, word))[1]
result1b = rankSimilarity(pt_50, translator3 * getVector(en_100, word))[1]
println(result1a, result1b)

# Seems to favor the 50-dimensional English model.
word = "rainha"
result2a = rankSimilarity(en_50, transpose(translator2) * getVector(pt_50, word))[1]
result2b = rankSimilarity(en_100, transpose(translator3) * getVector(pt_50, word))[1]
println(result2a, result2b)

("rainha", 0.8498238306396296)("rainha", 0.7775425505991818)
("queen", 0.8498238306396301)("daughter", 0.7438290230631963)


In [17]:
# Using the generalized SVD.
# `pseudo_inversa(M)` builds a size(M, 2) × size(M, 1) array whose entry for (j, i) is
# 1 / M[i, j], or 0 where M[i, j] == 0. It is defined here but not called in this cell.
pseudo_inversa(M) = [ M[i, j] == 0 ? 0 : 1/M[i,j] for j=1:size(M, 2), i=1:size(M, 1)]
# The two-argument `svd` is called on dense copies of the adjoints of the word matrices.
U, V, Q, D1, D2, R0 = svd(copy(wv_en_50'), copy(wv_pt_50'))

# Candidate linear map built from the factors.
# NOTE(review): the recorded output of this cell shows NaN for every similarity, so
# this product presumably contains non-finite values — confirm before relying on it.
rel_pro = U*Matrix(D1)*pinv(Matrix(D2))*V'
# Alternative with the roles of the two inputs swapped (disabled):
#rel_pro = V*Matrix(D2)*pinv(Matrix(D1))*U'

# Map a Portuguese vector and rank it against the English model.
word = "porta"
pt_en =  rankSimilarity(en_50, rel_pro * getVector(pt_50, word))

# Map an English vector with the adjoint and rank it against the Portuguese model.
word = "door"
en_pt = rankSimilarity(pt_50, rel_pro' * getVector(en_50, word))

println(pt_en)
println(en_pt)

[("the", NaN), (",", NaN), (".", NaN), ("of", NaN), ("to", NaN), ("and", NaN), ("in", NaN), ("a", NaN), ("\"", NaN), ("'s", NaN)]
[(",", NaN), ("de", NaN), (".", NaN), ("a", NaN), ("o", NaN), ("e", NaN), ("que", NaN), ("do", NaN), ("da", NaN), ("em", NaN)]


In [8]:
# Load the two tables in full (no `limit`). The value of the last assignment is
# what the notebook displays below.
pre_trained = Model("bert_pre_trained.txt")
fine_tuned = Model("bert_fine_tuned.txt")

Model(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"  …  "13849", "13850", "13851", "13852", "13853", "13854", "13855", "13856", "13857", "13858"], [-0.846839309 0.316527665 … -0.119948894 -0.722243428; -1.33063745 0.0863740444 … -0.362909794 -0.958913624; … ; -1.29426849 -0.0411639214 … -0.531373501 -0.828964531; -1.40153825 -0.00174879329 … -0.506277561 -0.812443912], (13859, 768))

In [11]:
corte = 1000
ptr = pre_trained.matrix
ftr = fine_tuned.matrix

# Sanity check: matches the equivalent computation done in Python.
U, S, V = svd(ptr' * ftr)
t = V * U'
print( norm(t * ptr' - ftr') )

# Trying the generalized SVD.
U, V, Q, D1, D2, R0 = svd(copy(ptr'), copy(ftr'))

1178.783150618428

GeneralizedSVD{Float64, Matrix{Float64}}
U factor:
768×768 Matrix{Float64}:
 -0.0157561   -0.00800074  …   0.0417602   -0.00757467  0.0340669
  0.0403759   -0.0510698      -0.00233342  -0.0256246   0.0327814
  0.0489229    0.0221008      -0.00945335   0.0304271   0.0347773
  0.0454499    0.0666006       0.0226363   -0.018633    0.0337839
  0.00893428  -0.0536798       0.0188663   -0.0290065   0.0325154
 -0.0743343   -0.0189231   …  -0.00621576   0.0423542   0.0339138
 -0.00994869  -0.0181473       0.0176232    0.0315896   0.0346315
 -0.0555386   -0.0177298       0.017741    -0.00894197  0.0343377
  0.0112      -0.0101952      -0.00603635  -0.00985477  0.0354194
  0.0422447   -0.0118826       0.0119449   -0.00617351  0.0337181
 -0.0245246   -0.0377828   …   0.00260209  -0.0240956   0.0344884
  0.00676223  -0.00886053      0.0331352    0.0155149   0.0340115
 -0.0468199    0.0393838       0.0213483   -0.0328872   0.0351048
  ⋮                        ⋱   ⋮                        
 -0.04296

In [12]:
# Leftover, disabled lines from an earlier experiment:
#for i = 228:760
#rel_pro = V *D2 * pinv(D1) * U'
# Build a linear map from the factors U, V, D1, D2.
# NOTE(review): these are presumably the generalized-SVD factors assigned in the
# previous cell — confirm the execution order.
rel_pro = V * D2 * pinv(D1) * U'

# Write the map to "tradutor_julia.csv" with a single space as the delimiter.
writedlm( "tradutor_julia.csv",  rel_pro, ' ')
# Residual of mapping the pre-trained vectors onto the fine-tuned ones; the recorded
# value is about 2672.7.
norm(rel_pro * pre_trained.matrix' - fine_tuned.matrix' ) 

2672.731045631968