In [1]:
using LinearAlgebra

In [2]:
"""
    Model(file_name; limit=nothing)

Table of labelled vectors loaded from a whitespace-separated text file.

The first line of the file holds two integers: the number of rows and the number of
columns. Every following line holds a label followed by numeric values; the last
`columns` tokens of the line are parsed as the vector and the first token is the label.

# Keywords
- `limit`: when not `nothing`, read at most this many rows.

# Fields
- `labels`: first token of each row that was read.
- `matrix`: one row of parsed `Float64` values per label.
- `shape`: `(rows, columns)` of `matrix`.

# Throws
- `ArgumentError` if a row is missing or has fewer than `columns` tokens.
"""
struct Model
    labels::Vector{String}
    matrix::Matrix{Float64}
    shape::Tuple{Int64, Int64}

    function Model(file_name; limit=nothing)
        # The do-block form closes the file even when parsing throws.
        labels, matrix = open(file_name, "r") do file
            rows, columns = parse.(Int64, split(readline(file)))
            rows = isnothing(limit) ? rows : min(rows, limit)

            mat = Matrix{Float64}(undef, rows, columns)
            lbls = Vector{String}(undef, rows)

            for i in 1:rows
                fields = split(readline(file))
                # `readline` returns "" at end of file, so a truncated file shows up here.
                if isempty(fields) || length(fields) < columns
                    throw(ArgumentError(
                        "row $i of \"$file_name\" has $(length(fields)) fields; " *
                        "expected a label and $columns values"))
                end
                lbls[i] = fields[1]
                # Take the trailing `columns` tokens so extra tokens after the label
                # do not shift the numeric values.
                mat[i, :] .= parse.(Float64, fields[end-columns+1:end])
            end

            return lbls, mat
        end

        return new(labels, matrix, size(matrix))
    end

end

In [3]:
"""
    Word(pt, en, es)

One entry of the word list: three strings stored in the fields `pt`, `en` and `es`.
"""
struct Word
    pt::String
    en::String
    es::String
end

"""
    loadWords(file_name)

Read `file_name` line by line and return a `Vector{Word}`.

Each non-blank line must contain exactly three whitespace-separated tokens, which
become the `pt`, `en` and `es` fields in that order. Blank lines are skipped.

# Throws
- `ArgumentError` if a non-blank line does not have exactly three tokens.
"""
function loadWords(file_name)
    words = Word[]

    for (lineno, line) in enumerate(eachline(file_name))
        fields = split(line)
        # A trailing newline or spacer line would otherwise call `Word()` with no arguments.
        isempty(fields) && continue
        length(fields) == 3 || throw(ArgumentError(
            "line $lineno of \"$file_name\" has $(length(fields)) fields; expected 3"))
        push!(words, Word(fields...))
    end

    return words
end

loadWords (generic function with 1 method)

In [4]:
"""
    cossineSimilarity(v1, v2)

Return the dot product of `v1` and `v2` divided by the product of their norms,
i.e. the cosine of the angle between the two vectors.
"""
function cossineSimilarity(v1, v2)
    inner = dot(v1, v2)
    scale = norm(v1) * norm(v2)
    return inner / scale
end

"""
    getVector(model, word)

Return a copy of the row of `model.matrix` at the position where `word` first
appears in `model.labels`.

# Throws
- `KeyError` if `word` is not present in `model.labels`.
"""
function getVector(model, word)
    idx = findfirst(==(word), model.labels)
    # `findfirst` yields `nothing` on a miss; fail with a clear error instead of
    # letting the matrix indexing below raise an unrelated-looking one.
    isnothing(idx) && throw(KeyError(word))
    return model.matrix[idx, :]
end

"""
    rankSimilarity(model, v1; top=10)

Rank the rows of `model.matrix` by cosine similarity to `v1`.

Returns a vector of `(label, similarity)` tuples ordered from most to least similar,
holding at most `top` entries (fewer when the model has fewer rows).

# Throws
- `DimensionMismatch` if `length(v1)` differs from the number of columns of
  `model.matrix`.
"""
function rankSimilarity(model, v1; top=10)
    ncols = size(model.matrix, 2)
    # Check once up front rather than catching a failure on every row.
    length(v1) == ncols || throw(DimensionMismatch(
        "query vector has length $(length(v1)); model rows have length $ncols"))

    scores = [cossineSimilarity(v1, row) for row in eachrow(model.matrix)]

    # `sortperm` keeps equal scores in ascending row order, like a stable sort of
    # (index, score) pairs would.
    order = sortperm(scores; rev=true)

    # `first(order, top)` tolerates `top` larger than the number of rows.
    return [(model.labels[i], scores[i]) for i in first(order, top)]
end

rankSimilarity (generic function with 1 method)

In [5]:
# Upper bound on the number of rows read from each embedding file.
limit = 100000
# Word list read from the file named "words" in the working directory.
words = loadWords("words")
# Three embedding tables, each truncated to `limit` rows.
# NOTE(review): file names suggest Portuguese/English GloVe vectors with 50 and 100
# dimensions — confirm against the actual files.
pt_50 = Model("glove.pt.050.txt", limit=limit)
en_50 = Model("glove.en.050.txt", limit=limit)
en_100 = Model("glove.en.100.txt", limit=limit)
print("load model ok")

load model ok

In [6]:
# For every entry of `words`, look up its vector in each model and stack the vectors
# as rows (row j corresponds to words[j]). `reduce(hcat, ...)` places each vector in a
# column and `transpose` turns columns into rows, so the matrix sizes follow from the
# data instead of being hard-coded to a specific word count and dimension.
wv_pt_50 = transpose(reduce(hcat, [getVector(pt_50, w.pt) for w in words]))
wv_en_50 = transpose(reduce(hcat, [getVector(en_50, w.en) for w in words]))
wv_en_100 = transpose(reduce(hcat, [getVector(en_100, w.en) for w in words]))

print("load words ok")

load words ok

In [7]:
# Checking whether the order of the operands matters, or whether taking the
# transpose of the result is enough.
# translator1 is built from the SVD of transpose(wv_pt_50) * wv_en_50.
U, S, V = LinearAlgebra.svd( transpose(wv_pt_50) * wv_en_50)
translator1 = V * transpose(U)

# translator2 is built the same way with the two word matrices swapped.
U, S, V = LinearAlgebra.svd( transpose(wv_en_50) * wv_pt_50)
translator2 = V * transpose(U)

# Result: equal; the difference is on the order of the 15th decimal place
# (machine rounding error).
translator1 - transpose(translator2)

50×50 Matrix{Float64}:
  6.05072e-15   9.85323e-16   7.42462e-16  …   1.95677e-15   3.06547e-15
 -8.04912e-16   4.2466e-15   -3.9968e-15      -8.19657e-16  -8.67362e-17
 -5.96745e-15   4.82947e-15  -4.71845e-16     -9.49826e-15   4.16334e-16
  1.27676e-15  -4.08007e-15   9.00321e-16      2.41474e-15   8.74301e-16
 -4.30211e-16  -6.45317e-15   1.94289e-15      5.85643e-15  -2.82413e-15
 -2.44249e-15  -4.87024e-16   3.38618e-15  …  -3.46945e-15  -2.05391e-15
  4.44089e-16   7.34135e-15   3.40006e-16     -1.39055e-14  -1.38778e-15
 -1.66533e-15  -2.33147e-15   8.90954e-15     -5.90847e-15  -3.68455e-15
  5.27356e-16   3.10169e-15   2.158e-15       -3.16414e-15  -1.33227e-15
 -3.07133e-15  -7.05859e-15   1.22125e-15      9.74221e-15  -9.99201e-16
 -4.77049e-16   4.30211e-15   1.25594e-15  …  -1.20182e-14  -1.38778e-15
 -3.83027e-15   3.05311e-15  -1.11022e-15     -2.77903e-15  -1.16573e-15
  3.747e-15     6.10623e-16   8.74301e-16     -3.45991e-15   4.996e-16
  ⋮                           

In [8]:
# Using different dimensions: translator3 is built from the 100-column English
# matrix and the 50-column Portuguese matrix.
U, S, V = LinearAlgebra.svd( transpose(wv_en_100) * wv_pt_50)
translator3 = V * transpose(U)

# English -> Portuguese: top-ranked pt_50 label for the mapped English vector,
# using the 50-dim (a) and 100-dim (b) English models.
word = "floor" # which one gets the best score varies little; seems to favor the 100-dimensional en model
result1a = rankSimilarity(pt_50, translator2 * getVector(en_50, word))[1]
result1b = rankSimilarity(pt_50, translator3 * getVector(en_100, word))[1]
println(result1a, result1b)

# Portuguese -> English: the transposed translators are applied to the pt_50 vector.
word = "chão" # seems to favor the 50-dimensional en model
result2a = rankSimilarity(en_50, transpose(translator2) * getVector(pt_50, word))[1]
result2b = rankSimilarity(en_100, transpose(translator3) * getVector(pt_50, word))[1]
println(result2a, result2b)

("debaixo", 0.863570322804008)("chão", 0.8406395466126464)
("floor", 0.8196048072225328)("floor", 0.7891998045017863)


In [97]:
# Using the generalized SVD of the two (materialized) transposed word matrices.
U, V, Q, D1, D2, R0 = svd(copy(transpose(wv_en_50)), copy(transpose(wv_pt_50)))
# Keep columns 51 onward of D2 and the first 50 columns of D1.
# NOTE(review): the 50/51 split is hard-coded; presumably it matches the 50-dimensional
# embeddings used above — confirm before reusing with other sizes.
D2 = D2[:, 51:end]
D1 = D1[:, 1:50]

# Displayed as the cell result.
(D1 / D2)
# Experimental follow-up, left disabled:
#translator4 = (U * (D1 / D') * U') 
#word = "girl" 
#rankSimilarity(pt_50, transpose(translator4) * getVector(en_50, word))

50×50 SparseArrays.SparseMatrixCSC{Float64, Int64} with 50 stored entries:
⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑⢄⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠑