# Download the IMDB Dataset

In [18]:
function pretty_print_review_and_label(i)
    println(labels[i] * "\t\t" * reviews[i][:80]*"...")
end
g = open("reviews.txt", "r");
reviews = map(x -> x[1:end-1], readlines(g))
close(g)

g = open("labels.txt", "r")
labels = map(x -> x[1:end-1], readlines(g))
close(g)

# Capturing Word Correlation in Input Data

In [19]:
onehots = Dict()
onehots["cat"] = [1,0,0,0]
onehots["the"] = [0,1,0,0]
onehots["dog"] = [0,0,1,0]
onehots["sat"] = [0,0,0,1]

sentence = ["the","cat","sat"]
x = onehots[sentence[1]] +
    onehots[sentence[2]] +
    onehots[sentence[3]]

println("Sent Encoding:" ,x)

Sent Encoding:[1, 1, 0, 1]


# Predicting Movie Reviews

In [26]:
f = open("reviews.txt")
raw_reviews = readlines(f)
close(f)

f = open("labels.txt")
raw_labels = readlines(f)
close(f)

tokens = collect(Set(map(x -> split(x, " "), raw_reviews)))

vocab = Set()
for sent in tokens
    for word in sent
        if length(word)>0
            push!(vocab, word)
        end
    end
end
vocab = collect(vocab)

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end

input_dataset = []
for sent in tokens
    sent_indices = []
    for word in sent
        try
            push!(sent_indices, word2index[word])
        catch
            nothing
        end
    end
    push!(input_dataset, sent_indices)
end

target_dataset = []
for label in raw_labels
    if label == "positive"
        push!(target_dataset, 1)
    else
        push!(target_dataset, 0)
    end
end

In [94]:
using Random: seed!
seed!(1);

sigmoid(x) = 1/(1 + exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2 .* rand(hidden_size, length(vocab)) .- 0.1
weights_1_2 = 0.2 .* rand(1, hidden_size) .- 0.1

correct,total = (0,0)

for iter=1:iterations
    global correct,total
    
    for i=1:length(input_dataset)-1000
        x,y = (input_dataset[i],target_dataset[i])
        layer_1 = sigmoid.(sum(weights_0_1[:,x]; dims=2)) #embed + sigmoid
        layer_2 = sigmoid.(weights_1_2 * layer_1) # linear + softmax

        layer_2_delta = layer_2[1] - y # compare pred with truth
        layer_1_delta = weights_1_2' * layer_2_delta #backprop
        layer_2_delta .* layer_1
        weights_0_1[:,x] .-= layer_1_delta .* alpha
        weights_1_2 .-= layer_2_delta .* layer_1' .* alpha
        
        if abs(layer_2_delta) < 0.5
            correct += 1
        end
        total += 1
        
        if (i%10 == 9)
            progress = string(i/length(input_dataset))
            print("Iter: $(iter) Progress: $(progress[3:4]).$(progress[5:6])% Training Accuracy: $(correct/total)% \r")
        end
    end
    println()
end




correct,total = (0,0)
for i=length(input_dataset)-1000+1:length(input_dataset)
    global correct,total
    x = input_dataset[i]
    y = target_dataset[i]
    
    layer_1 = sigmoid.(sum(weights_0_1[:,x]; dims=2))
    layer_2 = sigmoid.(weights_1_2 * layer_1)
    
    if abs(layer_2[1] - y) < 0.5
        correct += 1
    end
    total += 1 
end

println("Test Accuracy: $(correct / total)")

Iter: 1 Progress: 95.97% Training Accuracy: 0.447801163228587%   
Iter: 2 Progress: 95.97% Training Accuracy: 0.4768310286395682%  
Test Accuracy: 0.504


In [95]:
tokens[1]

168-element Array{SubString{String},1}:
 "ok"
 "now"
 ""
 "lets"
 "see"
 "."
 "what"
 "was"
 "funny"
 "in"
 "the"
 "first"
 "movie"
 ⋮
 "out"
 "of"
 "ten"
 "since"
 "it"
 "has"
 "some"
 "funny"
 "parts"
 "."
 ""
 ""

# Comparing Word Embeddings

In [129]:
function similar(target = "beautiful")
    target_index = word2index[target]
    scores = Dict()
    for (word,index) in word2index
        raw_difference = weights_0_1[:,index] .- (weights_0_1[:,target_index])
        squared_difference = raw_difference .* raw_difference
        scores[word] = -sqrt(sum(squared_difference))
    end
    scores = sort(collect(scores), by = x -> x[2])
    return scores[end-10:end]
end

similar (generic function with 2 methods)

In [130]:
print(similar("beautiful"))

Pair{Any,Any}["filmy" => -0.6606711188723169, "confab" => -0.6600965247133369, "cmara" => -0.6600060084860495, "parameters" => -0.6591945678386412, "theron" => -0.6576485630760832, "faithful" => -0.6561845107874287, "deluders" => -0.6555951081367472, "diwana" => -0.6501706963541887, "alliances" => -0.6497885466271454, "brasseur" => -0.6434690598261698, "beautiful" => -0.0]

In [131]:
print(similar("terrible"))

Pair{Any,Any}["fangs" => -0.666296956359614, "sleazier" => -0.6646779152677983, "regrettable" => -0.6624437193693834, "abortion" => -0.6613846456145038, "ghoulies" => -0.6611451961709435, "inveighing" => -0.6547246882828751, "kkk" => -0.6545768734779763, "sumo" => -0.6449526409110887, "disinherit" => -0.6203307006148524, "benedick" => -0.6051667690036869, "terrible" => -0.0]

In [153]:
using Random: seed!, shuffle!
using Statistics: mean
seed!(1)

f = open("reviews.txt")
raw_reviews = readlines(f)
close(f)

tokens = collect(Set(map(x -> split(x, " "), raw_reviews)))

vocab = Set()
for sent in tokens
    for word in sent
        push!(vocab, word)
    end
end
vocab = collect(vocab)
pushfirst!(vocab, "")

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end


concatenated = []
input_dataset = []

for sent in tokens
    sent_indices = []
    for word in sent
        try
            push!(sent_indices, word2index[word])
            push!(concatenated, word2index[word])
        catch
            nothing
        end
    end
    push!(input_dataset, sent_indices)
end
shuffle!(input_dataset);

In [None]:
alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)

weights_0_1 = (rand(hidden_size, length(vocab)) .- 0.5) .* 0.2
weights_1_2 = zeros(hidden_size, length(vocab))

layer_2_target = zeros(negative+1)
layer_2_target[1] = 1

function similar(target = "beautiful")
    target_index = word2index[target]
    scores = Dict()
    for (word,index) in word2index
        raw_difference = weights_0_1[:,index] .- (weights_0_1[:,target_index])
        squared_difference = raw_difference .* raw_difference
        scores[word] = -sqrt(sum(squared_difference))
    end
    scores = sort(collect(scores), by = x -> x[2])
    return scores[end-10:end]
end

sigmoid(x) = 1/(1 + exp(-x))

for (rev_i,review) in enumerate(repeat(input_dataset, iterations))
    for target_i=1:length(review)
    # since it's really expensive to predict every vocabulary
    # we're only going to predict a random subset  
        target_samples = cat([review[target_i]],
            concatenated[floor.(Int, rand(negative) .* length(concatenated))];dims=1)
        
        left_context = review[maximum([1,target_i-window]):target_i-1]
        right_context = review[target_i+1:minimum([length(review),target_i+window])]
        
        layer_1 = mean(weights_0_1[:,cat(left_context,right_context;dims=1)];dims=2)
        layer_2 = sigmoid.(weights_1_2[:,target_samples]' * layer_1)
        
        layer_2_delta = layer_2 .- layer_2_target
        layer_1_delta = weights_1_2[:,target_samples] * layer_2_delta
        
        weights_0_1[:,cat(left_context,right_context;dims=1)] .-= layer_1_delta .* alpha
        weights_1_2[:,target_samples] .-= layer_2_delta' .* layer_1 .* alpha
    end
    if ((rev_i-1)%250 ==0)
        progress = string(rev_i*iterations/length(input_dataset))
        print("Iter: $(rev_i) Progress: $(progress[3:4]).$(progress[5:6])% $(similar("terrible")) \r")
    end
end
println()
print(similar("terrible"))

# King - Man + Woman ~= Queen

In [184]:
function analogy(positive=["terrible","good"],negative=["bad"])
    norms = sum(weights_0_1 .* weights_0_1;dims=1)
    normed_weights = weights_0_1 .* norms
    
    query_vect = zeros(length(weights_0_1[:,1]))
    for word in positive
        query_vect .+= normed_weights[:,word2index[word]]
    end
    for word in negative
        query_vect .-= normed_weights[:,word2index[word]]
    end
    
    scores = Dict()
    for (word,index) in word2index
        raw_difference = weights_0_1[:,index] .- query_vect
        squared_difference = raw_difference .* raw_difference
        scores[word] = -sqrt(sum(squared_difference))
    end
    scores = sort(collect(scores), by = x -> x[2])
    return scores[end-10:end]
    
end

analogy (generic function with 3 methods)

In [None]:
analogy(["terrible","good"],["bad"])

In [None]:
analogy(["elizabeth","he"],["she"])