In [1]:
using CSV, DataFrames
using Random
using Base.Iterators: partition
import StatsBase.sample, StatsBase.Weights
using Flux
using Flux: onehot, onecold, onehotbatch
using Flux: crossentropy, throttle, params
using Zygote
using BSON, JLD2, Statistics
using CUDA

In [2]:
# Choose the compute device once: the GPU is used only when it was requested
# and CUDA reports a working installation; otherwise everything stays on the CPU.
use_cuda = true
gpu_available = use_cuda && CUDA.functional()
device = gpu_available ? gpu : cpu
if gpu_available
    @info "Training on GPU"
else
    @info "Training on CPU"
end

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining on GPU


In [52]:

# Load the data: read the review CSV into a DataFrame.

data_file = "reviews_cleaner.csv"
df = CSV.read(data_file, DataFrame)
# Raw review strings taken from the `text` column, one entry per row.
texts = df.text


6990278-element Vector{String}:
 "I've taken a lot of spin classe"[93m[1m ⋯ 768 bytes ⋯ [22m[39m"'s kicking your butt in class!"
 "Family diner. Had the buffet. E"[93m[1m ⋯ 279 bytes ⋯ [22m[39m"ns. Next to the Clarion Hotel."
 "Wow!  Yummy, different,  delici"[93m[1m ⋯ 182 bytes ⋯ [22m[39m"new!   You'll be glad you did!"
 "Cute interior and owner (?) gav"[93m[1m ⋯ 473 bytes ⋯ [22m[39m"ll try one of the draft wines."
 "I am a long term frequent custo"[93m[1m ⋯ 280 bytes ⋯ [22m[39m". NEVER going back to dmitris!"
 "Loved this tour! I grabbed a gr"[93m[1m ⋯ 743 bytes ⋯ [22m[39m" my favorite parts of my trip!"
 "Amazingly amazing wings and hom"[93m[1m ⋯ 131 bytes ⋯ [22m[39m" checking out this hidden gem."
 "This easter instead of going to"[93m[1m ⋯ 465 bytes ⋯ [22m[39m"ke they put a bit more effort."
 "Had a party of 6 here for hibac"[93m[1m ⋯ 463 bytes ⋯ [22m[39m" the money I wouldn't go back."
 "My experience with Shalimar was"[93m[1m ⋯ 948 bytes ⋯ 

In [None]:


"""
    preprocess_text(text)

Lowercase `text`, replace every run of punctuation characters with a single
space, and return the whitespace-separated tokens.
"""
preprocess_text(text) = split(replace(lowercase(text), r"[[:punct:]]+" => " "))

# Tokenise every review; each element of `sentences` is one review's token vector.
sentences = [preprocess_text(text) for text in texts]

# Corpus-wide token counts. The sentences are iterated directly instead of being
# splatted into one giant `vcat(sentences...)` call.
freqs = Dict{String, Int}()
for sentence in sentences, t in sentence
    freqs[t] = get(freqs, t, 0) + 1
end

# Replace singleton tokens with an "unknown" marker. `freqs` still holds the
# pre-replacement counts while this loop runs, so every position is judged by
# the token it originally contained.
n_singletons = count(==(1), values(freqs))
for sentence in sentences
    for i in eachindex(sentence)
        if get(freqs, sentence[i], 0) == 1
            sentence[i] = "UNK"
        end
    end
end

# Bring `freqs` in line with the rewritten corpus: the singleton entries no
# longer occur anywhere, and "UNK" now occurs once per replaced token. Leaving
# "UNK" at 0 would give it a subsampling probability of -Inf and a sampling
# weight of 0, so it could never be trained or drawn as a negative sample.
filter!(entry -> last(entry) > 1, freqs)
freqs["UNK"] = n_singletons

# Flat token stream and vocabulary (in order of first occurrence) after the
# "UNK" replacement.
tokens = collect(Iterators.flatten(sentences))
alphabet = unique(tokens)

# Subsampling: each vocabulary word is kept as a training centre with
# probability 1 - sqrt(1e-5 / f), where f is its relative corpus frequency.
# NOTE(review): in the usual word2vec formulation 1 - sqrt(t / f) is the
# *discard* probability; as written, frequent words are kept and rare ones
# dropped. Behaviour is left unchanged here — confirm the intent.
corpus_size = length(tokens)
train_alphabet = [word for word in alphabet if rand() < (1 - sqrt(1e-5 / (freqs[word] / corpus_size)))]

"""
    parse_sentences!(sentences, K, alphabet)

Collect, for every token of `alphabet`, the distinct tokens that occur within
`K` positions of it in any sentence, in order of first occurrence.

# Arguments
- `sentences`: collection of token vectors (not modified).
- `K`: context radius; positions `i-K:i+K` around position `i` are inspected,
  excluding `i` itself and anything outside the sentence.
- `alphabet`: vector of tokens to build contexts for. It is filtered **in
  place** so that only tokens with at least one context word remain.

# Returns
A `Dict` mapping each remaining token to the vector of its context words.
Context words are recorded even when they are not themselves in `alphabet`.
"""
function parse_sentences!(sentences, K, alphabet)
    C = eltype(eltype(sentences))
    context_words = Dict(token => C[] for token in alphabet)
    # One set per token for O(1) duplicate detection, instead of re-running
    # `unique!` over the whole context list after every single push.
    seen = Dict(token => Set{C}() for token in alphabet)
    for sentence in sentences
        first_idx, last_idx = firstindex(sentence), lastindex(sentence)
        for (i, word) in pairs(sentence)
            # The dictionary keys are exactly `alphabet`, so `haskey` replaces
            # a linear scan of the vocabulary for every token in the corpus.
            haskey(context_words, word) || continue
            contexts, known = context_words[word], seen[word]
            for j in max(first_idx, i - K):min(last_idx, i + K)
                j == i && continue
                context_word = sentence[j]
                if context_word ∉ known
                    push!(known, context_word)
                    push!(contexts, context_word)
                end
            end
        end
    end
    # Drop tokens that never had a neighbour, from both the table and `alphabet`.
    filter!(entry -> !isempty(last(entry)), context_words)
    filter!(token -> haskey(context_words, token), alphabet)
    return context_words
end

# Set the context window size: up to `window` tokens on each side of a word
# count as its context.
window = 5
# Build the token => context-words table. The call also filters `alphabet` in
# place, removing tokens that ended up with no context words.
context_words = parse_sentences!(sentences, window, alphabet)

"""
    unigram_sampler(alphabet, freqs, τ = 0.75)

Return the smoothed unigram distribution over `alphabet`: entry `i` is
`freqs[alphabet[i]]^τ` divided by the sum of those powers over `alphabet`.

# Arguments
- `alphabet`: tokens to assign probabilities to; the output follows its order.
- `freqs`: mapping from token to raw count. Tokens missing from `freqs` are
  treated as having count 0.
- `τ`: smoothing exponent applied to the counts.

# Returns
A vector of probabilities that sums to 1 whenever at least one token has a
positive count; otherwise a vector of zeros (empty for an empty `alphabet`).
"""
function unigram_sampler(alphabet, freqs, τ = 0.75)
    isempty(alphabet) && return Float64[]
    smoothed = [float(get(freqs, token, 0))^τ for token in alphabet]
    # Normalise over the tokens actually returned. `freqs` may contain entries
    # that are not part of `alphabet`, and including them in the denominator
    # would make the result sum to less than 1.
    total = sum(smoothed)
    return total > 0 ? smoothed ./ total : smoothed
end

# Sampling weights over `alphabet`, built from the smoothed unigram
# probabilities; used when drawing negative samples.
token_weights = Weights(unigram_sampler(alphabet, freqs))

# Define the model
# Two separate Dense layers act as the embedding tables. Each takes an input of
# length `length(alphabet)` (one-hot encodings elsewhere in this file) and
# produces a `latent_dim`-dimensional vector; both are moved to `device`.
latent_dim = 300
input_embedding = Dense(length(alphabet), latent_dim) |> device
output_embedding = Dense(length(alphabet), latent_dim) |> device

"""
    gen_batch(wordlist, ℓ = 1, K = 2*window)

Generate training triples for every word in `wordlist`.

For each word, `ℓ` context words are drawn (with replacement) from
`context_words[word]`; for each of them `K` negative samples are drawn from
`alphabet` according to `token_weights`, rejecting tokens that are known
context words of `word`.

Reads the globals `context_words`, `alphabet`, `token_weights` and `window`.

# Returns
A vector of tuples `(word, context_word, negatives)` holding the one-hot
encoding of the word, the one-hot encoding of the sampled context word, and a
one-hot batch of the `K` negative samples, all encoded against `alphabet`.
"""
function gen_batch(wordlist, ℓ = 1, K = 2*window)
    vocab_ids = 1:length(alphabet)
    # Upper bound on rejection-sampling retries. Without it the loop never
    # terminates when every token with non-zero weight is a context word of
    # `word` (easy to hit on a small corpus). After this many failed draws the
    # last candidate is accepted even though it is a context word.
    max_attempts = 1_000
    data = []
    for word in wordlist
        contexts = context_words[word]
        # Hash set for O(1) rejection tests, built once per word.
        forbidden = Set(contexts)
        for context_word_sample in rand(contexts, ℓ)
            neg_samples = Vector{Int}(undef, K)
            for i in 1:K
                candidate = sample(vocab_ids, token_weights)
                attempts = 1
                while alphabet[candidate] ∈ forbidden && attempts < max_attempts
                    candidate = sample(vocab_ids, token_weights)
                    attempts += 1
                end
                neg_samples[i] = candidate
            end
            push!(data, (onehot(word, alphabet), onehot(context_word_sample, alphabet),
                    onehotbatch(alphabet[neg_samples], alphabet)))
        end
    end
    return data
end

"""
    negative_sampling_loss(word, context_word, neg_samples)

Skip-gram negative-sampling loss for one training triple.

With `v = input_embedding(word)`, `u = output_embedding(context_word)` and
`N = output_embedding(neg_samples)` (one column per negative sample), returns

    -log σ(uᵀv) - Σₖ log σ(-Nₖᵀv)

Reads the globals `input_embedding` and `output_embedding`.
"""
function negative_sampling_loss(word, context_word, neg_samples)
    # The centre-word embedding is shared by both terms; compute it once.
    v = input_embedding(word)
    # `logσ` evaluates log(σ(x)) in a numerically stable way; the naive
    # `log(σ(x))` underflows to -Inf for strongly negative scores, which then
    # poisons the loss and its gradient.
    l_context = -logσ(transpose(output_embedding(context_word)) * v)
    l_negative = -sum(logσ.(-(transpose(output_embedding(neg_samples)) * v)))
    return l_context + l_negative
end

In [11]:
# Inspect the subsampled vocabulary that is used as training input.
train_alphabet

812-element Vector{SubString{String}}:
 "me"
 "of"
 "being"
 "because"
 "my"
 "never"
 "got"
 "to"
 "see"
 "this"
 "place"
 "hope"
 "they"
 ⋮
 "pictures"
 "perspective"
 "cielo"
 "shot"
 "picture"
 "truck"
 "honestly"
 "looked"
 "hairs"
 "max"
 "single"
 "bummed"

In [12]:


# Training hyperparameters.
batch_size = 64
neg_sampling = 2 * window   # negative samples drawn per (word, context) pair
epochs = 100
opt = ADAM(0.003)

# Only words that still own a context list can serve as training centres.
# `train_alphabet` was drawn before `alphabet`/`context_words` were pruned, so
# it may contain words without an entry, which would raise a KeyError inside
# `gen_batch`.
train_words = filter(word -> haskey(context_words, word), train_alphabet)

# Create data loader
w2v_data = Flux.DataLoader(train_words, batchsize=batch_size, shuffle=true)

# The set of trainable arrays does not change between batches; collect it once.
ps = params(input_embedding, output_embedding)

# Training loop
@info("Beginning training loop...")
loss = Inf
last_improvement = 0
for epoch in 1:epochs
    # Explicit `global` so the loop updates the variables above both in a
    # notebook/REPL and when the code is run as a plain script.
    global loss, last_improvement
    @info "Epoch $epoch"
    for word_batch in w2v_data
        data = gen_batch(word_batch, 1, neg_sampling)
        # Mean negative-sampling loss over the batch, differentiated with
        # respect to both embedding layers.
        _, back = Zygote.pullback(ps) do
            losses = [negative_sampling_loss(dt[1], dt[2], dt[3]) for dt in data]
            sum(losses) / length(losses)
        end
        grads = back(1f0)
        Flux.Optimise.update!(opt, ps, grads)
    end
    # Progress estimate: mean loss on one randomly chosen batch with freshly
    # drawn context and negative samples (a noisy estimate, not a held-out set).
    new_loss = begin
        losses = [negative_sampling_loss(dt[1], dt[2], dt[3]) for dt in gen_batch(rand(collect(w2v_data)), 1, neg_sampling)]
        sum(losses) / length(losses)
    end
    if new_loss < loss
        loss = new_loss
        @info "new best embedding!"
        @info("loss = $loss")
        # Checkpoint the best parameters so far, moved to the CPU, in both
        # BSON and JLD2 form.
        model_params = cpu.(params(input_embedding, output_embedding))
        BSON.@save "word2vec.bson" model_params
        jldsave("word2vec.jld2"; input_embedding=cpu.(params(input_embedding)),
                                output_embedding=cpu.(params(output_embedding)))
        last_improvement = epoch
    end
    # Early stopping: give up after 15 epochs without a new best loss.
    if epoch - last_improvement ≥ 15
        @warn(" -> We're calling this converged.")
        break
    end
end


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mBeginning training loop...
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 1
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mnew best embedding!
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mloss = 6.164083
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 2
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mnew best embedding!
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mloss = 3.2834537
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 3
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 4
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mnew best embedding!
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mloss = 3.0657735
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 5
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 6
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mEpoch 7
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mnew best embedding!
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mloss = 2.9987671
[36m[1m[ [22m[39m[36m[1mInfo

In [28]:
# Inspect the cleaned review text column.
df.text_clean

100-element Vector{String}:
 "disrespectful bouncer accuses d"[93m[1m ⋯ 40 bytes ⋯ [22m[39m" bankrupt attitude keep pushing"
 "ralphs park nice restaurant don"[93m[1m ⋯ 287 bytes ⋯ [22m[39m"d hire cook brennan family ugh"
 "cute spot friendly awesome serv"[93m[1m ⋯ 27 bytes ⋯ [22m[39m"licious happened upon spot glad"
 "time good food wish dine right come back whole"
 "nice spot sobro love tucked awa"[93m[1m ⋯ 149 bytes ⋯ [22m[39m"rvation walk away feeling full"
 "served rude lady burped face di"[93m[1m ⋯ 28 bytes ⋯ [22m[39m"eeded anything else walked away"
 "dr youngblood amazing he best d"[93m[1m ⋯ 44 bytes ⋯ [22m[39m"e office admin really cool well"
 "ordered pizza take couple time "[93m[1m ⋯ 38 bytes ⋯ [22m[39m"ling day good day burnt cheesed"
 "view food great usually get sea"[93m[1m ⋯ 149 bytes ⋯ [22m[39m"ays attentive friendly service"
 "last night girlfriend experienc"[93m[1m ⋯ 781 bytes ⋯ [22m[39m"sible would give 0 star rating"
 "love place 

In [47]:
# Spot-check a single word; `get_embedding` yields `nothing` when the word is
# not contained in `alphabet`.
print(get_embedding("disrespectful", alphabet, input_embedding))

nothing

In [44]:
# Print the whitespace-separated words of the first cleaned review, one per line.
foreach(println, split(df.text_clean[1]))

disrespectful
bouncer
accuses
drunk
cologne
never
got
see
place
hope
go
bankrupt
attitude
keep
pushing


In [None]:
"""
    preprocess_text(text)

Normalise a raw review: lowercase it, turn each run of punctuation into one
space, and split the result on whitespace into tokens.
"""
function preprocess_text(text)
    lowered = lowercase(text)
    # A whole run of punctuation collapses into a single separator.
    depunctuated = replace(lowered, r"[[:punct:]]+" => " ")
    return split(depunctuated)
end

# Apply preprocessing: broadcast `preprocess_text` over every raw review.
# NOTE(review): this stores token vectors in `text_clean`, while other cells
# call `split` on `text_clean` entries as if they were strings — confirm which
# representation is intended.
df.text_clean = preprocess_text.(df.text)

In [51]:
# Load the pre-trained embeddings
# NOTE(review): the layer is rebuilt from the current `alphabet`; presumably
# this must be the same vocabulary used at training time for the saved weights
# to fit — confirm.
latent_dim = 300
input_embedding = Dense(length(alphabet), latent_dim)

# "word2vec.jld2" is written by the training loop with the keys
# `input_embedding` and `output_embedding`; only the input table is restored.
model_params = JLD2.load("word2vec.jld2")
Flux.loadparams!(input_embedding, model_params["input_embedding"])

"""
    get_embedding(word, alphabet, input_embedding)

Return `input_embedding` applied to the one-hot encoding of `word` over
`alphabet`, or `nothing` when `word` is not an element of `alphabet`.
"""
function get_embedding(word, alphabet, input_embedding)
    # Guard clause: unknown words have no embedding.
    word in alphabet || return nothing
    return input_embedding(onehot(word, alphabet))
end

# Tokens of a review: a string is split on whitespace, anything else is
# assumed to already be a collection of tokens.
_review_tokens(review::AbstractString) = split(review)
_review_tokens(review) = review

"""
    mean_embedding(review, alphabet, input_embedding)

Average the embeddings of all words of `review` that are present in `alphabet`.

`review` may be a whitespace-separated string or an already tokenised
collection of words. Words without an embedding are skipped.

# Returns
The element-wise mean of the word embeddings, or a zero vector of length
`latent_dim` (global) when no word of the review has an embedding.
"""
function mean_embedding(review, alphabet, input_embedding)
    # Embed each word exactly once, then drop the words that were not found.
    looked_up = [get_embedding(word, alphabet, input_embedding) for word in _review_tokens(review)]
    embeddings = [vector for vector in looked_up if vector !== nothing]
    if isempty(embeddings)
        return fill(0.0, latent_dim)
    end
    # `mean` over a vector of equally sized vectors is their element-wise
    # average; this avoids splatting every embedding into one `hcat` call.
    return mean(embeddings)
end

# Calculate mean vector for each review. `Ref` shields `alphabet` and the
# embedding layer from broadcasting, so only the reviews are iterated.
df.mean_vector = mean_embedding.(df.text_clean, Ref(alphabet), Ref(input_embedding))


# Save the updated DataFrame to a new CSV file
# NOTE(review): `mean_vector` holds one vector per row; presumably CSV.write
# stores each as its printed text — confirm this round-trips as needed.
CSV.write("G:/LLM/own60k.csv", df)

"G:/LLM/own.csv"