
Commit dd2af82

Add Language Model tutorial files, update cmake and readme
1 parent 5f28bb5

12 files changed: +345, -1 lines

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ add_subdirectory("tutorials/intermediate/convolutional_neural_network")
 add_subdirectory("tutorials/intermediate/deep_residual_network")
 add_subdirectory("tutorials/intermediate/recurrent_neural_network")
 add_subdirectory("tutorials/intermediate/bidirectional_recurrent_neural_network")
+add_subdirectory("tutorials/intermediate/language_model")
 
 # The following code block is suggested to be used on Windows.
 # According to https://github.com/pytorch/pytorch/issues/25457,

README.md

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ $ ./scripts.sh build
 * [Deep Residual Network](https://github.com/prabhuomkar/pytorch-cpp/tree/master/tutorials/intermediate/deep_residual_network/src/main.cpp)
 * [Recurrent Neural Network](https://github.com/prabhuomkar/pytorch-cpp/tree/master/tutorials/intermediate/recurrent_neural_network/src/main.cpp)
 * [Bidirectional Recurrent Neural Network](https://github.com/prabhuomkar/pytorch-cpp/tree/master/tutorials/intermediate/bidirectional_recurrent_neural_network/src/main.cpp)
-* [Language Model (RNN-LM)]()
+* [Language Model (RNN-LM)](https://github.com/prabhuomkar/pytorch-cpp/tree/master/tutorials/intermediate/language_model/src/main.cpp)
 
 #### 3. Advanced
 * [Generative Adversarial Networks]()
tutorials/intermediate/language_model/CMakeLists.txt

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)

project(language-model VERSION 1.0.0 LANGUAGES CXX)

# Files
set(SOURCES src/main.cpp
            src/rnn_lm.cpp
            src/corpus.cpp
            src/dictionary.cpp
            src/clip_grad_norm.cpp
)

set(HEADERS include/rnn_lm.h
            include/corpus.h
            include/dictionary.h
            include/clip_grad_norm.h
)

set(EXECUTABLE_NAME language-model)


add_executable(${EXECUTABLE_NAME} ${SOURCES} ${HEADERS})
target_include_directories(${EXECUTABLE_NAME} PRIVATE include)

target_link_libraries(${EXECUTABLE_NAME} "${TORCH_LIBRARIES}")

set_target_properties(${EXECUTABLE_NAME} PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED YES
)

# The following code block is suggested to be used on Windows.
# According to https://github.com/pytorch/pytorch/issues/25457,
# the DLLs need to be copied to avoid memory errors.
# See https://pytorch.org/cppdocs/installing.html.
if (MSVC)
    file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll")
    add_custom_command(TARGET ${EXECUTABLE_NAME}
                       POST_BUILD
                       COMMAND ${CMAKE_COMMAND} -E copy_if_different
                       ${TORCH_DLLS}
                       $<TARGET_FILE_DIR:${EXECUTABLE_NAME}>)
endif (MSVC)
tutorials/intermediate/language_model/include/clip_grad_norm.h

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
// Copyright 2019 Markus Fleischhacker
#pragma once

#include <torch/torch.h>
#include <vector>

namespace nn_utils {
void clip_grad_l2_norm(std::vector<torch::Tensor> parameters, double max_norm);
}  // namespace nn_utils
tutorials/intermediate/language_model/include/corpus.h

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
// Copyright 2019 Markus Fleischhacker
#pragma once

#include <torch/torch.h>
#include <string>
#include "dictionary.h"

namespace data_utils {
class Corpus {
 public:
    explicit Corpus(const std::string& path) : path_(path) {}
    torch::Tensor get_data(int64_t batch_size);
    const Dictionary& get_dictionary() const { return dictionary_; }
 private:
    std::string path_;
    Dictionary dictionary_;
};
}  // namespace data_utils
tutorials/intermediate/language_model/include/dictionary.h

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
// Copyright 2019 Markus Fleischhacker
#pragma once

#include <string>
#include <unordered_map>
#include <vector>

namespace data_utils {
class Dictionary {
 public:
    int64_t add_word(const std::string& word);
    std::string word_at_index(int64_t index) const { return idx2word_[index]; }
    size_t size() const { return word2idx_.size(); }
 private:
    std::unordered_map<std::string, size_t> word2idx_;
    std::vector<std::string> idx2word_;
};
}  // namespace data_utils
tutorials/intermediate/language_model/include/rnn_lm.h

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
// Copyright 2019 Markus Fleischhacker
#pragma once

#include <torch/torch.h>

class RNNLMImpl : public torch::nn::Module {
 public:
    RNNLMImpl(int64_t vocab_size, int64_t embed_size, int64_t hidden_size, int64_t num_layers);
    torch::nn::RNNOutput forward(torch::Tensor x, torch::Tensor h);

 private:
    torch::nn::Embedding embed;
    torch::nn::LSTM lstm;
    torch::nn::Linear linear;
};

TORCH_MODULE(RNNLM);
tutorials/intermediate/language_model/src/clip_grad_norm.cpp

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
// Copyright 2019 Markus Fleischhacker
#include "clip_grad_norm.h"
#include <torch/torch.h>
#include <vector>
#include <algorithm>

namespace nn_utils {
// Clips gradient norm of a vector of tensors
//
// Source (slightly modified):
// https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/nn/utils/clip_grad.h
void clip_grad_l2_norm(std::vector<torch::Tensor> parameters, double max_norm) {
    std::vector<torch::Tensor> params_with_grad;

    for (const auto& param : parameters) {
        auto& grad = param.grad();
        if (grad.defined()) {
            params_with_grad.push_back(param);
        }
    }

    double total_norm = 0.0;

    for (const auto& param : params_with_grad) {
        auto param_norm = param.grad().data().norm(2.0);
        total_norm += std::pow(param_norm.item().toDouble(), 2.0);
    }
    total_norm = std::pow(total_norm, 1.0 / 2.0);

    auto clip_coef = max_norm / (total_norm + 1e-6);
    if (clip_coef < 1) {
        for (auto& param : params_with_grad) {
            param.grad().data().mul_(clip_coef);
        }
    }
}
}  // namespace nn_utils
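
For orientation, a minimal sketch of where this utility sits in a training step (illustration only, not part of this commit; model, loss, and optimizer are placeholders matching main.cpp below):

    // Sketch: clip gradients between the backward pass and the optimizer step.
    optimizer.zero_grad();
    loss.backward();                                        // populate parameter gradients
    nn_utils::clip_grad_l2_norm(model->parameters(), 0.5);  // rescale all grads if their total L2 norm exceeds 0.5
    optimizer.step();                                       // apply the (possibly clipped) gradients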
tutorials/intermediate/language_model/src/corpus.cpp

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
// Copyright 2019 Markus Fleischhacker

#include "corpus.h"
#include <torch/torch.h>
#include <fstream>
#include <sstream>
#include <exception>
#include <algorithm>

namespace data_utils {
torch::Tensor Corpus::get_data(int64_t batch_size) {
    if (std::ifstream file{path_}) {
        std::vector<int64_t> ids;

        for (std::string line; std::getline(file, line);) {
            std::istringstream line_stream(line);

            for (std::string word; line_stream >> word;) {
                ids.push_back(dictionary_.add_word(word));
            }
            // End of sequence marker
            ids.push_back(dictionary_.add_word("<eos>"));
        }

        int64_t num_batches = ids.size() / batch_size;
        return torch::from_blob(ids.data(), {batch_size, num_batches},
            torch::TensorOptions().dtype(torch::kInt64)).clone();
    } else {
        throw std::runtime_error("Could not read file at path: " + path_);
    }
}
}  // namespace data_utils
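
A note on the returned layout (illustrative, not part of the commit): get_data packs the whole token stream into a {batch_size, num_batches} tensor of int64 word ids, and the training code later cuts it into windows of sequence_length along dimension 1, with the target window shifted one position to the right, since the model learns to predict the next word. A minimal sketch, with variable names as in main.cpp:

    // Sketch: slicing the {batch_size, num_batches} id tensor into training windows.
    auto ids = corpus.get_data(/*batch_size=*/20);
    const int64_t sequence_length = 30;
    for (int64_t i = 0; i < ids.size(1) - sequence_length; i += sequence_length) {
        auto data   = ids.slice(1, i, i + sequence_length);          // w_i     ... w_{i+L-1}
        auto target = ids.slice(1, i + 1, i + 1 + sequence_length);  // w_{i+1} ... w_{i+L}
    }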
tutorials/intermediate/language_model/src/dictionary.cpp

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
// Copyright 2019 Markus Fleischhacker
#include "dictionary.h"

namespace data_utils {
int64_t Dictionary::add_word(const std::string& word) {
    auto it = word2idx_.find(word);

    if (it == word2idx_.end()) {
        idx2word_.push_back(word);

        auto new_index = idx2word_.size() - 1;
        word2idx_[word] = new_index;
        return new_index;
    }

    return it->second;
}
}  // namespace data_utils
tutorials/intermediate/language_model/src/main.cpp

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
// Copyright 2019 Markus Fleischhacker
#include <torch/torch.h>
#include <iostream>
#include <iomanip>
#include "rnn_lm.h"
#include "corpus.h"
#include "clip_grad_norm.h"

using data_utils::Corpus;
using nn_utils::clip_grad_l2_norm;

int main() {
    std::cout << "Language Model\n\n";

    // Device
    torch::Device device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);

    // Hyper parameters
    const int64_t embed_size = 128;
    const int64_t hidden_size = 1024;
    const int64_t num_layers = 1;
    const int64_t num_epochs = 5;
    const int64_t num_samples = 1000;  // the number of words to be sampled
    const int64_t batch_size = 20;
    const int64_t sequence_length = 30;
    const double learning_rate = 0.002;

    // Load "Penn Treebank" dataset
    // See https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/language_model/data/
    const std::string penn_treebank_data_path = "../../../../tutorials/intermediate/language_model/data/train.txt";

    Corpus corpus(penn_treebank_data_path);

    auto ids = corpus.get_data(batch_size);
    int64_t vocab_size = corpus.get_dictionary().size();

    // Model
    RNNLM model(vocab_size, embed_size, hidden_size, num_layers);
    model->to(device);

    // Optimizer
    auto optimizer = torch::optim::Adam(model->parameters(), torch::optim::AdamOptions(learning_rate));

    // Set floating point output precision
    std::cout << std::fixed << std::setprecision(4);

    std::cout << "Training...\n";

    // Train the model
    for (size_t epoch = 0; epoch != num_epochs; ++epoch) {
        // Initialize running metrics
        float running_loss = 0.0;
        float running_perplexity = 0.0;
        size_t running_num_samples = 0;

        // Set initial hidden- and cell-states (stacked into one tensor)
        auto state = torch::zeros({2, num_layers, batch_size, hidden_size}).to(device).detach();

        for (size_t i = 0; i < ids.size(1) - sequence_length; i += sequence_length) {
            // Transfer data and target labels to device
            auto data = ids.slice(1, i, i + sequence_length).to(device);
            auto target = ids.slice(1, i + 1, i + 1 + sequence_length).to(device);

            // Forward pass
            auto rnn_output = model->forward(data, state);
            auto output = rnn_output.output;
            state = rnn_output.state.detach();

            // Calculate loss
            auto loss = torch::nll_loss(output, target.reshape(-1));

            // Update running metrics
            running_loss += loss.item().toFloat() * data.size(0);
            running_perplexity += torch::exp(loss).item().toFloat() * data.size(0);
            running_num_samples += data.size(0);

            // Backward pass and optimize
            optimizer.zero_grad();
            loss.backward();
            clip_grad_l2_norm(model->parameters(), 0.5);
            optimizer.step();
        }

        auto sample_mean_loss = running_loss / running_num_samples;
        auto sample_mean_perplexity = running_perplexity / running_num_samples;

        std::cout << "Epoch [" << (epoch + 1) << "/" << num_epochs << "], Trainset - Loss: "
            << sample_mean_loss << ", Perplexity: " << sample_mean_perplexity << '\n';
    }

    std::cout << "Training finished!\n\n";
    std::cout << "Generating samples...\n";

    const std::string sample_output_path = "../../../../tutorials/intermediate/language_model/data/sample.txt";

    // Generate samples
    model->eval();
    torch::NoGradGuard no_grad;

    std::ofstream sample_output_file(sample_output_path);

    // Set initial hidden- and cell-states (stacked into one tensor)
    auto state = torch::zeros({2, num_layers, 1, hidden_size}).to(device);

    // Select one word-id at random
    auto prob = torch::ones(vocab_size);
    auto data = prob.multinomial(1).unsqueeze(1).to(device);

    for (size_t i = 0; i != num_samples; ++i) {
        // Forward pass
        auto rnn_output = model->forward(data, state);
        auto out = rnn_output.output;
        state = rnn_output.state;

        // Sample one word id
        prob = out.exp();
        auto word_id = prob.multinomial(1).item();

        // Fill input data with sampled word id for the next time step
        data.fill_(word_id);

        // Write the word corresponding to the id to the file
        auto word = corpus.get_dictionary().word_at_index(word_id.toLong());
        word = (word == "<eos>") ? "\n" : word + " ";
        sample_output_file << word;
    }
    std::cout << "Finished generating samples!\nSaved output to " << sample_output_path << "\n";
}
tutorials/intermediate/language_model/src/rnn_lm.cpp

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
// Copyright 2019 Markus Fleischhacker
#include "rnn_lm.h"
#include <torch/torch.h>

RNNLMImpl::RNNLMImpl(int64_t vocab_size, int64_t embed_size, int64_t hidden_size, int64_t num_layers)
    : embed(vocab_size, embed_size),
      lstm(torch::nn::LSTMOptions(embed_size, hidden_size).layers(num_layers).batch_first(true)),
      linear(hidden_size, vocab_size) {
    register_module("embed", embed);
    register_module("lstm", lstm);
    register_module("linear", linear);
}

torch::nn::RNNOutput RNNLMImpl::forward(torch::Tensor x, torch::Tensor h) {
    auto lstm_out = lstm->forward(embed->forward(x), h);
    auto out = lstm_out.output;
    out = out.reshape({-1, out.size(2)});
    out = linear->forward(out);
    out = torch::log_softmax(out, 1);
    return {out, lstm_out.state};
}
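
For reference, a rough sketch of driving the module (illustration only, not part of the commit; the sizes are the hyperparameters from main.cpp): the forward pass takes a {batch, seq_len} tensor of word ids plus a stacked hidden/cell state of shape {2, num_layers, batch, hidden_size}, and returns log-probabilities flattened to {batch * seq_len, vocab_size}, ready for torch::nll_loss.

    // Sketch: one forward pass through RNNLM on the CPU, with an assumed vocab_size of 10000.
    RNNLM model(/*vocab_size=*/10000, /*embed_size=*/128, /*hidden_size=*/1024, /*num_layers=*/1);
    auto state = torch::zeros({2, /*num_layers=*/1, /*batch_size=*/20, /*hidden_size=*/1024});
    auto input = torch::randint(0, 10000, {20, 30},
        torch::TensorOptions().dtype(torch::kInt64));   // {batch, seq_len} word ids
    auto result = model->forward(input, state);
    // result.output: {20 * 30, 10000} log-probabilities; result.state: updated LSTM state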
