feature: optimize llama.cpp loading, fix llama.cpp tokenizer, unify logger (#75)

* feat: add lora for llama.cpp

* update: optimize llama.cpp loading, fix llama.cpp tokenizer, unify logger
hlhr202 committed May 18, 2023
1 parent 827cf0c commit 5368612
Showing 37 changed files with 301 additions and 224 deletions.
17 changes: 12 additions & 5 deletions Cargo.lock


3 changes: 2 additions & 1 deletion Cargo.toml
@@ -10,7 +10,8 @@ members = [
"packages/hf-tokenizer",
"packages/core",
"packages/llama-cpp",
"packages/rwkv-cpp"
"packages/rwkv-cpp",
"packages/common-rs"
]

[profile.release]
1 change: 1 addition & 0 deletions example/js/langchain/langchain.js
@@ -17,6 +17,7 @@ const config = {
useMlock: false,
embedding: true,
useMmap: true,
nGpuLayers: 0
};
const run = async () => {
await llama.load(config);
1 change: 1 addition & 0 deletions example/js/llama-cpp/abortable.js
@@ -15,6 +15,7 @@ const config = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
};
const template = `How are you?`;
const prompt = `A chat between a user and an assistant.
1 change: 1 addition & 0 deletions example/js/llama-cpp/embedding.js
@@ -15,6 +15,7 @@ const config = {
useMlock: false,
embedding: true,
useMmap: true,
nGpuLayers: 0
};
const prompt = `Who is the president of the United States?`;
const params = {
1 change: 1 addition & 0 deletions example/js/llama-cpp/inference.js
@@ -15,6 +15,7 @@ const config = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
};
const template = `How are you?`;
const prompt = `A chat between a user and an assistant.
3 changes: 2 additions & 1 deletion example/js/llama-cpp/tokenize.js
@@ -15,10 +15,11 @@ const config = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0,
};
const content = "how are you?";
const run = async () => {
await llama.load(config);
await llama.tokenize({ content, nCtx: 2048 }).then(console.log);
await llama.tokenize(content).then(console.log);
};
run();
50 changes: 50 additions & 0 deletions example/js/rwkv-cpp/inference_chat.js
@@ -0,0 +1,50 @@
import { LLM } from "llama-node";
import { RwkvCpp } from "llama-node/dist/llm/rwkv-cpp.js";
import path from "path";
const modelPath = path.resolve(process.cwd(), "../ggml-rwkv-4_raven-7b-v9-Eng99%-20230412-ctx8192-Q4_1_0.bin");
const tokenizerPath = path.resolve(process.cwd(), "../20B_tokenizer.json");
const rwkv = new LLM(RwkvCpp);
const config = {
modelPath,
tokenizerPath,
nThreads: 4,
enableLogging: true,
};
const prompt = `The following is a coherent verbose detailed conversation between a girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice is unlikely to disagree with Bob, and Alice doesn't like to ask Bob questions. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: Hello Alice, how are you doing?\n\nAlice: Hi! Thanks, I'm fine. What about you?\n\nBob: I am fine. It's nice to see you. Look, here is a store selling tea and juice.\n\nAlice: Sure. Let's go inside. I would like to have some Mocha latte, which is my favourite!\n\nBob: What is it?\n\nAlice: Mocha latte is usually made with espresso, milk, chocolate, and frothed milk. Its flavors are frequently sweet.\n\nBob: Sounds tasty. I'll try it next time. Would you like to chat with me for a while?\n\nAlice: Of course! I'm glad to answer your questions or give helpful advices. You know, I am confident with my expertise. So please go ahead!\n\n`;
const run = async () => {
await rwkv.load(config);
// init session data
const params = {
maxPredictLength: 2048,
topP: 0.1,
temp: 0.1,
prompt,
sessionFilePath: path.resolve(process.cwd(), "../../session1.bin"),
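// first pass: process the long prompt and persist the session state to sessionFilePath, skipping token generation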
isSkipGeneration: true,
isOverwriteSessionFile: true
};
await rwkv.createCompletion(params, (response) => {
process.stdout.write(response.token);
});
// reuse session data: once the session has been initialized, the whole prompt does not need to be processed again
const params2 = {
maxPredictLength: 2048,
// For better Q&A accuracy and less diversity, reduce top_p (to 0.5, 0.2, 0.1 etc.)
topP: 0.1,
// Sampling temperature. It could be a good idea to increase temperature when top_p is low.
temp: 0.1,
prompt: 'Bob: Who are you?\\n\\nAlice: ',
endString: '\n\n',
sessionFilePath: path.resolve(process.cwd(), "../../session1.bin"),
// setting this to false keeps the initial (primed) state of the session
isOverwriteSessionFile: true,
// Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
presencePenalty: 0.2,
// Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
frequencyPenalty: 0.2
};
await rwkv.createCompletion(params2, (response) => {
process.stdout.write(response.token);
});
};
run();
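Once the run above has written session1.bin, a later run can reuse that cached state instead of re-processing the long priming prompt. The sketch below illustrates this under that assumption; it is not part of the commit's diff, and the model, tokenizer, and session paths simply mirror the example above.

// sketch: reuse the persisted RWKV session in a separate run
import { LLM } from "llama-node";
import { RwkvCpp } from "llama-node/dist/llm/rwkv-cpp.js";
import path from "path";

const rwkv = new LLM(RwkvCpp);

const run = async () => {
    await rwkv.load({
        modelPath: path.resolve(process.cwd(), "../ggml-rwkv-4_raven-7b-v9-Eng99%-20230412-ctx8192-Q4_1_0.bin"),
        tokenizerPath: path.resolve(process.cwd(), "../20B_tokenizer.json"),
        nThreads: 4,
        enableLogging: true,
    });

    await rwkv.createCompletion(
        {
            maxPredictLength: 2048,
            topP: 0.1,
            temp: 0.1,
            prompt: "Bob: Who are you?\\n\\nAlice: ",
            endString: "\n\n",
            sessionFilePath: path.resolve(process.cwd(), "../../session1.bin"),
            // keep the primed state on disk so it stays reusable
            isOverwriteSessionFile: false,
        },
        (response) => process.stdout.write(response.token)
    );
};

run();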
4 changes: 2 additions & 2 deletions example/ts/llama-cpp/tokenize.ts
@@ -18,15 +18,15 @@ const config: LoadConfig = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
nGpuLayers: 0,
};

const content = "how are you?";

const run = async () => {
await llama.load(config);

await llama.tokenize({ content, nCtx: 2048 }).then(console.log);
await llama.tokenize(content).then(console.log);
};

run();
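Both tokenize examples above reflect the tokenizer fix: llama.tokenize now receives the content string directly instead of an options object carrying nCtx. A minimal sketch of the updated call follows; it is not part of the commit's diff, the import path is assumed by analogy with the rwkv example, and the load config is the one already defined (partially) in the tokenize examples.

// sketch: old vs. new llama.cpp tokenize call
import { LLM } from "llama-node";
import { LLamaCpp } from "llama-node/dist/llm/llama-cpp.js"; // assumed path, mirroring rwkv-cpp.js above

const llama = new LLM(LLamaCpp);

const run = async (config) => {
    // `config` is the same LoadConfig shown (partially) in the tokenize examples above
    await llama.load(config);

    // before this commit: await llama.tokenize({ content: "how are you?", nCtx: 2048 });
    // after this commit, the string is passed directly:
    const tokens = await llama.tokenize("how are you?");
    console.log(tokens);
};

// call run(config) once a LoadConfig like the one above is available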
16 changes: 8 additions & 8 deletions packages/cli/src/index.ts
@@ -70,14 +70,14 @@ class InferenceCommand implements yargs.CommandModule {
const absolutePath = path.isAbsolute(modelPath)
? modelPath
: path.join(process.cwd(), modelPath);
if (logger) {
Llm.enableLogger();
}
const llm = await Llm.create({
modelPath: absolutePath,
modelType,
numCtxTokens,
});
const llm = await Llm.create(
{
modelPath: absolutePath,
modelType,
numCtxTokens,
},
logger ?? true
);
llm.inference(rest, (result) => {
switch (result.type) {
case InferenceResultType.Data:
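The logger unification also changes the core API surface: the separate Llm.enableLogger() call is removed, and logging is now toggled by a boolean second argument to Llm.create, as the updated CLI code above and the packages/core examples below show. A minimal sketch, not part of the commit's diff (the import specifier is assumed from the package name, and the model path is illustrative):

// sketch: enabling the unified logger through Llm.create
import { Llm, ModelType } from "@llama-node/core"; // assumed import specifier
import path from "path";

const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");

const run = async () => {
    // previously: Llm.enableLogger(); const llm = await Llm.create({ ... });
    const llm = await Llm.create(
        {
            modelType: ModelType.Llama,
            modelPath: model,
            numCtxTokens: 128,
        },
        true // second argument toggles the unified logger
    );
    // use llm.inference / llm.getWordEmbeddings as in the examples below
};

run();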
9 changes: 9 additions & 0 deletions packages/common-rs/Cargo.toml
@@ -0,0 +1,9 @@
[package]
name = "common-rs"
version = "0.1.0"
edition = "2021"

[dependencies]
env_logger = "0.10.0"
log = "0.4.17"
once_cell = "1.17.1"
1 change: 1 addition & 0 deletions packages/common-rs/src/lib.rs
@@ -0,0 +1 @@
pub mod logger;
39 changes: 39 additions & 0 deletions packages/common-rs/src/logger.rs
@@ -0,0 +1,39 @@
use log::{LevelFilter, Log, Metadata, Record};
use once_cell::sync::Lazy;

pub struct LLamaLogger {
enabled: bool,
}

static mut LLAMA_LOGGER_INNER: LLamaLogger = LLamaLogger { enabled: true };
pub static mut LLAMA_LOGGER: Lazy<&mut LLamaLogger> = Lazy::new(|| {
log::set_max_level(LevelFilter::Info);
log::set_logger(unsafe { &LLAMA_LOGGER_INNER }).unwrap();
unsafe { &mut LLAMA_LOGGER_INNER }
});

impl LLamaLogger {
pub fn set_enabled(&mut self, enabled: bool) {
self.enabled = enabled;
}

pub fn get_singleton() -> &'static mut LLamaLogger {
unsafe { &mut LLAMA_LOGGER }
}
}

impl Log for LLamaLogger {
fn enabled(&self, metadata: &Metadata) -> bool {
metadata.level() <= log::Level::Info
// true
}

fn log(&self, record: &Record) {
// Check if the record is matched by the logger before logging
if self.enabled(record.metadata()) && self.enabled {
println!("{} - {}", record.level(), record.args());
}
}

fn flush(&self) {}
}
6 changes: 3 additions & 3 deletions packages/core/Cargo.toml
@@ -9,9 +9,6 @@ crate-type = ["cdylib"]
[dependencies]
# Default enable napi4 feature, see https://nodejs.org/api/n-api.html#node-api-version-matrix
clap = { version = "4.1.8", features = ["derive"] }
env_logger = "0.10.0"
log = "0.4"
once_cell = "1.17.1"
num_cpus = "1.15.0"
rand = "0.8.5"
llm = { git = "https://github.com/rustformers/llm.git", branch = "main" }
@@ -26,6 +23,9 @@ zstd = {version = "0.12", default-features = false}
anyhow = "1.0.70"
bincode = "1.3.3"

log = "0.4.17"
common-rs = { path = "../common-rs" }


[build-dependencies]
napi-build = "2.0.1"
15 changes: 8 additions & 7 deletions packages/core/example/abortable.ts
@@ -4,14 +4,15 @@ import path from "path";
const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
// const persistSession = path.resolve(process.cwd(), "./tmp/session.bin");

Llm.enableLogger();

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const template = `how are you`;

15 changes: 8 additions & 7 deletions packages/core/example/cachesession.ts
@@ -4,14 +4,15 @@ import path from "path";
const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
const saveSession = path.resolve(process.cwd(), "./tmp/session.bin");

Llm.enableLogger();

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const template = `how are you`;

21 changes: 9 additions & 12 deletions packages/core/example/embedding.ts
@@ -4,13 +4,7 @@ import fs from "fs";

const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");

Llm.enableLogger();

const getWordEmbeddings = async (
llm: LLM,
prompt: string,
file: string
) => {
const getWordEmbeddings = async (llm: LLM, prompt: string, file: string) => {
const response = await llm.getWordEmbeddings({
prompt,
numPredict: 128,
@@ -29,11 +23,14 @@ const getWordEmbeddings = async (
};

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const dog1 = `My favourite animal is the dog`;
getWordEmbeddings(llm, dog1, "./example/semantic-compare/dog1.json");
15 changes: 8 additions & 7 deletions packages/core/example/inference.ts
@@ -4,14 +4,15 @@ import path from "path";
const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
// const persistSession = path.resolve(process.cwd(), "./tmp/session.bin");

Llm.enableLogger();

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const template = `how are you`;
