feature: optimize llama.cpp loading, fix llama.cpp tokenizer, unify logger (#75)

* feat: add lora for llama.cpp

* update: optimize llama.cpp loading, fix llama.cpp tokenizer, unify logger
hlhr202 committed May 18, 2023
1 parent 827cf0c commit 5368612
Showing 37 changed files with 301 additions and 224 deletions.
17 changes: 12 additions & 5 deletions Cargo.lock


3 changes: 2 additions & 1 deletion Cargo.toml
@@ -10,7 +10,8 @@ members = [
"packages/hf-tokenizer",
"packages/core",
"packages/llama-cpp",
"packages/rwkv-cpp"
"packages/rwkv-cpp",
"packages/common-rs"
]

[profile.release]
1 change: 1 addition & 0 deletions example/js/langchain/langchain.js
@@ -17,6 +17,7 @@ const config = {
useMlock: false,
embedding: true,
useMmap: true,
nGpuLayers: 0
};
const run = async () => {
await llama.load(config);
1 change: 1 addition & 0 deletions example/js/llama-cpp/abortable.js
@@ -15,6 +15,7 @@ const config = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
};
const template = `How are you?`;
const prompt = `A chat between a user and an assistant.
1 change: 1 addition & 0 deletions example/js/llama-cpp/embedding.js
@@ -15,6 +15,7 @@ const config = {
useMlock: false,
embedding: true,
useMmap: true,
nGpuLayers: 0
};
const prompt = `Who is the president of the United States?`;
const params = {
1 change: 1 addition & 0 deletions example/js/llama-cpp/inference.js
@@ -15,6 +15,7 @@ const config = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
};
const template = `How are you?`;
const prompt = `A chat between a user and an assistant.
3 changes: 2 additions & 1 deletion example/js/llama-cpp/tokenize.js
@@ -15,10 +15,11 @@ const config = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0,
};
const content = "how are you?";
const run = async () => {
await llama.load(config);
await llama.tokenize({ content, nCtx: 2048 }).then(console.log);
await llama.tokenize(content).then(console.log);
};
run();
50 changes: 50 additions & 0 deletions example/js/rwkv-cpp/inference_chat.js
@@ -0,0 +1,50 @@
import { LLM } from "llama-node";
import { RwkvCpp } from "llama-node/dist/llm/rwkv-cpp.js";
import path from "path";
const modelPath = path.resolve(process.cwd(), "../ggml-rwkv-4_raven-7b-v9-Eng99%-20230412-ctx8192-Q4_1_0.bin");
const tokenizerPath = path.resolve(process.cwd(), "../20B_tokenizer.json");
const rwkv = new LLM(RwkvCpp);
const config = {
modelPath,
tokenizerPath,
nThreads: 4,
enableLogging: true,
};
const prompt = `The following is a coherent verbose detailed conversation between a girl named Alice and her friend Bob. Alice is very intelligent, creative and friendly. Alice is unlikely to disagree with Bob, and Alice doesn't like to ask Bob questions. Alice likes to tell Bob a lot about herself and her opinions. Alice usually gives Bob kind, helpful and informative advices.\n\nBob: Hello Alice, how are you doing?\n\nAlice: Hi! Thanks, I'm fine. What about you?\n\nBob: I am fine. It's nice to see you. Look, here is a store selling tea and juice.\n\nAlice: Sure. Let's go inside. I would like to have some Mocha latte, which is my favourite!\n\nBob: What is it?\n\nAlice: Mocha latte is usually made with espresso, milk, chocolate, and frothed milk. Its flavors are frequently sweet.\n\nBob: Sounds tasty. I'll try it next time. Would you like to chat with me for a while?\n\nAlice: Of course! I'm glad to answer your questions or give helpful advices. You know, I am confident with my expertise. So please go ahead!\n\n`;
const run = async () => {
await rwkv.load(config);
// init session data
const params = {
maxPredictLength: 2048,
topP: 0.1,
temp: 0.1,
prompt,
sessionFilePath: path.resolve(process.cwd(), "../../session1.bin"),
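// first pass: process the long prompt and persist the session state to sessionFilePath, skipping token generation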
isSkipGeneration: true,
isOverwriteSessionFile: true
};
await rwkv.createCompletion(params, (response) => {
process.stdout.write(response.token);
});
// reuse session data: once the session has been initialized, the whole prompt does not need to be processed again
const params2 = {
maxPredictLength: 2048,
// For better Q&A accuracy and less diversity, reduce top_p (to 0.5, 0.2, 0.1 etc.)
topP: 0.1,
// Sampling temperature. It could be a good idea to increase temperature when top_p is low.
temp: 0.1,
prompt: 'Bob: Who are you?\\n\\nAlice: ',
endString: '\n\n',
sessionFilePath: path.resolve(process.cwd(), "../../session1.bin"),
// setting this to false keeps the initial (primed) state of the session
isOverwriteSessionFile: true,
// Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
presencePenalty: 0.2,
// Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
frequencyPenalty: 0.2
};
await rwkv.createCompletion(params2, (response) => {
process.stdout.write(response.token);
});
};
run();
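Once the run above has written session1.bin, a later run can reuse that cached state instead of re-processing the long priming prompt. The sketch below illustrates this under that assumption; it is not part of the commit's diff, and the model, tokenizer, and session paths simply mirror the example above.

// sketch: reuse the persisted RWKV session in a separate run
import { LLM } from "llama-node";
import { RwkvCpp } from "llama-node/dist/llm/rwkv-cpp.js";
import path from "path";

const rwkv = new LLM(RwkvCpp);

const run = async () => {
    await rwkv.load({
        modelPath: path.resolve(process.cwd(), "../ggml-rwkv-4_raven-7b-v9-Eng99%-20230412-ctx8192-Q4_1_0.bin"),
        tokenizerPath: path.resolve(process.cwd(), "../20B_tokenizer.json"),
        nThreads: 4,
        enableLogging: true,
    });

    await rwkv.createCompletion(
        {
            maxPredictLength: 2048,
            topP: 0.1,
            temp: 0.1,
            prompt: "Bob: Who are you?\\n\\nAlice: ",
            endString: "\n\n",
            sessionFilePath: path.resolve(process.cwd(), "../../session1.bin"),
            // keep the primed state on disk so it stays reusable
            isOverwriteSessionFile: false,
        },
        (response) => process.stdout.write(response.token)
    );
};

run();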
4 changes: 2 additions & 2 deletions example/ts/llama-cpp/tokenize.ts
@@ -18,15 +18,15 @@ const config: LoadConfig = {
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
nGpuLayers: 0,
};

const content = "how are you?";

const run = async () => {
await llama.load(config);

await llama.tokenize({ content, nCtx: 2048 }).then(console.log);
await llama.tokenize(content).then(console.log);
};

run();
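Both tokenize examples above reflect the tokenizer fix: llama.tokenize now receives the content string directly instead of an options object carrying nCtx. A minimal sketch of the updated call follows; it is not part of the commit's diff, the import path is assumed by analogy with the rwkv example, and the load config is the one already defined (partially) in the tokenize examples.

// sketch: old vs. new llama.cpp tokenize call
import { LLM } from "llama-node";
import { LLamaCpp } from "llama-node/dist/llm/llama-cpp.js"; // assumed path, mirroring rwkv-cpp.js above

const llama = new LLM(LLamaCpp);

const run = async (config) => {
    // `config` is the same LoadConfig shown (partially) in the tokenize examples above
    await llama.load(config);

    // before this commit: await llama.tokenize({ content: "how are you?", nCtx: 2048 });
    // after this commit, the string is passed directly:
    const tokens = await llama.tokenize("how are you?");
    console.log(tokens);
};

// call run(config) once a LoadConfig like the one above is available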
16 changes: 8 additions & 8 deletions packages/cli/src/index.ts
@@ -70,14 +70,14 @@ class InferenceCommand implements yargs.CommandModule {
const absolutePath = path.isAbsolute(modelPath)
? modelPath
: path.join(process.cwd(), modelPath);
if (logger) {
Llm.enableLogger();
}
const llm = await Llm.create({
modelPath: absolutePath,
modelType,
numCtxTokens,
});
const llm = await Llm.create(
{
modelPath: absolutePath,
modelType,
numCtxTokens,
},
logger ?? true
);
llm.inference(rest, (result) => {
switch (result.type) {
case InferenceResultType.Data:
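The logger unification also changes the core API surface: the separate Llm.enableLogger() call is removed, and logging is now toggled by a boolean second argument to Llm.create, as the updated CLI code above and the packages/core examples below show. A minimal sketch, not part of the commit's diff (the import specifier is assumed from the package name, and the model path is illustrative):

// sketch: enabling the unified logger through Llm.create
import { Llm, ModelType } from "@llama-node/core"; // assumed import specifier
import path from "path";

const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");

const run = async () => {
    // previously: Llm.enableLogger(); const llm = await Llm.create({ ... });
    const llm = await Llm.create(
        {
            modelType: ModelType.Llama,
            modelPath: model,
            numCtxTokens: 128,
        },
        true // second argument toggles the unified logger
    );
    // use llm.inference / llm.getWordEmbeddings as in the examples below
};

run();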
9 changes: 9 additions & 0 deletions packages/common-rs/Cargo.toml
@@ -0,0 +1,9 @@
[package]
name = "common-rs"
version = "0.1.0"
edition = "2021"

[dependencies]
env_logger = "0.10.0"
log = "0.4.17"
once_cell = "1.17.1"
1 change: 1 addition & 0 deletions packages/common-rs/src/lib.rs
@@ -0,0 +1 @@
pub mod logger;
39 changes: 39 additions & 0 deletions packages/common-rs/src/logger.rs
@@ -0,0 +1,39 @@
use log::{LevelFilter, Log, Metadata, Record};
use once_cell::sync::Lazy;

pub struct LLamaLogger {
enabled: bool,
}

static mut LLAMA_LOGGER_INNER: LLamaLogger = LLamaLogger { enabled: true };
pub static mut LLAMA_LOGGER: Lazy<&mut LLamaLogger> = Lazy::new(|| {
log::set_max_level(LevelFilter::Info);
log::set_logger(unsafe { &LLAMA_LOGGER_INNER }).unwrap();
unsafe { &mut LLAMA_LOGGER_INNER }
});

impl LLamaLogger {
pub fn set_enabled(&mut self, enabled: bool) {
self.enabled = enabled;
}

pub fn get_singleton() -> &'static mut LLamaLogger {
unsafe { &mut LLAMA_LOGGER }
}
}

impl Log for LLamaLogger {
fn enabled(&self, metadata: &Metadata) -> bool {
metadata.level() <= log::Level::Info
// true
}

fn log(&self, record: &Record) {
// Check if the record is matched by the logger before logging
if self.enabled(record.metadata()) && self.enabled {
println!("{} - {}", record.level(), record.args());
}
}

fn flush(&self) {}
}
6 changes: 3 additions & 3 deletions packages/core/Cargo.toml
@@ -9,9 +9,6 @@ crate-type = ["cdylib"]
[dependencies]
# Default enable napi4 feature, see https://nodejs.org/api/n-api.html#node-api-version-matrix
clap = { version = "4.1.8", features = ["derive"] }
env_logger = "0.10.0"
log = "0.4"
once_cell = "1.17.1"
num_cpus = "1.15.0"
rand = "0.8.5"
llm = { git = "https://github.com/rustformers/llm.git", branch = "main" }
@@ -26,6 +23,9 @@ zstd = {version = "0.12", default-features = false}
anyhow = "1.0.70"
bincode = "1.3.3"

log = "0.4.17"
common-rs = { path = "../common-rs" }


[build-dependencies]
napi-build = "2.0.1"
15 changes: 8 additions & 7 deletions packages/core/example/abortable.ts
@@ -4,14 +4,15 @@ import path from "path";
const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
// const persistSession = path.resolve(process.cwd(), "./tmp/session.bin");

Llm.enableLogger();

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const template = `how are you`;

15 changes: 8 additions & 7 deletions packages/core/example/cachesession.ts
@@ -4,14 +4,15 @@ import path from "path";
const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
const saveSession = path.resolve(process.cwd(), "./tmp/session.bin");

Llm.enableLogger();

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const template = `how are you`;

21 changes: 9 additions & 12 deletions packages/core/example/embedding.ts
@@ -4,13 +4,7 @@ import fs from "fs";

const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");

Llm.enableLogger();

const getWordEmbeddings = async (
llm: LLM,
prompt: string,
file: string
) => {
const getWordEmbeddings = async (llm: LLM, prompt: string, file: string) => {
const response = await llm.getWordEmbeddings({
prompt,
numPredict: 128,
@@ -29,11 +23,14 @@ const getWordEmbeddings = async (
};

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const dog1 = `My favourite animal is the dog`;
getWordEmbeddings(llm, dog1, "./example/semantic-compare/dog1.json");
15 changes: 8 additions & 7 deletions packages/core/example/inference.ts
@@ -4,14 +4,15 @@ import path from "path";
const model = path.resolve(process.cwd(), "../../ggml-alpaca-7b-q4.bin");
// const persistSession = path.resolve(process.cwd(), "./tmp/session.bin");

Llm.enableLogger();

const run = async () => {
const llm = await Llm.create({
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
});
const llm = await Llm.create(
{
modelType: ModelType.Llama,
modelPath: model,
numCtxTokens: 128,
},
true
);

const template = `how are you`;
