In [6]:
from huggingface_hub import hf_hub_download
# Download the checkpoint file from the BlinkDL/rwkv-4-pile-1b5 repo into a
# local cache directory. The call is the last expression of the cell, so the
# path it returns is shown as the cell output.
hf_hub_download(
    repo_id="BlinkDL/rwkv-4-pile-1b5",
    filename="RWKV-4-Pile-1B5-20220903-8040.pth",
    cache_dir="/home/kyle/HF-MODEL/rwkv-4-pile-1b5",
)

'/home/kyle/HF-MODEL/rwkv-4-pile-1b5/models--BlinkDL--rwkv-4-pile-1b5/snapshots/6ea995eaa87a17af560c9b41ce1a3d92355c5a49/RWKV-4-Pile-1B5-20220903-8040.pth'

In [7]:
import os

# Runtime switches for the rwkv package; both are set before the import below.
os.environ.update({
    "RWKV_JIT_ON": '1',
    "RWKV_CUDA_ON": '0',  # '1' would use the CUDA kernel for seq mode (much faster)
})

from rwkv.model import RWKV  # pip install rwkv

# Load the downloaded checkpoint with the 'cuda fp16' strategy.
model = RWKV(
    model='/home/kyle/HF-MODEL/rwkv-4-pile-1b5/models--BlinkDL--rwkv-4-pile-1b5/snapshots/6ea995eaa87a17af560c9b41ce1a3d92355c5a49/RWKV-4-Pile-1B5-20220903-8040.pth',
    strategy='cuda fp16',
)

# Single pass over the whole prompt (token ids follow 20B_tokenizer.json).
out, state = model.forward([187, 510, 1563, 310, 247], None)
print(out.detach().cpu().numpy())  # logits

# Same prompt fed in three pieces, handing the RNN state from one call to the
# next (use deepcopy if you want to clone the state).
state = None
for piece in ([187, 510], [1563], [310, 247]):
    out, state = model.forward(piece, state)
print(out.detach().cpu().numpy())  # same result as above

RWKV_JIT_ON 1 RWKV_CUDA_ON 0 RESCALE_LAYER 6

Loading /home/kyle/HF-MODEL/rwkv-4-pile-1b5/models--BlinkDL--rwkv-4-pile-1b5/snapshots/6ea995eaa87a17af560c9b41ce1a3d92355c5a49/RWKV-4-Pile-1B5-20220903-8040.pth ...
Strategy: (total 24+1=25 layers)
* cuda [float16, float16], store 25 layers
0-cuda-float16-float16 1-cuda-float16-float16 2-cuda-float16-float16 3-cuda-float16-float16 4-cuda-float16-float16 5-cuda-float16-float16 6-cuda-float16-float16 7-cuda-float16-float16 8-cuda-float16-float16 9-cuda-float16-float16 10-cuda-float16-float16 11-cuda-float16-float16 12-cuda-float16-float16 13-cuda-float16-float16 14-cuda-float16-float16 15-cuda-float16-float16 16-cuda-float16-float16 17-cuda-float16-float16 18-cuda-float16-float16 19-cuda-float16-float16 20-cuda-float16-float16 21-cuda-float16-float16 22-cuda-float16-float16 23-cuda-float16-float16 24-cuda-float16-float16 
emb.weight                        f16      cpu  50277  2048 
blocks.0.ln1.weight               f16   cuda:0   2048       

In [8]:

import json

# Read the tokenizer definition explicitly as UTF-8. Without an explicit
# encoding, open() falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8 and can make the JSON load fail or decode wrongly.
with open('20B_tokenizer.json', encoding='utf-8') as f:
    tokenizer_file = json.load(f)

# Keep the "vocab" entry of the file's "model" section for inspection.
vocab = tokenizer_file["model"]["vocab"]

In [9]:
from transformers import AutoTokenizer, GPT2TokenizerFast
# Build the fast tokenizer directly from the 20B tokenizer definition file.
tokenizer = GPT2TokenizerFast(
    tokenizer_file='20B_tokenizer.json',
)

In [10]:
# Short sample prompt; the encoded token ids are displayed as the cell result.
text = "Hello, there Cats are"
tokenizer.encode(text)

[12092, 13, 627, 330, 1832, 403]

In [11]:
# Encode the prompt, then run it through the model starting from an empty
# state (None); keep both the output and the resulting state.
prompt_ids = tokenizer.encode(text)
out, state = model.forward(prompt_ids, None)

In [12]:
# Take the index of the largest value in `out` and turn it back into text.
best_id = out.argmax()
tokenizer.decode(best_id)

' the'

In [13]:
# Peek at the last entry of the state returned by model.forward.
state[-1]

tensor([ 0.2703, -0.3164,  0.1064,  ...,  0.0698, -0.0320, -0.0400],
       device='cuda:0', dtype=torch.float16)

In [17]:
# Shape of the last state entry.
state[-1].shape

torch.Size([2048])

In [18]:
# Number of entries in the state.
# NOTE(review): how these entries map onto the model's layers is not shown here — confirm against the rwkv package.
len(state)

120

In [15]:
# Single pass over the whole prompt (token ids follow 20B_tokenizer.json).
out, state = model.forward([187, 510, 1563, 310, 247], None)
print(out.detach().cpu().numpy())  # logits

# Same prompt fed in three pieces, handing the RNN state from one call to the
# next (use deepcopy if you want to clone the state).
state = None
for piece in ([187, 510], [1563], [310, 247]):
    out, state = model.forward(piece, state)

[ -8.828125  -21.828125   -9.359375  ...  -7.7070312  -4.875
  -1.9785156]


In [16]:
from rwkv.utils import PIPELINE, PIPELINE_ARGS
# Prompt that the generation below continues.
ctx = "\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese."

# Wrap the loaded model together with the 20B tokenizer file.
pipeline = PIPELINE(model, "20B_tokenizer.json")

# Echo the prompt without a trailing newline so that the streamed
# continuation follows it directly.
print(ctx, end='')

def my_print(s):
    """Print `s` with no trailing newline and flush stdout immediately."""
    print(s, end="", flush=True)

# Sampling configuration handed to the pipeline.
args = PIPELINE_ARGS(
    temperature=1.0,
    top_p=0.7,
    top_k=100,             # top_k = 0 then ignore
    alpha_frequency=0.25,
    alpha_presence=0.25,
    token_ban=[0],         # ban the generation of some tokens
    token_stop=[],         # stop generation whenever you see any token here
    chunk_len=256,         # split input into chunks to save VRAM (shorter -> slower)
)

# Continue the prompt with token_count=200, streaming each piece through
# my_print. The call is the last expression of the cell, so its return value
# is also shown as the cell result.
pipeline.generate(ctx, token_count=200, args=args, callback=my_print)


In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.

The dragons had been living in the valley for a number of years, and had not come into contact with humans. But they did occasionally steal human food. One of the scientists, who was from China, even managed to successfully catch a dragon using a net.

Scientists have long believed that the world's dragons are related to the oryx, a large land animal that once roamed much of Asia. But no-one has ever seen one. So why are there dragons in this area? The scientists believe that these particular dragons may be descendants of one that was born in India and was later moved to China. Scientists believe that the dragons may have become extinct in this part of China because they are unable to find a suitable habitat to live in. But some scientists think that they may have been bro

"\n\nThe dragons had been living in the valley for a number of years, and had not come into contact with humans. But they did occasionally steal human food. One of the scientists, who was from China, even managed to successfully catch a dragon using a net.\n\nScientists have long believed that the world's dragons are related to the oryx, a large land animal that once roamed much of Asia. But no-one has ever seen one. So why are there dragons in this area? The scientists believe that these particular dragons may be descendants of one that was born in India and was later moved to China. Scientists believe that the dragons may have become extinct in this part of China because they are unable to find a suitable habitat to live in. But some scientists think that they may have been brought here by traders, as it is likely that many people traded with Tibet at one time. The scientists believe that many of the dragons could be descended from one or more of the original"

In [1]:
import os

# Select the GPU *before* importing anything that may load torch/CUDA.
# CUDA_VISIBLE_DEVICES is only honoured if it is set before the CUDA runtime
# (or a cached device query) is first touched, so assigning it after those
# imports can be silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from rwkv_hf import RWKVModel  # noqa: E402  (must come after the env setup)
from transformers import GPT2TokenizerFast  # noqa: E402

# Long sample text (a film review) used as the model input.
text = "I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered controversial I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.<br /><br />I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot."

# Tokenizer built from the 20B tokenizer definition file, and the encoded text.
tokenizer = GPT2TokenizerFast(
    tokenizer_file='20B_tokenizer.json',
)
tokens = tokenizer.encode(text)

# Instantiate the wrapper model with its default arguments.
model = RWKVModel()

  from .autonotebook import tqdm as notebook_tqdm


RWKV_JIT_ON 1 RWKV_CUDA_ON 0 RESCALE_LAYER 6

Loading /home/kyle/HF-MODEL/rwkv-4-pile-1b5/models--BlinkDL--rwkv-4-pile-1b5/snapshots/6ea995eaa87a17af560c9b41ce1a3d92355c5a49/RWKV-4-Pile-1B5-20220903-8040.pth ...
Strategy: (total 24+1=25 layers)
* cuda [float16, float16], store 25 layers
0-cuda-float16-float16 1-cuda-float16-float16 2-cuda-float16-float16 3-cuda-float16-float16 4-cuda-float16-float16 5-cuda-float16-float16 6-cuda-float16-float16 7-cuda-float16-float16 8-cuda-float16-float16 9-cuda-float16-float16 10-cuda-float16-float16 11-cuda-float16-float16 12-cuda-float16-float16 13-cuda-float16-float16 14-cuda-float16-float16 15-cuda-float16-float16 16-cuda-float16-float16 17-cuda-float16-float16 18-cuda-float16-float16 19-cuda-float16-float16 20-cuda-float16-float16 21-cuda-float16-float16 22-cuda-float16-float16 23-cuda-float16-float16 24-cuda-float16-float16 
emb.weight                        f16      cpu  50277  2048 
blocks.0.ln1.weight               f16   cuda:0   2048       

In [5]:
# Run the wrapper model on the encoded text and show the shape of the last
# element of what it returns.
outputs = model(tokens)
outputs[-1].shape

torch.Size([2048])