# Setup Environment
Run this section only once for each new runtime.

## Install Dependencies

In [None]:
! git clone https://github.com/DoubleVII/naughtyLLM
%cd naughtyLLM
! pip install --editable ./
%cd ..

Cloning into 'naughtyLLM'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 11 (delta 3), reused 6 (delta 2), pack-reused 0[K
Receiving objects: 100% (11/11), 4.21 KiB | 4.21 MiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/naughtyLLM
Obtaining file:///content/naughtyLLM
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting accelerate (from naughtyLLM==0.0.0)
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting jsonargparse (from naughtyLLM==0.0.0)
  Downloading jsonargparse-4.31.0-py3-none-any.whl (205 kB)
[2K     [90m━━━━

## 连接云盘和路径

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SummerCamp_data2024

Mounted at /content/drive
/content/drive/MyDrive/SummerCamp_data2024


## 下载模型

In [None]:
import os
import huggingface_hub
# from datasets import load_dataset

# Hub repository to fetch and the Drive folder that mirrors it.
repo_id = "TinyLlama/TinyLlama_v1.1_chinese"
local_dir = f"/content/drive/MyDrive/SummerCamp_data2024/LLM/{repo_id}"

# Skip the download when the target folder is already present on Drive.
already_downloaded = os.path.exists(local_dir)
if not already_downloaded:
  huggingface_hub.snapshot_download(
      repo_id=repo_id,
      local_dir=local_dir,
      local_dir_use_symlinks=False,
  )


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/7.12k [00:00<?, ?B/s]

'/content/drive/MyDrive/SummerCamp_data2024/LLM/TinyLlama/TinyLlama_v1.1_chinese'

# Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer files from the downloaded model folder.
tokenizer = AutoTokenizer.from_pretrained(local_dir)

# Calling the tokenizer returns both the token ids and the attention mask.
tokenizer(text="中国的首都是")

{'input_ids': [1, 29871, 30275, 30356, 30210, 31688, 30769, 30392], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# encode() returns only the list of token ids.
tokenizer.encode(text="中国的首都是")


[1, 29871, 30275, 30356, 30210, 31688, 30769, 30392]

In [None]:
# Turn the ids back into text, dropping special tokens.
tokenizer.decode(
    [1, 29871, 30275, 30356, 30210, 31688, 30769, 30392],
    skip_special_tokens=True,
)

'中国的首都是'

In [None]:
# Same ids, but special tokens are kept in the decoded text.
tokenizer.decode(
    [1, 29871, 30275, 30356, 30210, 31688, 30769, 30392],
    skip_special_tokens=False,
)

'<s> 中国的首都是'

返回tensor类型，并转移到GPU上

In [None]:
# Ask for PyTorch tensors ("pt") and move the whole encoding to the GPU.
(
    tokenizer("中国的首都是", return_tensors="pt")
    .to("cuda")
)

{'input_ids': tensor([[    1, 29871, 30275, 30356, 30210, 31688, 30769, 30392]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

词表信息

In [None]:
# Number of tokens the tokenizer reports (32000 for this checkpoint).
len(tokenizer)

32000

In [None]:
# Vocabulary size attribute; for this checkpoint it equals len(tokenizer).
tokenizer.vocab_size

32000

In [None]:
# get_vocab() maps token string -> id and has tens of thousands of entries,
# so display only the 20 lowest ids instead of dumping the whole mapping.
sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])[:20]

# Inference

In [None]:
from transformers import AutoModelForCausalLM
import torch

# Load the checkpoint in half precision directly onto the GPU.
# device_map places the weights itself, so no .to("cuda") is chained afterwards:
# moving a model that was already dispatched is redundant and can warn or fail.
model = AutoModelForCausalLM.from_pretrained(local_dir, device_map="cuda", torch_dtype=torch.float16)


In [None]:
# Prompt used throughout the inference walkthrough.
input_text = "中国的首都是"

# Tokenize into PyTorch tensors, then move them to the GPU.
# NOTE: the name `input` shadows the Python builtin; it is kept because
# later cells refer to it.
input = tokenizer(input_text, return_tensors="pt")
input = input.to("cuda")
input

{'input_ids': tensor([[    1, 29871, 30275, 30356, 30210, 31688, 30769, 30392]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [None]:
# Single forward pass; gradients are not needed for inference.
with torch.no_grad():
  output = model(
      input_ids=input["input_ids"],
      attention_mask=input["attention_mask"],
  )

In [None]:
# Logits shape: (batch, sequence length, vocabulary size).
output.logits.shape

torch.Size([1, 8, 32000])

In [None]:
# Take the logits at the last position of the first sequence and
# normalize them into a probability distribution over the vocabulary.
probs = output.logits[0, -1].softmax(dim=0)
probs.shape

torch.Size([32000])

In [None]:
# Sanity check: the probabilities should add up to about 1.
# Tensor.sum() reduces on the device in one call, whereas the builtin sum()
# would iterate over every vocabulary entry in Python.
probs.sum()

tensor(1.0000, device='cuda:0')

In [None]:
# Ten most probable next tokens: their probabilities and vocabulary ids.
cand_probs, cand_index = torch.topk(probs, k=10, dim=0)

(cand_probs, cand_index)

(tensor([0.3147, 0.1030, 0.0271, 0.0271, 0.0262, 0.0256, 0.0237, 0.0198, 0.0178,
         0.0175], device='cuda:0'),
 tensor([30662, 29871,   232,   231, 31113, 30429,   233, 30505, 30275, 29901],
        device='cuda:0'))

In [None]:
# Decode each candidate id on its own. Some entries render as '�';
# presumably these are partial-byte tokens that are not valid text alone (TODO confirm).
tokenizer.batch_decode(cand_index)

['北', '', '�', '�', '香', '上', '�', '在', '中', ':']

选择概率最大的token作为当前的预测，开始生成下一个token

In [None]:
# The candidates are listed in descending probability (see the topk output),
# so element 0 is the most likely next token.
pred_index = cand_index[0]
pred_index

tensor(30662, device='cuda:0')

In [None]:
# Work on a copy of the encoding so later edits go to new_input rather than `input`.
new_input = input.copy()
new_input.input_ids

tensor([[    1, 29871, 30275, 30356, 30210, 31688, 30769, 30392]],
       device='cuda:0')

In [None]:
# Append the predicted token to the prompt.
# Item assignment is used instead of attribute assignment: setting
# `new_input.input_ids = ...` would only create an instance attribute and leave
# the stored "input_ids" entry (used by new_input["input_ids"] and **new_input) stale.
# Building from the original `input` keeps this cell idempotent when re-run.
next_token = pred_index[None, None]  # scalar -> shape (1, 1) so it can be concatenated on dim=1
new_input["input_ids"] = torch.cat([input["input_ids"], next_token], dim=1)
# Extend the attention mask too, so it stays the same length as input_ids.
new_input["attention_mask"] = torch.cat(
    [input["attention_mask"], torch.ones_like(next_token)], dim=1
)
new_input.input_ids

tensor([[    1, 29871, 30275, 30356, 30210, 31688, 30769, 30392, 30662]],
       device='cuda:0')

In [None]:
# Decode the extended sequence; special tokens are kept, so the text starts with <s>.
tokenizer.decode(new_input.input_ids[0])

'<s> 中国的首都是北'

In [None]:
# Run the model on the extended sequence to predict the following token.
with torch.no_grad():
  output = model(input_ids=new_input.input_ids)

# Distribution over the vocabulary at the last position, then the top 10 candidates.
probs = output.logits[0, -1].softmax(dim=0)
cand_probs, cand_index = torch.topk(probs, k=10, dim=0)
tokenizer.batch_decode(cand_index)

['京', '平', '上', '�', '海', '方', '部', '�', '大', '�']

使用generate接口生成完整的句子

In [None]:
# Greedy decoding (do_sample=False): up to 100 new tokens after the prompt.
with torch.no_grad():
  output = model.generate(
      **input,
      max_new_tokens=100,
      do_sample=False,
  )
tokenizer.batch_decode(output)

['<s> 中国的首都是北京 首都是香港 首都是��Vorlage 首都是�Vorlage 首都是����� Światy 首都是Światy 首都是Światy 首都是Światy 首都是Światy 首都是Światy 首都是Światy 首都是Światy 首都是Światy 首都是Światy 首都是Ś']

Sampling

In [None]:
# Sampling is stochastic; seed the torch RNG so re-running this cell
# reproduces the same text.
torch.manual_seed(0)
with torch.no_grad():
  output = model.generate(**input, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(output)

['<s> 中国的首都是北京。\n北京自古是帝王都城。从春秋战国时代到1368年，北京有700多年的历史。自清朝以来，全国33个省（包括西藏）在首都设立了首都。晚清政府在习伯、郭蒙、唐怀、�']

In [None]:
# Top-k sampling: each step draws only from the 4 most probable tokens.
# Seed the torch RNG so re-running this cell reproduces the same text.
torch.manual_seed(0)
with torch.no_grad():
  output = model.generate(**input, max_new_tokens=100, do_sample=True, top_k=4)
tokenizer.batch_decode(output)

['<s> 中国的首都是北京.</s>']

In [None]:
# Sampling with top_p=0.8 passed to generate().
# Seed the torch RNG so re-running this cell reproduces the same text.
torch.manual_seed(0)
with torch.no_grad():
  output = model.generate(**input, max_new_tokens=100, do_sample=True, top_p=0.8)
tokenizer.batch_decode(output)

['<s> 中国的首都是： 北京市 中国国际航天空气 中华仪桑仨仛 中国滚滇 滥滬 滥缬 滥艠 滥兜滬滬滬 ��������������������']