# Let's Load and Run an LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load the Phi-3 mini instruct model. Only the model is loaded in this cell;
# its tokenizer is loaded separately in the next cell.

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",  # place the weights on the GPU (a CUDA device is required)
    torch_dtype="auto",  # let transformers pick the dtype instead of fixing one here
    trust_remote_code=True  # NOTE(review): permits running model code shipped in the Hub repo; only use with trusted repos
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer from the same checkpoint id as the model so that the
# token ids it produces match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [None]:
# Prompt ending with the <|assistant|> marker; the ids printed in the next
# cells show that the tokenizer maps this marker to a single token.
prompt = "What is tokenization: <|assistant|>"

# Tokenize the prompt into a PyTorch tensor of token ids ("pt") and move it to
# whichever device the model lives on. Using model.device instead of a
# hard-coded "cuda" keeps the inputs on the same device as the model even if
# the model's placement is changed later.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)


In [None]:
# Show the raw token ids: a tensor with a single row, one id per token of the prompt.
print(input_ids)

tensor([[ 1724,   338,  5993,  2133, 29901, 29871, 32001]], device='cuda:0')


In [None]:
# Decode every id on its own to see which piece of text each token id stands for.
# input_ids[0] selects the only row of the tensor (the single prompt).
# Some ids decode to nothing visible, which shows up as a blank line in the output.
for each_id in input_ids[0]:
  print(tokenizer.decode(each_id))

What
is
token
ization
:

<|assistant|>


# Tokenization Across Different Pre-trained Models

In [None]:
def load_tokenizer(model_id):
    """Return the tokenizer that ``AutoTokenizer`` resolves for *model_id*.

    Args:
        model_id: Checkpoint identifier passed straight to
            ``AutoTokenizer.from_pretrained`` (e.g. ``"gpt2"``).

    Returns:
        The tokenizer object created by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(model_id)

def tokenize_sentence(tokenizer, input_sentence):
    """Return the token strings *tokenizer* produces for *input_sentence*.

    The sentence is encoded by calling the tokenizer, and the resulting ids
    are mapped back to their token strings. The ids are passed through
    unchanged, so whatever the tokenizer adds while encoding (for example
    special tokens) also appears in the result.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            which provides ``convert_ids_to_tokens``.
        input_sentence: Text to tokenize.

    Returns:
        The value of ``tokenizer.convert_ids_to_tokens`` for the encoded ids.
    """
    encoding = tokenizer(input_sentence)
    return tokenizer.convert_ids_to_tokens(encoding.input_ids)

## BERT base uncased
Tokenization Method: WordPiece

In [None]:
# WordPiece demo with the uncased BERT tokenizer.
model_id = "bert-base-uncased"
# NOTE: this rebinds the notebook-wide `tokenizer` name, replacing the Phi-3 tokenizer loaded earlier.
tokenizer = load_tokenizer(model_id)
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
# In the output below the text comes back lowercased, '##' marks a piece that
# continues the previous word, and [CLS] / [SEP] wrap the sequence.
print(tokenized_sentence)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['[CLS]', 'i', 'love', 'large', 'language', 'models', '.', 'this', 'is', 'a', 'tutor', '##ial', 'on', 'token', '##ization', '.', '[SEP]']


## BERT base cased
Tokenization Method: WordPiece

In [None]:
# WordPiece demo with the cased BERT tokenizer, using the same sentence as the uncased demo.
model_id = "bert-base-cased"
tokenizer = load_tokenizer(model_id)  # rebinds the notebook-wide `tokenizer`
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
# In the output below the original capitalisation is kept; '##' still marks
# word-continuation pieces and [CLS] / [SEP] still wrap the sequence.
print(tokenized_sentence)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['[CLS]', 'I', 'love', 'Large', 'Language', 'Models', '.', 'This', 'is', 'a', 'tutor', '##ial', 'on', 'token', '##ization', '.', '[SEP]']


## GPT-2
Tokenization Method: Byte Pair Encoding (BPE)

In [None]:
# Byte Pair Encoding demo with the GPT-2 tokenizer.
model_id = "gpt2"
tokenizer = load_tokenizer(model_id)  # rebinds the notebook-wide `tokenizer`
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
# In the output below 'Ġ' prefixes every token that followed a space in the
# input, and no start/end marker tokens appear around the sequence.
print(tokenized_sentence)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

['I', 'Ġlove', 'ĠLarge', 'ĠLanguage', 'ĠModels', '.', 'ĠThis', 'Ġis', 'Ġa', 'Ġtutorial', 'Ġon', 'Ġtoken', 'ization', '.']


## Flan-T5
Tokenization Method: SentencePiece

In [None]:
# SentencePiece demo with the Flan-T5 (small) tokenizer.
model_id = "google/flan-t5-small"
tokenizer = load_tokenizer(model_id)  # rebinds the notebook-wide `tokenizer`
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
# In the output below '▁' marks the start of a new word (including the first
# one), some words are split into several pieces, and '</s>' is appended at the end.
print(tokenized_sentence)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

['▁I', '▁love', '▁Large', '▁Language', '▁Model', 's', '.', '▁This', '▁is', '▁', 'a', '▁tutorial', '▁on', '▁token', 'ization', '.', '</s>']
