<a href="https://colab.research.google.com/github/Ashish265/hands_on_Large_Language_models/blob/main/Chapter_2_Tokens_and_Token_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map='cuda',
    dtype='auto',
    trust_remote_code=False)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

In [2]:
from transformers import pipeline
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>"

# Tokenize the input prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=20
)

# Print the output
print(tokenizer.decode(generation_output[0]))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|> Subject: Heartfelt Apologies for the Gardening Mishap


Dear


In [3]:
print(input_ids)

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001]], device='cuda:0')


In [4]:
for id in input_ids[0]:
   print(tokenizer.decode(id))

Write
an
email
apolog
izing
to
Sarah
for
the
trag
ic
garden
ing
m
ish
ap
.
Exp
lain
how
it
happened
.
<|assistant|>


In [5]:
generation_output

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363,   278, 25305,
           293, 16423,   292,   286,   728,   481, 29889, 12027,  7420,   920,
           372,  9559, 29889, 32001,  3323,   622, 29901, 17778, 29888,  2152,
          6225, 11763,   363,   278, 19906,   292,   341,   728,   481,    13,
            13,    13, 29928,   799]], device='cuda:0')

In [6]:
print(tokenizer.decode(3323))
print(tokenizer.decode(622))
print(tokenizer.decode([3323, 622]))
print(tokenizer.decode(29901))

Sub
ject
Subject
:


## Comparing Trained LLM Tokenizers

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

colors_list = [
'102;194;165', '252;141;98', '141;160;203',
'231;138;195', '166;216;84', '255;217;47'
]


def show_tokens(sentence, tokenizer_name):

  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
  token_ids = tokenizer(sentence).input_ids
  for idx, t in enumerate(token_ids):
    print(
        f'\x1b[0;30;48;2;{colors_list[idx % len(colors_list)]}m' +
        tokenizer.decode(t) +
        '\x1b[0m',
        end=' '
    )

In [8]:
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""

In [None]:

show_tokens(text, "bert-base-cased")

In [None]:
show_tokens(text, "gpt2")

In [None]:
show_tokens(text, "google/flan-t5-small")


In [None]:
# The official is `tiktoken` but this the same tokenizer on the HF platform
show_tokens(text, "Xenova/gpt-4")

In [None]:
show_tokens(text, "bigcode/starcoder2-15b")

In [None]:
show_tokens(text, "facebook/galactica-1.3b")

In [None]:
show_tokens(text, "microsoft/Phi-3-mini-4k-instruct")

# Contextualized Word Embeddings From a Language Model (Like BERT)

In [None]:
from transformers import AutoModel, AutoTokenizer

# load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# load a language model
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

# tokenize the sentence
tokens = tokenizer('Hello world', return_tensors='pt')

# process the tokens
output = model(**tokens)[0]

In [17]:
output.shape

torch.Size([1, 4, 384])

In [19]:
for token in tokens['input_ids'][0]:
  print(tokenizer.decode(token))

[CLS]
Hello
 world
[SEP]


In [20]:
output

tensor([[[-3.4816,  0.0861, -0.1819,  ..., -0.0612, -0.3911,  0.3017],
         [ 0.1898,  0.3208, -0.2315,  ...,  0.3714,  0.2478,  0.8048],
         [ 0.2071,  0.5036, -0.0485,  ...,  1.2175, -0.2292,  0.8582],
         [-3.4278,  0.0645, -0.1427,  ...,  0.0658, -0.4367,  0.3834]]],
       grad_fn=<NativeLayerNormBackward0>)

# Text Embeddings (For Sentences and Whole Documents)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

vector = model.encode('Best movie ever!')

In [23]:
vector.shape

(768,)

# Word Embeddings Beyond LLMs

In [25]:
!pip install -q gensim

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:
import gensim.downloader as api

# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50)
# Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data
model = api.load("glove-wiki-gigaword-50")



In [27]:
model.most_similar([model['king']], topn=11)

[('king', 1.0000001192092896),
 ('prince', 0.8236179351806641),
 ('queen', 0.7839043140411377),
 ('ii', 0.7746230363845825),
 ('emperor', 0.7736247777938843),
 ('son', 0.766719400882721),
 ('uncle', 0.7627150416374207),
 ('kingdom', 0.7542161345481873),
 ('throne', 0.7539914846420288),
 ('brother', 0.7492411136627197),
 ('ruler', 0.7434253692626953)]

# Recommending songs by embeddings

In [28]:
import pandas as pd
from urllib import request

data = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt')
lines = data.read().decode("utf-8").split('\n')[2:]

playlists = [s.rstrip().split() for s in lines if len(s.split())>1]

# Load song metadata
songs_file = request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt')
songs_file = songs_file.read().decode("utf-8").split('\n')
songs = [s.rstrip().split('\t') for s in songs_file]
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

In [29]:
print( 'Playlist #1:\n ', playlists[0], '\n')
print( 'Playlist #2:\n ', playlists[1])

Playlist #1:
  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'] 

Playlist #2:
  ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117',

In [30]:
from gensim.models import Word2Vec

# Train our Word2Vec model
model = Word2Vec(
    playlists, vector_size=32, window=20, negative=50, min_count=1, workers=4
)

In [31]:
song_id = 2172

# Ask the model for songs similar to song #2172
model.wv.most_similar(positive=str(song_id))

[('1922', 0.9980372786521912),
 ('11473', 0.9977210164070129),
 ('6626', 0.9965317845344543),
 ('2104', 0.9953141212463379),
 ('6641', 0.9950987100601196),
 ('5634', 0.9948092103004456),
 ('5586', 0.9946373105049133),
 ('2014', 0.9943349361419678),
 ('6658', 0.9936967492103577),
 ('5549', 0.9934738278388977)]

In [32]:
print(songs_df.iloc[2172])

title     Fade To Black
artist        Metallica
Name: 2172 , dtype: object


In [None]:
import numpy as np

def print_recommendations(song_id):
    similar_songs = np.array(
        model.wv.most_similar(positive=str(song_id),topn=5)
    )[:,0]
    return  songs_df.iloc[similar_songs]

# Extract recommendations
print_recommendations(2172)

In [None]:
print_recommendations(2172)

In [None]:
print_recommendations(842)