### Statistical Learning for Data Science 2 (229352)
#### Instructor: Donlapark Ponnoprat

#### [Course website](https://donlapark.pages.dev/229352/)

## Lab #12

### Note: Debug with CPU first, then run the whole notebook with GPU

In [None]:
!pip install pythainlp
!wget http://www.donlapark.cmustat.com/229352/thai_lyrics.tar.xz
!tar xf thai_lyrics.tar.xz

--2024-09-26 11:14:21--  http://www.donlapark.cmustat.com/229352/thai_lyrics.tar.xz
Resolving www.donlapark.cmustat.com (www.donlapark.cmustat.com)... 150.107.31.67
Connecting to www.donlapark.cmustat.com (www.donlapark.cmustat.com)|150.107.31.67|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6672288 (6.4M) [application/x-xz]
Saving to: ‘thai_lyrics.tar.xz.1’


2024-09-26 11:14:24 (2.54 MB/s) - ‘thai_lyrics.tar.xz.1’ saved [6672288/6672288]



In [None]:
from collections import Counter
import csv
from itertools import chain
import numpy as np
import pandas as pd

import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from pythainlp import word_tokenize

# GPT for song lyrics generation

In [None]:
# Load the lyrics table (presumably the file extracted from thai_lyrics.tar.xz above),
# explicitly requesting pandas' Python parsing engine.
df = pd.read_csv('thai_lyrics.csv', engine='python')
# Display the last rows as a quick sanity check.
df.tail()

Unnamed: 0.1,Unnamed: 0,url,soup,song_title,artist_name,n_views,lyrics
4908,4908,https://www.siamzone.com/music/thailyric/28644,"<!DOCTYPE HTML>\n\n<html lang=""th"">\n<head>\n<...",ได้ยินความรักบ้างไหม Can You Hear My Heart?,พี.โอ.พี P.O.P,ดู 83 ครั้ง / แชร์,วันเวลาเหมือนเพิ่งผ่านมาเพียงไม่นาน\nความทรงจำ...
4909,4909,https://www.siamzone.com/music/thailyric/28645,"<!DOCTYPE HTML>\n\n<html lang=""th"">\n<head>\n<...",แยกย้ายกันไปเติบโต,ช้างดาบิ๊กบอย Changdabigboy,ดู 84 ครั้ง / แชร์,ถ้าเกิดวันนึงเราต้องแยกย้ายกันไปเติบโต\nฉันจะอ...
4910,4910,https://www.siamzone.com/music/thailyric/28646,"<!DOCTYPE HTML>\n\n<html lang=""th"">\n<head>\n<...",ลองใจดู (LongJaiDoo) (Ost. YStar Challenge),โอมม่อน ภพภัทร เลิศประดิษฐ์ Omnmond,ดู 112 ครั้ง / แชร์,เธอมาจากไหน ยิ่งใกล้ ยิ่งใจสั่น\nคือคนในฝัน ที...
4911,4911,https://www.siamzone.com/music/thailyric/28647,"<!DOCTYPE HTML>\n\n<html lang=""th"">\n<head>\n<...",เปลืองพื้นที่,นิ้ง อังคณา,ดู 245 ครั้ง / แชร์,อยู่ไปกะเปลืองพื้นที่ Memory เขาบ่อยากมีเฮา\nก...
4912,4912,https://www.siamzone.com/music/thailyric/28648,"<!DOCTYPE HTML>\n\n<html lang=""th"">\n<head>\n<...",ตรงนี้ตลอดไป,โน วัน เอลส์ No One Else,ดู 223 ครั้ง / แชร์,รู้ตัวดีว่าไม่คู่ควรกับใครคนไหน\nไม่ใช่คนที่ดี...


### Set hyperparameters

In [None]:
'''# Hyperparameters
LEARNING_RATE = 0.0003
BATCH_SIZE = 12  # 128 for GPU
NUM_EPOCHS = 5
max_len = 64  # 128 for GPU  # Max sequence length
d_model = 32  # 128 for GPU     # Model dimensionality
num_heads = 4       # Number of attention heads
num_layers = 4  # 6 for GPU  # Number of transformer blocks
hidden_dim = 128  # 512 for GPU  # Hidden dimension in feedforward network
dropout_rate = 0.1 # ในการปรับพรารมิเตอร์แต่ละครั้งโนด 10% จะถูก drop ไป
device = 'cuda' if torch.cuda.is_available() else 'cpu'''

"# Hyperparameters\nLEARNING_RATE = 0.0003\nBATCH_SIZE = 12  # 128 for GPU\nNUM_EPOCHS = 5\nmax_len = 64  # 128 for GPU  # Max sequence length\nd_model = 32  # 128 for GPU     # Model dimensionality\nnum_heads = 4       # Number of attention heads\nnum_layers = 4  # 6 for GPU  # Number of transformer blocks\nhidden_dim = 128  # 512 for GPU  # Hidden dimension in feedforward network\ndropout_rate = 0.1 # ในการปรับพรารมิเตอร์แต่ละครั้งโนด 10% จะถูก drop ไป\ndevice = 'cuda' if torch.cuda.is_available() else 'cpu"

* Running with GPU

In [None]:
# Hyperparameters (GPU configuration)
LEARNING_RATE = 0.0003
BATCH_SIZE = 128  # 128 for GPU
NUM_EPOCHS = 10
max_len = 128  # 128 for GPU  # Max sequence length
d_model = 128  # 128 for GPU     # Model dimensionality
num_heads = 4       # Number of attention heads
num_layers = 6  # 6 for GPU  # Number of transformer blocks
hidden_dim = 512  # 512 for GPU  # Hidden dimension in feedforward network
dropout_rate = 0.1 # Dropout probability: 10% of the units are dropped at each training step
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Convert from words to numbers

In [None]:
# Example: [[song, number, one], [song, number, two]] -> [song, number, one, song, number, two]
def flatten(ls):
    """
    Concatenate the inner iterables of ``ls`` into a single flat list.
    """
    flat = []
    for inner in ls:
        flat.extend(inner)
    return flat

# Example: [song, number, one, number, two] -> {number: 0, song: 1, one: 2, two: 3}
# together with the inverse mapping {0: number, 1: song, 2: one, 3: two}.
def create_lookup_dict(tokenized_lyrics, n_min=None):
    """
    Create lookup dictionaries from a list of words (lyrics).

    Words are numbered from 0 in order of decreasing frequency; words with the
    same count keep their order of first appearance.

    Args:
        tokenized_lyrics: Iterable of hashable tokens.
        n_min: If given, only tokens occurring at least ``n_min`` times are
            kept. The remaining tokens are still numbered by frequency rank.

    Returns:
        Tuple ``(vocab_to_int, int_to_vocab)`` of two dicts that are inverses
        of each other.
    """
    word_counts = Counter(tokenized_lyrics)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    if n_min is not None:
        # Filter the already-sorted list so the frequency ordering is preserved
        # (rebuilding from word_counts.items() would fall back to insertion order).
        sorted_vocab = [word for word in sorted_vocab if word_counts[word] >= n_min]
    vocab_to_int = {word: i for i, word in enumerate(sorted_vocab)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)

In [None]:
'''# เวลา Debug ใช้ 10 เพลงก็แล้วกัน
df = df.iloc[:10, :]  # df.iloc[:1000, :] for GPU หรือจะ 4000 เพลงทั้งหมดก็ได้
tokenized_lyrics = df['lyrics'].map(word_tokenize)
tokenized_lyrics = flatten(tokenized_lyrics)
tokenized_lyrics = [token if token != '\n' else ' ' for token in tokenized_lyrics]
word_counts = Counter(tokenized_lyrics)
vocab_to_int, int_to_vocab = create_lookup_dict(tokenized_lyrics, n_min=None)
vocab_size = len(vocab_to_int)  # number of words in lyrics corpus'''

"# เวลา Debug ใช้ 10 เพลงก็แล้วกัน\ndf = df.iloc[:10, :]  # df.iloc[:1000, :] for GPU หรือจะ 4000 เพลงทั้งหมดก็ได้\ntokenized_lyrics = df['lyrics'].map(word_tokenize)\ntokenized_lyrics = flatten(tokenized_lyrics)\ntokenized_lyrics = [token if token != '\n' else ' ' for token in tokenized_lyrics]\nword_counts = Counter(tokenized_lyrics)\nvocab_to_int, int_to_vocab = create_lookup_dict(tokenized_lyrics, n_min=None)\nvocab_size = len(vocab_to_int)  # number of words in lyrics corpus"

In [None]:
'''print(vocab_size) # จำนวนคำทั้งหมดของ 10 เพลงแรก'''

'print(vocab_size) # จำนวนคำทั้งหมดของ 10 เพลงแรก'

* Running with GPU ใช้เพลง 1000 เพลง

In [None]:
# Restrict the corpus to the first 1000 songs (rebinds `df`).
df = df.iloc[:1000, :]
# Tokenize every song's lyrics with pythainlp's word_tokenize.
tokenized_lyrics = df['lyrics'].map(word_tokenize)
# Merge the per-song token sequences into one long token stream.
tokenized_lyrics = flatten(tokenized_lyrics)
# Replace newline tokens with a single-space token.
tokenized_lyrics = [token if token != '\n' else ' ' for token in tokenized_lyrics]
# Token frequencies over the whole stream.
word_counts = Counter(tokenized_lyrics)
# Token <-> integer id lookup tables, without a minimum-frequency filter.
vocab_to_int, int_to_vocab = create_lookup_dict(tokenized_lyrics, n_min=None)
vocab_size = len(vocab_to_int)  # number of distinct tokens in the lyrics corpus

In [None]:
print(vocab_size) # Vocabulary size (distinct tokens) of the first 1000 songs

11962


### Create features (windows of `max_len` consecutive tokens) and targets (the same windows shifted one token ahead)

In [None]:
# Map every token to its integer id (tokens missing from the vocabulary fall back to id 0).
tokenized_indices = [vocab_to_int.get(token, 0) for token in tokenized_lyrics]

# Slide a window of max_len tokens over the token stream, one token at a time.
X, target = [], []
for n in range(0, len(tokenized_indices) - max_len, 1):
  x = tokenized_indices[n: n + max_len] # input window: tokens n .. n + max_len - 1
  y = tokenized_indices[n + 1: n + max_len + 1]  # same window shifted one token ahead (output length = input length)
  X.append(np.array(x))
  target.append(y)
# Stack the collected windows into arrays (one row per window).
X = np.array(X)
target = np.array(target)

In [None]:
class MyDataSet(torch.utils.data.Dataset):
  """Dataset pairing row ``i`` of a feature array with row ``i`` of a target array."""

  def __init__(self, X, y):
    super().__init__()
    self._X, self._y = X, y

  def __len__(self):
    # The number of samples is the size of the first axis of the features.
    n_samples = self._X.shape[0]
    return n_samples

  def __getitem__(self, index):
    return self._X[index], self._y[index]

In [None]:
# Wrap the window arrays in a Dataset and serve them as shuffled mini-batches of BATCH_SIZE.
dataset = MyDataSet(X, target)

trainloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

## Transformer decoder

<center><img src="https://donlapark.pages.dev/229352/Full-GPT-arch.png" alt="GPT" width="700"/></center>

### Exercise: Fill in the code blocks marked with the `TODO` tag in order to complete the GPT model.

### Positional encoding

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Args:
        max_len: Largest sequence length the table is precomputed for.
        d_model: Embedding size (even or odd).

    The table is stored as the non-trainable buffer ``pe`` with shape
    ``(1, max_len, d_model)``.
    """

    def __init__(self, max_len, d_model):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)  # Apply sin to even indices (2i)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])  # Apply cos to odd indices (2i+1)
        self.register_buffer('pe', pe.unsqueeze(0))  # Shape: (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Fail with a clear message instead of an obscure broadcasting error
            # (or a silent broadcast when the table has a single position).
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]

### Self-Attention (`TODO:` Complete the self-attention block)

<center><img src="https://donlapark.pages.dev/229352/self-attention-matrix-calculation.png" alt="GPT" width="600"/></center>


#### Masked attention

<center><img src="https://donlapark.pages.dev/229352/masked-attention.png" alt="GPT" width="500"/></center>

In [None]:
T = 5

# Causal mask: 0 on and below the diagonal, -inf strictly above it,
# so each position can only attend to itself and earlier positions.
# Add this to your QKᵀ matrix *before* Softmax
torch.triu(torch.full((T, T), float("-inf")).to(device), diagonal=1)

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]], device='cuda:0')

In [None]:
class SelfAttention(nn.Module):
    """Multi-head causal (masked) self-attention.

    Args:
        d_model: Embedding size of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
        dropout_rate: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, dropout_rate):
        super(SelfAttention, self).__init__()
        if d_model % num_heads != 0:
            # Otherwise the head split in forward() fails with an opaque view() error.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads  # number of attention heads
        self.head_dim = d_model // num_heads  # dimensions handled by each head

        # Linear projections for query, key, and value
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.att_dropout = nn.Dropout(dropout_rate)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply masked multi-head self-attention.

        Args:
            x: Tensor of shape ``(B, T, C)`` = (batch, sequence length, d_model).

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        B, T, C = x.size()

        def split_heads(t):
            # (B, T, C) -> (B, num_heads, T, head_dim), with C = num_heads * head_dim
            return t.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        # Each projection is computed exactly once, then split into heads.
        q = split_heads(self.query(x))
        k = split_heads(self.key(x))
        v = split_heads(self.value(x))

        # 1. Q @ K^T: (B, num_heads, T, head_dim) @ (B, num_heads, head_dim, T) -> (B, num_heads, T, T)
        att = q @ k.transpose(-2, -1)
        # 2. Normalize with sqrt(head_dim)
        att = att / (self.head_dim ** 0.5)
        # 3. Add the causal mask (-inf strictly above the diagonal) *before* softmax,
        #    created on the same device as the input.
        mask = torch.triu(torch.full((T, T), float("-inf"), device=x.device), diagonal=1)
        att = att + mask
        # 4. Softmax over the key positions
        att = F.softmax(att, dim=-1)
        # 5. Dropout on the attention weights
        att = self.att_dropout(att)
        # 6. Weighted sum of values: (B, num_heads, T, T) @ (B, num_heads, T, head_dim) -> (B, num_heads, T, head_dim)
        att = att @ v
        # 7. Merge the heads back to (B, T, num_heads * head_dim); .contiguous() is
        #    required after the transpose so that view() can reshape the tensor.
        att = att.transpose(1, 2).contiguous().view(B, T, C)
        # 8. Final linear layer (d_model -> d_model)
        out = self.out_proj(att)

        return out

In [None]:
'''# ทดสอบว่าเราทำถูกหรือเปล่า สร้าง Random Tensor ขึ้นมา
att = SelfAttention(32, 4, 0.1)
att(torch.rand(9, 16, 32)) # Batch size จะเป็นเลขอะไรก็ได้ แต่ในที่นี่ลองใช้ 9, จำนวนคำในหนึ่งประโยค, จำนวนมิติของเวกเตอร์ของหนึ่งคำ (32 ตามที่โมเดลเรากำหนดไว้)'''

'# ทดสอบว่าเราทำถูกหรือเปล่า สร้าง Random Tensor ขึ้นมา\natt = SelfAttention(32, 4, 0.1)\natt(torch.rand(9, 16, 32)) # Batch size จะเป็นเลขอะไรก็ได้ แต่ในที่นี่ลองใช้ 9, จำนวนคำในหนึ่งประโยค, จำนวนมิติของเวกเตอร์ของหนึ่งคำ (32 ตามที่โมเดลเรากำหนดไว้)'

### Feedforward block

<center><img src="https://donlapark.pages.dev/229352/Feedforward-block.png" alt="GPT" width="400"/></center>

In [None]:
class FeedForward(nn.Module):
    """Two-layer feedforward network: Linear -> GELU -> Linear."""

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.fc1 = nn.Linear(in_features=d_model, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=d_model)
        self.gelu = nn.GELU()  # Activation function

    def forward(self, x):
        hidden = self.fc1(x)
        activated = self.gelu(hidden)
        return self.fc2(activated)

### Transformer Decoder block (`TODO:` Fill in code)

<center><img src="https://donlapark.pages.dev/229352/GPT-Decoder.png" alt="GPT" width="400"/></center>

In [None]:
class TransformerBlock(nn.Module):
    """One decoder block: self-attention, then feedforward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalized.
    """

    def __init__(self, d_model, num_heads, hidden_dim, dropout_rate):
        super().__init__()
        # Attention sub-layer
        self.attention = SelfAttention(d_model, num_heads, dropout_rate)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.ln1 = nn.LayerNorm(d_model)
        # Feedforward sub-layer
        self.feedforward = FeedForward(d_model, hidden_dim)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Self-attention -> dropout -> residual add -> LayerNorm
        attended = self.ln1(self.dropout1(self.attention(x)) + x)
        # Feedforward -> dropout -> residual add (on the normalized attention output) -> LayerNorm
        return self.ln2(self.dropout2(self.feedforward(attended)) + attended)

### GPT

<center><img src="https://donlapark.pages.dev/229352/GPT.png" alt="GPT" width="150"/></center>

**Note:** The `CrossEntropyLoss` requires output shape = `(batch_size, vocab_size, seq_length)`. Make sure that your output matches this shape!

In [None]:
class GPT(nn.Module):
    """Decoder-only Transformer language model.

    Pipeline: token embedding -> positional encoding -> dropout ->
    ``num_layers`` Transformer blocks -> final LayerNorm -> linear projection
    to vocabulary logits. No softmax is applied to the output.
    """

    def __init__(self, vocab_size, max_len, d_model, num_heads, num_layers, hidden_dim, dropout_rate):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(max_len, d_model)
        self.dropout = nn.Dropout(dropout_rate)
        stack = []
        for _ in range(num_layers):
            stack.append(TransformerBlock(d_model, num_heads, hidden_dim, dropout_rate))
        self.blocks = nn.ModuleList(stack)
        self.ln_f = nn.LayerNorm(d_model)  # Final layer normalization
        self.head = nn.Linear(d_model, vocab_size, bias=False)  # Output layer (vocab_size classes)

    def forward(self, x):
        # Embed the token ids, add position information, then apply dropout.
        hidden = self.token_embedding(x)
        hidden = self.positional_encoding(hidden)
        hidden = self.dropout(hidden)
        # Run through every Transformer block in order.
        for layer in self.blocks:
            hidden = layer(hidden)
        # Final normalization followed by the projection to vocabulary logits.
        logits = self.head(self.ln_f(hidden))
        # (batch_size, seq_length, vocab_size) -> (batch_size, vocab_size, seq_length),
        # the layout CrossEntropyLoss expects.
        return logits.permute(0, 2, 1)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the GPT from the notebook's hyperparameters and move it to the device.
model = GPT(
    vocab_size=vocab_size,
    max_len=max_len,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    hidden_dim=hidden_dim,
    dropout_rate=dropout_rate,
)
model = model.to(device)

# Cross-entropy over the vocabulary, optimized with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Exercise 2: fill in the code below

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` tensor batches exposing ``.dataset``.
        model: Module being trained; batches are moved to the device of its
            parameters.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer over ``model``'s parameters.

    Prints the current loss every 100 batches.
    """
    size = len(dataloader.dataset)
    # Follow the model's own device rather than relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    # Ensure training-mode layers (e.g. dropout) are active, even if the model
    # was switched to eval mode elsewhere.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to(model_device)
        y = y.to(model_device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")


def generate(model, start_word, pad_value=0, predict_len=200):
    """Sample ``predict_len`` new tokens from ``model``, starting from a prompt.

    Relies on the notebook-level ``vocab_to_int``, ``int_to_vocab`` and
    ``max_len``.

    Args:
        model: Language model returning logits of shape
            ``(batch, vocab_size, seq_length)``.
        start_word: Prompt text; it is tokenized with ``word_tokenize``.
        pad_value: Token id used to left-pad the prompt and as the fallback id
            for prompt tokens missing from the vocabulary.
        predict_len: Number of tokens to sample.

    Returns:
        The prompt tokens followed by the sampled tokens, joined into one string.
    """
    # Tokenize the input sentence; copy so the tokenizer's own list is not mutated.
    words = word_tokenize(start_word)
    predicted = list(words)

    # Words -> Integers
    word_ids = [vocab_to_int.get(word, pad_value) for word in words]

    # The model is fed a window of max_len - 1 tokens. Keep only the most recent
    # tokens of an over-long prompt so the pad width can never become negative.
    context_len = max_len - 1
    word_ids = word_ids[max(len(word_ids) - context_len, 0):]
    pad_len = context_len - len(word_ids)

    # Left-pad with pad_value, e.g. [28, 15] -> [pad, pad, 28, 15].
    # The pad widths are (before, after); the fill value goes in constant_values.
    current_seq = np.pad(
        np.asarray(word_ids, dtype=np.int64),
        (pad_len, 0),
        'constant',
        constant_values=pad_value,
    )[None, :]  # shape (1, context_len)

    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()  # disable dropout while sampling
    try:
        with torch.no_grad():
            for _ in range(predict_len):
                inputs = torch.from_numpy(current_seq).to(model_device)
                # Logits for the token that follows the last position.
                logits = model(inputs)[:, :, -1]
                p = F.softmax(logits, dim=1)[0].cpu().numpy()

                # Sample from probability distribution p
                word_i = int(np.random.choice(np.arange(0, p.shape[0]), p=p))
                predicted.append(int_to_vocab[word_i])

                # Slide the window left by one and append the sampled token.
                current_seq = np.roll(current_seq, -1, 1)
                current_seq[-1][-1] = word_i
    finally:
        # Restore whatever mode the caller had the model in.
        model.train(was_training)
    gen_sentences = ''.join(predicted)
    return gen_sentences

### Exercise 3: train for `NUM_EPOCHS` epochs and use the `generate` function to generate new text after each epoch.

In [None]:
# Pad prompts with the id of the single-space token.
pad_int = vocab_to_int[' ']

# After every training epoch, sample 200 tokens from a fixed prompt to follow progress.
for t in range(NUM_EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader, model, loss_fn, optimizer)
    with torch.no_grad():
        sample = generate(model, 'ฉันก็', pad_value=pad_int, predict_len=200)
        print(sample)
print("Done!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size([1, 4, 127, 32])
torch.Size(

## Extra: Using Transformers Library

Transformers documentation: https://huggingface.co/docs/transformers/index

### Sequence Classification

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline backed by the named DistilBERT checkpoint
# (fine-tuned on SST-2, per the model name).
classifier = pipeline(task="sentiment-analysis",
                      model="distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
# Classify one sentence; the cell displays whatever the pipeline returns.
classifier("I love to hate you")

### A closer look: Tokenization + Classification

#### Load tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same checkpoint as the pipeline above, loaded piece by piece.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

#### Tokenize

In [None]:
text = "I love you"

# Split the text into the tokenizer's tokens.
tokens = tokenizer.tokenize(text)

tokens

#### Convert tokens to ids

In [None]:
# Look up the vocabulary id of each token.
sentence = tokenizer.convert_tokens_to_ids(tokens)

sentence

#### Convert from sentence to ids directly

In [None]:
# Encode the raw text in one call, requesting PyTorch tensors ("pt").
# Note: this rebinds `sentence`, which previously held the id list from the cell above.
sentence = tokenizer(text,  return_tensors="pt")

sentence

#### Use the model to classify on the input ids

In [None]:
# Load the classification model for the same checkpoint.
# Note: this rebinds `model`, which earlier in the notebook referred to the GPT instance.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Unpack the encoded inputs as keyword arguments and show the raw class logits.
model(**sentence).logits