In [1]:
import torch
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
import pandas as pd
from collections import Counter
from copy import deepcopy
import re
from typing import Callable, Iterable, Optional

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SpecialToken:
    """
    A special token for tokenizers
    """

    def __init__(self, string: str):
        self.string = string.upper()

    def __repr__(self):
        return f"<{self.string}>"
    
    def __eq__(self, other) -> bool:
        if isinstance(other, SpecialToken):
            return self.string == other.string
        else:
            return False
        
    def __hash__(self):
        return hash(self.string)


class Tokenizer:
    """
    A simple word tokenizer
    """

    word_pattern = re.compile(R"\w+|\d+|[^\w\d\s]")

    def __repr__(self):
        return f"Tokenizer({len(self.vocabulary)} tokens)"

    def __init__(self, corpus: Iterable[str], min_frequency: float = 1.0E-6):
        words = [word for document in corpus for word in self._split(document)]
        word_counts = Counter(words)
        word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        self.vocabulary = [k for k, v in word_counts if v/len(words) >= min_frequency] + [SpecialToken("UNKNOWN"), SpecialToken("END"), SpecialToken("PAD")]
        self.map = {word: i for i, word in enumerate(self.vocabulary)}

    def split(self, document: str) -> list[str]:
        return self.decode(self.encode(document))

    def encode(self, document: str) -> list[int]:
        return [self.map.get(word, self.UNKNOWN) for word in self._split(document)]

    def decode(self, encoded: list[int]) -> list[str]:
        return [i.string if isinstance(i, SpecialToken) else self.vocabulary[i] for i in encoded]

    def _split(self, document: str) -> list[str]:
        return self.word_pattern.findall(document)

    @property
    def PAD(self) -> SpecialToken:
        return self.map[SpecialToken("PAD")]
    
    @property
    def END(self) -> SpecialToken:
        return self.map[SpecialToken("END")]
    
    @property
    def UNKNOWN(self) -> SpecialToken:
        return self.map[SpecialToken("UNKNOWN")]


In [3]:
df = pd.read_csv("../datasets/Twitter_US_Airline_Sentiment.csv")
labels = ['negative', 'neutral', 'positive']
df.columns

Index(['text', 'airline_sentiment'], dtype='object')

In [4]:
tokenizer = Tokenizer(df.text, min_frequency=1.0E-5)
print(tokenizer)

Tokenizer(4314 tokens)


In [5]:
for _, row in df.sample(n=10).iterrows():
    print(row.text)
    print(row.airline_sentiment)
    print(tokenizer.split(row.text))
    print()


@JetBlue Complete waste of an entire day. Pathetic.
negative
['@', 'JetBlue', <UNKNOWN>, 'waste', 'of', 'an', 'entire', 'day', '.', 'Pathetic', '.']

@united you Cancelled Flight my flight. I wait in line to get rebooked, when I'm at the front you make me go to another gate and I lose my place.
negative
['@', 'united', 'you', 'Cancelled', 'Flight', 'my', 'flight', '.', 'I', 'wait', 'in', 'line', 'to', 'get', 'rebooked', ',', 'when', 'I', "'", 'm', 'at', 'the', 'front', 'you', 'make', 'me', 'go', 'to', 'another', 'gate', 'and', 'I', 'lose', 'my', 'place', '.']

@SouthwestAir bumped me to preboard on both flights (because I'm fat?) Whatever the reason, thanks!
positive
['@', 'SouthwestAir', 'bumped', 'me', 'to', 'preboard', 'on', 'both', 'flights', '(', 'because', 'I', "'", 'm', 'fat', '?', ')', <UNKNOWN>, 'the', 'reason', ',', 'thanks', '!']

@SouthwestAir weather where? And at what time Cancelled Flighted? No I can't because meeting was today.
negative
['@', 'SouthwestAir', 'weather', 

In [6]:
df_train = df.sample(frac=0.7)
df = df.drop(index=df_train.index)
df_val = df.sample(frac=0.5)
df_test = df.drop(index=df_val.index)

In [7]:
def accuracy(predicted: torch.Tensor, target: torch.Tensor):
    assert target.shape == predicted.shape
    assert target.dtype == torch.long
    assert predicted.dtype == torch.long
    with torch.no_grad():
        return torch.mean((predicted == target).float()).cpu().item()


def input_to_tensor(df: pd.DataFrame, tokenizer: Tokenizer) -> torch.Tensor:
    encoded = [tokenizer.encode(document) for document in df['text']]
    L = max([len(doc) for doc in encoded])
    encoded = [doc + [tokenizer.PAD]*(L - len(doc)) for doc in encoded]
    return torch.tensor(encoded, dtype=torch.long)


def target_to_tensor(df: pd.DataFrame) -> torch.Tensor:
    map = {k: v for v, k in enumerate(labels)}
    return torch.tensor([map[label] for label in df["airline_sentiment"]], dtype=torch.long)


def data_to_tensor(df: pd.DataFrame, tokenizer: Tokenizer) -> tuple[torch.Tensor]:
    return (input_to_tensor(df, tokenizer), target_to_tensor(df))

In [8]:
class Batchifyer:

    def __init__(self, df: pd.DataFrame, tokenizer: Tokenizer, n_batches: int, batch_size: Optional[int]):
        self.df = df
        self.tokenizer = tokenizer
        self.n_batches = n_batches
        self.batch_size = batch_size
    
    def __iter__(self):
        shuffled = df.sample(frac=1.)
        return (self._batch(shuffled, i) for i in range(self.n_batches))
    
    def _batch(self, shuffled: pd.DataFrame, i: int) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = self.batch_size or len(shuffled) // self.n_batches
        subset = shuffled.iloc[i*batch_size:(i+1)*batch_size]
        return data_to_tensor(subset, self.tokenizer)

In [9]:
def train_loop(model: torch.nn.Module, optimizer: torch.optim.Optimizer, train_data: Iterable[tuple[torch.Tensor]], val_data: Iterable[tuple[torch.Tensor]], n_steps: int = 1000, patience: int = 100, keep_best: bool = True):
    """
    train the model for the specified number of steps, or untilearly stopping
    """
    best_state = deepcopy(model.state_dict())
    best_step = 0
    best_metric = 0.
    try:
        for step in range(n_steps):
            optimizer.zero_grad()
            # train loss
            model.train()
            losses = []
            for x, y in train_data:
                loss = model.loss(x, y)
                loss.backward()
                losses.append(loss.item())
            loss = sum(losses)/len(losses)
            # val metric
            model.eval()
            metrics = []
            for x, y in val_data:
                metrics.append(model.metric(x, y))
            metric = sum(metrics) / len(metrics)
            # checkpointing
            if metric > best_metric:
                best_metric = metric
                best_step = step
                if keep_best:
                    best_state = deepcopy(model.state_dict())
            elif step - best_step > patience:
                print("early stoping")
                break
            # optimizer steping
            optimizer.step()
            # printing
            print(f"Step {step}: loss = {loss:.3g} metric = {metric:.2%}")

    except KeyboardInterrupt:
        print("interrupted by user")
    if keep_best:
        model.load_state_dict(best_state)

## Exercice 1

Implémenter et entraîner un réseau récurrent pour classifier les tweets

In [10]:
class RNN(torch.nn.Module):
    
    def __init__(self, n_classes: int, tokenizer: Tokenizer, in_features: int, hidden_state_features: int, activation: Callable = torch.relu):
        super().__init__()
        self.tokenizer = tokenizer
        self.hidden_state_features = hidden_state_features
        self.embedding = torch.nn.Embedding(len(tokenizer.vocabulary), in_features)
        self.linear = torch.nn.Linear(in_features + hidden_state_features, in_features + hidden_state_features)
        self.activation = activation
        self.contract = torch.nn.Linear(in_features + hidden_state_features, hidden_state_features)
        self.normalization = torch.nn.LayerNorm(hidden_state_features)
        self.output = torch.nn.Linear(hidden_state_features, n_classes)
    
    def forward(self, X):
        """
        Parameters
        ----------

        X : torch.Tensor
            tensor of long of shape (N, L)
        """
        X = X.to(self.device)
        N, L = X.shape
        H = torch.zeros((N, self.hidden_state_features), dtype=torch.float32, device=X.device)
        for x in X.transpose(0, 1):
            I = self.embedding(x)
            T = torch.cat([I, H], dim=1)
            T = self.linear(T)
            T = self.activation(T)
            T = self.contract(T)
            H = torch.where(x.unsqueeze(1) == self.tokenizer.PAD, H, T)
        return self.output(H)
    
    def predict(self, X: torch.Tensor) -> torch.Tensor:
        self.eval()
        with torch.no_grad():
            Y = self(X)
        return Y.max(dim=1).indices
    
    def loss(self, X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
        y_pred = self(X)
        return F.cross_entropy(y_pred, Y.to(y_pred.device))

    def metric(self, X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
        y_pred = self.predict(X)
        return accuracy(y_pred, Y.to(y_pred.device))

    @property
    def device(self) -> torch.device:
        return self.output.weight.device

In [11]:
model = RNN(len(labels), tokenizer, 100, 100)
model.to("cuda:0")
optimizer = torch.optim.Adam(model.parameters(), lr=1.0E-4)
train = Batchifyer(df_train, tokenizer, n_batches=1, batch_size=None)
val = Batchifyer(df_val, tokenizer, n_batches=1, batch_size=None)
train_loop(model, optimizer, train, val, n_steps=1000, patience=100)

Step 0: loss = 1.12 metric = 21.08%
Step 1: loss = 1.11 metric = 22.97%
Step 2: loss = 1.1 metric = 24.20%
Step 3: loss = 1.1 metric = 30.97%
Step 4: loss = 1.09 metric = 36.77%
Step 5: loss = 1.08 metric = 50.55%
Step 6: loss = 1.07 metric = 52.60%
Step 7: loss = 1.06 metric = 54.37%
Step 8: loss = 1.06 metric = 56.42%
Step 9: loss = 1.05 metric = 57.45%
Step 10: loss = 1.04 metric = 58.79%
Step 11: loss = 1.03 metric = 60.25%
Step 12: loss = 1.03 metric = 61.04%
Step 13: loss = 1.02 metric = 61.48%
Step 14: loss = 1.01 metric = 61.75%
Step 15: loss = 1.01 metric = 61.93%
Step 16: loss = 0.999 metric = 62.04%
Step 17: loss = 0.992 metric = 62.18%
Step 18: loss = 0.986 metric = 62.20%
Step 19: loss = 0.98 metric = 62.23%
Step 20: loss = 0.974 metric = 62.23%
Step 21: loss = 0.968 metric = 62.23%
Step 22: loss = 0.963 metric = 62.25%
Step 23: loss = 0.957 metric = 62.25%
Step 24: loss = 0.952 metric = 62.25%
Step 25: loss = 0.947 metric = 62.25%
Step 26: loss = 0.942 metric = 62.25%
Ste

In [12]:
X, Y = data_to_tensor(df_test, tokenizer)
y_pred = model.predict(X)
acc = accuracy(y_pred, Y.to(y_pred.device))
print(f"accuracy {acc:.3%}")

accuracy 97.222%


## Exercice II

Programmer un modèle type encodeur de transformeur pour classifier les tweets

In [13]:
class AttentionBlock(torch.nn.Module):

    def __init__(self, projection_dim: int, n_heads: int, activation: Callable):
        super().__init__()
        self.projection_dim = projection_dim
        self.n_heads = n_heads
        D = projection_dim * n_heads
        self.q = torch.nn.Linear(D, D, bias=False)
        self.k = torch.nn.Linear(D, D, bias=False)
        self.v = torch.nn.Linear(D, D, bias=False)
        self.intermediate_norm = torch.nn.LayerNorm(D)
        self.expand = torch.nn.Linear(D, 4*D)
        self.activation = activation
        self.contract = torch.nn.Linear(4*D, D)
        self.out_norm = torch.nn.LayerNorm(D)
    
    def forward(self, X: torch.Tensor, mask: torch.Tensor):
        """
        Parameters
        ----------
        X : torch.Tensor
            tensor of shape (N, L, D)
        mask : torch.Tensor
            tensor of boooleans of shape (N, L, L)
        """
        input = X
        N, L, D = X.shape
        X = X.reshape(-1, D)
        Q = self.q(X)
        Q = Q.reshape(N, L, self.n_heads, self.projection_dim).permute(0, 2, 1, 3)
        K = self.k(X).reshape(N, L, self.n_heads, self.projection_dim).permute(0, 2, 1, 3)
        V = self.v(X).reshape(N, L, self.n_heads, self.projection_dim).permute(0, 2, 1, 3)
        S = torch.einsum("nhld, nhkd -> nhlk", Q, K)
        S = torch.masked_fill(S, mask.unsqueeze(1), -float("inf"))
        S = torch.softmax(S, dim=-1)
        S = torch.masked_fill(S, mask.unsqueeze(1), 0)
        X = (S @ V).permute(0, 2, 1, 3).reshape(N, L, D)
        X = self.intermediate_norm((X + input).reshape(-1, D)).reshape(N, L, D)
        intermediate = X
        X = self.expand(X)
        X = self.activation(X)
        X = self.contract(X)
        X = self.out_norm((X + intermediate).reshape(-1, D)).reshape(N, L, D)
        return X

In [14]:
class Transformer(torch.nn.Module):

    def __init__(self, n_classes: int, tokenizer: Tokenizer, n_stages: int, projection_dim: int, n_heads: int, activation: Callable = torch.relu):
        super().__init__()
        self.tokenizer = tokenizer
        self.embedding = torch.nn.Embedding(len(tokenizer.vocabulary), n_heads*projection_dim)
        self.stages = torch.nn.ModuleList()
        for _ in range(n_stages):
            self.stages.append(AttentionBlock(projection_dim, n_heads, activation))
        self.output = torch.nn.Linear(projection_dim * n_heads, n_classes)
    
    def forward(self, X):
        """
        Parameters
        ----------
        X : torch.Tensor
            tensor of shape (N, L)
        """
        X = X.to(self.device)
        mask = (X == self.tokenizer.PAD)
        mask = (mask.unsqueeze(1) | mask.unsqueeze(2))
        X = self.embedding(X)
        for stage in self.stages:
            if self.training:
                X = checkpoint(stage, X, mask)
            else:
                X = stage(X, mask)
        return self.output(X.mean(dim=1))
    
    def predict(self, X: torch.Tensor) -> torch.Tensor:
        self.eval()
        with torch.no_grad():
            Y = self(X)
        return Y.max(dim=1).indices
    
    def loss(self, X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
        y_pred = self(X)
        return F.cross_entropy(y_pred, Y.to(y_pred.device))

    def metric(self, X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:
        y_pred = self.predict(X)
        return accuracy(y_pred, Y.to(y_pred.device))

    @property
    def device(self) -> torch.device:
        return self.output.weight.device

In [15]:
model = Transformer(len(labels), tokenizer, 4, 16, 8)
model.to("cuda:0")
optimizer = torch.optim.Adam(model.parameters(), lr=1.0E-3)
train = Batchifyer(df_train, tokenizer, n_batches=1, batch_size=None)
val = Batchifyer(df_val, tokenizer, n_batches=1, batch_size=None)
train_loop(model, optimizer, train, val, n_steps=1000, patience=100)

Step 0: loss = 1.19 metric = 17.12%
Step 1: loss = 1.46 metric = 62.25%
Step 2: loss = 1.34 metric = 62.25%
Step 3: loss = 1.11 metric = 62.25%
Step 4: loss = 1.06 metric = 62.32%
Step 5: loss = 0.974 metric = 63.71%
Step 6: loss = 0.909 metric = 62.25%
Step 7: loss = 0.938 metric = 62.25%
Step 8: loss = 0.959 metric = 62.25%
Step 9: loss = 0.942 metric = 62.25%
Step 10: loss = 0.914 metric = 62.25%
Step 11: loss = 0.891 metric = 62.25%
Step 12: loss = 0.875 metric = 62.25%
Step 13: loss = 0.868 metric = 62.25%
Step 14: loss = 0.871 metric = 63.37%
Step 15: loss = 0.876 metric = 63.68%
Step 16: loss = 0.877 metric = 63.84%
Step 17: loss = 0.868 metric = 63.93%
Step 18: loss = 0.853 metric = 63.64%
Step 19: loss = 0.838 metric = 63.78%
Step 20: loss = 0.827 metric = 63.46%
Step 21: loss = 0.82 metric = 63.55%
Step 22: loss = 0.815 metric = 63.68%
Step 23: loss = 0.81 metric = 65.05%
Step 24: loss = 0.804 metric = 66.07%
Step 25: loss = 0.797 metric = 67.40%
Step 26: loss = 0.787 metric 

In [16]:
X, Y = data_to_tensor(df_test, tokenizer)
y_pred = model.predict(X)
acc = accuracy(y_pred, Y.to(y_pred.device))
print(f"accuracy {acc:.3%}")

accuracy 99.863%
