In [6]:
import torch
import pandas as pd
from collections import Counter
import re
from typing import Callable, Iterable

In [117]:
class SpecialToken:
    """
    A special token for tokenizers (for example UNKNOWN, END or PAD).

    The name is stored upper-cased, so ``SpecialToken("pad")`` and
    ``SpecialToken("PAD")`` denote the same token. Two instances compare equal
    and hash identically when their names match, which lets a freshly created
    token be used to look up a dictionary entry keyed by another instance.
    """

    def __init__(self, string: str):
        self.string = string.upper()

    def __repr__(self):
        return f"<{self.string}>"

    def __eq__(self, other) -> bool:
        # This method used to be spelled `__equal__`, a name Python never calls:
        # `==` silently fell back to identity while `__hash__` was value based,
        # so dictionary lookups with a new-but-equal token raised KeyError.
        if isinstance(other, SpecialToken):
            return self.string == other.string
        return False

    # Former (misspelled) name, kept so explicit callers keep working.
    __equal__ = __eq__

    def __hash__(self):
        # Must stay consistent with __eq__: equal names -> equal hashes.
        return hash(self.string)


class Tokenizer:
    """
    A simple word tokenizer.

    The vocabulary is built from a corpus: every word whose relative frequency
    is at least `min_frequency`, sorted by decreasing count, followed by the
    three special tokens UNKNOWN, END and PAD (in that order).

    Attributes
    ----------
    vocabulary : list
        the kept words (str) followed by the three SpecialToken objects
    map : dict
        maps each vocabulary entry to its integer id (its index in `vocabulary`)
    """

    # A token is a run of word characters, or a single non-space symbol.
    word_pattern = re.compile(R"\w+|\d+|[^\w\d\s]")

    def __repr__(self):
        return f"Tokenizer({len(self.vocabulary)} tokens)"

    def __init__(self, corpus: Iterable[str], min_frequency: float = 1.0E-6):
        """
        Parameters
        ----------
        corpus : Iterable[str]
            the documents used to build the vocabulary
        min_frequency : float
            minimum relative frequency (count / total number of words) a word
            needs to enter the vocabulary
        """
        words = [word for document in corpus for word in self._split(document)]
        total = len(words)
        ranked = sorted(Counter(words).items(), key=lambda item: item[1], reverse=True)
        # With an empty corpus `ranked` is empty, so the division never runs.
        kept = [word for word, count in ranked if count / total >= min_frequency]
        # Keep the very objects used as keys of `self.map`, so the id lookups
        # below succeed whatever equality semantics SpecialToken implements.
        self._unknown = SpecialToken("UNKNOWN")
        self._end = SpecialToken("END")
        self._pad = SpecialToken("PAD")
        self.vocabulary = kept + [self._unknown, self._end, self._pad]
        self.map = {token: index for index, token in enumerate(self.vocabulary)}

    def split(self, document: str) -> list[str]:
        """Tokenize `document`, replacing out-of-vocabulary words by 'UNKNOWN'."""
        return self.decode(self.encode(document))

    def encode(self, document: str) -> list[int]:
        """
        Convert a document to a list of integer token ids.

        Out-of-vocabulary words are mapped to the id of the UNKNOWN token
        (previously the SpecialToken object itself was returned, which cannot
        be stored in an integer tensor).
        """
        unknown_id = self.UNKNOWN
        return [self.map.get(word, unknown_id) for word in self._split(document)]

    def decode(self, encoded: list[int]) -> list[str]:
        """
        Convert token ids back to strings.

        Special tokens are rendered by their name (e.g. 'UNKNOWN'). For
        backward compatibility, SpecialToken objects are also accepted in
        place of ids.
        """
        decoded = []
        for item in encoded:
            token = item if isinstance(item, SpecialToken) else self.vocabulary[item]
            decoded.append(token.string if isinstance(token, SpecialToken) else token)
        return decoded

    def _split(self, document: str) -> list[str]:
        """Split a document into raw tokens with `word_pattern`."""
        return self.word_pattern.findall(document)

    @property
    def PAD(self) -> int:
        """Integer id of the PAD token."""
        return self.map[self._pad]

    @property
    def END(self) -> int:
        """Integer id of the END token."""
        return self.map[self._end]

    @property
    def UNKNOWN(self) -> int:
        """Integer id of the UNKNOWN token."""
        return self.map[self._unknown]


In [129]:
# Load the airline tweets; the cell output lists the columns 'text' and 'airline_sentiment'.
df = pd.read_csv("../datasets/Twitter_US_Airline_Sentiment.csv")
# Sentiment class names -- presumably the values found in `airline_sentiment`; TODO confirm against the data.
labels = ['negative', 'neutral', 'positive']
# Last expression of the cell: displays the column index.
df.columns

Index(['text', 'airline_sentiment'], dtype='object')

In [119]:
# Build the vocabulary from every tweet; words whose relative frequency is below 1e-5 are left out.
tokenizer = Tokenizer(df.text, min_frequency=1.0E-5)
# Prints the vocabulary size through Tokenizer.__repr__.
print(tokenizer)

Tokenizer(4314 tokens)


In [124]:
# Spot-check on 10 random tweets: print the raw text, then its tokens after an
# encode/decode round trip (Tokenizer.split), so out-of-vocabulary words show up as UNKNOWN.
for text in df.sample(n=10).text:
    print(text)
    print(tokenizer.split(text))
    print()


@USAirways how can i get ahold of a reservations supervisor?
['@', 'USAirways', 'how', 'can', 'i', 'get', 'ahold', 'of', 'a', 'reservations', 'supervisor', '?']

@VirginAmerica When will VX use all 6 LGA slots instead of 4 today? Adding AUS makes this less likely :(
['@', 'VirginAmerica', 'When', 'will', 'VX', 'use', 'all', '6', 'LGA', 'UNKNOWN', 'instead', 'of', '4', 'today', '?', 'UNKNOWN', 'AUS', 'makes', 'this', 'less', 'likely', ':', '(']

@AmericanAir that's 16+ extra hours of travel time. Missed vacation time and now you guys are messing with my professional life.
['@', 'AmericanAir', 'that', "'", 's', '16', '+', 'extra', 'hours', 'of', 'travel', 'time', '.', 'Missed', 'vacation', 'time', 'and', 'now', 'you', 'guys', 'are', 'messing', 'with', 'my', 'professional', 'life', '.']

@SouthwestAir Thanks I just sent a DM with this info.
['@', 'SouthwestAir', 'Thanks', 'I', 'just', 'sent', 'a', 'DM', 'with', 'this', 'info', '.']

@USAirways horrible travel day w/ your airlines. My 2:20

In [131]:
def accuracy(predicted: torch.Tensor, target: torch.Tensor):
    """
    Fraction of positions where `predicted` equals `target`.

    Parameters
    ----------
    predicted : torch.Tensor
        tensor of long holding the predicted class indices
    target : torch.Tensor
        tensor of long, same shape as `predicted`

    Returns
    -------
    float
        mean of the element-wise matches, as a Python float
    """
    assert target.shape == predicted.shape
    assert target.dtype == torch.long
    assert predicted.dtype == torch.long
    # Pure metric: no gradient tracking is needed.
    with torch.no_grad():
        matches = (predicted == target).float()
        score = matches.mean()
        return score.cpu().item()


def input_to_tensor(df: pd.DataFrame, tokenizer: "Tokenizer") -> torch.Tensor:
    """
    Encode the 'text' column of `df` as a padded tensor of token ids.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with a 'text' column of documents
    tokenizer : Tokenizer
        provides `encode(document)` (list of integer ids) and the `PAD` id

    Returns
    -------
    torch.Tensor
        tensor of long of shape (N, L), N being the number of documents and L
        the length of the longest encoded document; shorter documents are
        right-padded with `tokenizer.PAD`. An empty dataframe yields a tensor
        of shape (0, 0) instead of raising on `max` of an empty sequence.
    """
    encoded = [tokenizer.encode(document) for document in df['text']]
    width = max((len(doc) for doc in encoded), default=0)
    pad = tokenizer.PAD
    padded = [doc + [pad] * (width - len(doc)) for doc in encoded]
    # The reshape is a no-op for non-empty input; it only pins the (0, 0) shape
    # when there are no documents.
    return torch.tensor(padded, dtype=torch.long).reshape(len(padded), width)

def target_to_tensor():
    """
    Placeholder, not implemented yet: takes no argument and returns None.

    TODO: presumably meant to be the counterpart of `input_to_tensor` for the
    sentiment labels -- confirm the intended signature before implementing.
    """
    return None

## Exercice 1

Implémenter et entraîner un réseau de neurones récurrent (RNN) pour classifier les tweets selon leur sentiment (négatif, neutre ou positif).

In [None]:
class RNN(torch.nn.Module):
    
    def __init__(self, tokenizer: Tokenizer, in_features: int, hidden_state_features: int, activation: Callable = torch.relu):
        """
        Create the layers of the recurrent network.

        Parameters
        ----------
        tokenizer : Tokenizer
            its vocabulary size sets the number of rows of the embedding table
        in_features : int
            dimension of the token embeddings
        hidden_state_features : int
            dimension of the hidden state
        activation : Callable
            function stored as `self.activation` (defaults to `torch.relu`)
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.hidden_state_features = hidden_state_features
        vocabulary_size = len(tokenizer.vocabulary)
        # Width of a token embedding concatenated with a hidden state.
        joint_features = in_features + hidden_state_features
        self.embedding = torch.nn.Embedding(vocabulary_size, in_features)
        self.linear = torch.nn.Linear(joint_features, joint_features)
        self.activation = activation
        # NOTE(review): this layer outputs `joint_features` values while the hidden
        # state built in `forward` has `hidden_state_features` -- confirm the intended width.
        self.output = torch.nn.Linear(joint_features, joint_features)
        self.normalization = torch.nn.LayerNorm(hidden_state_features)
    
    def forward(self, X):
        """
        Parameters
        ----------

        X : torch.Tensor
            tensor of long of shape (N, L)
        H : torch.Tensor
            tensor of float of shape (N, D)
        """
        N, L = X.shape
        H = torch.zeros((N, self.hidden_state_features), dtype=torch.float32, device=X.device)
        for x in X.transpose(0, 1):
            I = self.embedding(x)
            T = torch.cat([I, H], dim=1)
            T = self.linear(T)
            T = self.activation(T)
            T = self.output(T)
            H = torch.where(x == self.tokenizer.PAD, H, T)