## Pytorch sur des séquences de texte

#### Import des librairies

In [18]:
import torchtext # pip install torchtext
import torch
import torch.nn as nn
import torch.optim as optim
#from torchtext.data import Field, TabularDataset, BucketIterator
from torchtext.legacy.data import Field, TabularDataset, BucketIterator

In [19]:
import spacy
spacy_en = spacy.load('en_core_web_trf')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

#### Définition des prétraitements sur le texte

In [20]:
TEXT = Field(sequential = True, lower = True, include_lengths = False,
            pad_token = "<pad>", unk_token = "<unk>",
            batch_first = True, tokenize = tokenizer)

LABELS = Field(sequential=False, use_vocab=False)

#### Tokenization

#### Création des datasets

In [21]:
train_dataset, test_dataset = TabularDataset.splits(
    path="./", format="csv", 
    train='toto2.csv', test='toto2.csv',
    skip_header = True,
    fields=[('text', TEXT), ('labels', LABELS)])

train_dataset[0].text

['will',
 'be',
 'at',
 'the',
 'london',
 '#',
 'microsoft',
 'partner',
 'business',
 'briefing',
 'tomorrow',
 '-',
 'see',
 'some',
 'of',
 'you',
 'there',
 ':)']

#### Gestion des batchs

In [22]:
device = torch.device('cpu')
train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset), batch_size=160,
    sort_key = lambda x: len(x.text), device=device,
    sort_within_batch = True, shuffle = True, repeat=False)


#### Gestion du vocabulaire et des word Embeddings

In [23]:
TEXT.build_vocab(train_dataset, min_freq=2, vectors = 'glove.6B.50d')
batch = next(iter(train_iter))

In [24]:
batch.text

tensor([[  17, 3277,    7,  ...,   61,    2,    8],
        [   2,    5,    2,  ...,  218, 1138, 3036],
        [1659, 2026, 2139,  ...,   38,    2,    6],
        ...,
        [ 274,   52,    8,  ...,    2,  432, 1605],
        [  62, 1640, 1672,  ...,    6,    2,   54],
        [  80,   29,  288,  ..., 3647,    2, 5135]])

#### Création du modèle

In [31]:
class LSTMModele(nn.Module):
    def __init__(self, embedding_dim=50):
        super(LSTMModele, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze = False)# une couche qui ne marche qu'avec les imports qui ne marchent pas...
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim, 4) # 2 car pos neg
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs) # pour faire le lien entre indice et vecteur du mot associé
        outputs, (h_n,c_n) = self.lstm(embeds)
        x = h_n[0]
        x = self.fc(x)
        return x

In [32]:
net = LSTMModele(embedding_dim = 50).to(device)

In [33]:
net

LSTMModele(
  (embeddings): Embedding(8916, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)

In [34]:
criterion = nn.CrossEntropyLoss()
import torch.optim as optim
optimizer = optim.Adam(net.parameters(), lr=0.01)

In [270]:
next(iter(train_iter))

AttributeError: 'Example' object has no attribute 'labels'

Puis code de BOUCLE d'APPRENTISSAGE + MESURE DES PERFORMANCES + accuracy_score IDENTIQUES

In [43]:
%%time

nb_epoch = 200
for epoch in range(nb_epoch):
    i = 0
    for batch in train_iter:
        print("debut, " + str(i))
        data = batch.text.to(device)
        labels = batch.labels.to(device)
        
        outputs = net(data)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print("fin, " + str(i))
        i = i + 1
        
    print ('epoch : ' + str(epoch))

print('Finished Training')

debut, 0
fin, 0
debut, 1
fin, 1
debut, 2
fin, 2
debut, 3
fin, 3
debut, 4
fin, 4
debut, 5
fin, 5
debut, 6
fin, 6
debut, 7
fin, 7
debut, 8
fin, 8
debut, 9
fin, 9
debut, 10
fin, 10
debut, 11
fin, 11
debut, 12
fin, 12
debut, 13
fin, 13
debut, 14
fin, 14
debut, 15
fin, 15
debut, 16
fin, 16
debut, 17
fin, 17
debut, 18
fin, 18
debut, 19
fin, 19
debut, 20
fin, 20
debut, 21
fin, 21
debut, 22
fin, 22


AttributeError: 'Example' object has no attribute 'labels'

#### Mesure des performances

In [None]:
import numpy as np
all_labels = []
all_preds = []

# print(type(train_iter.batches))
# print(type(test_iter.batches))

with torch.no_grad():
    for batch in enumerate(test_iter.batches):
        data = batch.text.to(device)
        labels = batch.labels.to(device)

        outputs = net(data)
        _, predicted = torch.max(outputs.data, 1)

        all_preds.append(predicted.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

all_labels = np.concatenate(all_labels)
all_preds = np.concatenate(all_preds)

ValueError: need at least one array to concatenate

#### Without Word Embeddings

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(all_labels,all_preds)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


nan