## Pytorch sur des séquences de texte

#### Import des librairies

In [1]:
import torchtext # pip install torchtext
import torch
import torch.nn as nn
import torch.optim as optim
#from torchtext.data import Field, TabularDataset, BucketIterator
from torchtext.legacy.data import Field, TabularDataset, BucketIterator

In [2]:
import spacy
spacy_en = spacy.load('en_core_web_trf')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

  return torch._C._cuda_getDeviceCount() > 0


#### Définition des prétraitements sur le texte

In [3]:
TEXT = Field(sequential = True, lower = True, include_lengths = False,
            pad_token = "<pad>", unk_token = "<unk>",
            batch_first = True, tokenize = tokenizer)

LABELS = Field(sequential=False, use_vocab=False)

#### Tokenization

#### Création des datasets

In [4]:
train_dataset, test_dataset = TabularDataset.splits(
    path="./", format="csv", 
    train='toto2.csv', test='toto2.csv',
    skip_header = True,
    fields=[('text', TEXT), ('labels', LABELS)])

train_dataset[0].text

['will',
 'be',
 'at',
 'the',
 'london',
 '#',
 'microsoft',
 'partner',
 'business',
 'briefing',
 'tomorrow',
 '-',
 'see',
 'some',
 'of',
 'you',
 'there',
 ':)']

#### Gestion des batchs

In [5]:
device = torch.device('cpu')
train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset), batch_size=160,
    sort_key = lambda x: len(x.text), device=device,
    sort_within_batch = True, shuffle = True, repeat=False)


#### Gestion du vocabulaire et des word Embeddings

In [6]:
TEXT.build_vocab(train_dataset, min_freq=2, vectors = 'glove.6B.50d')
batch = next(iter(train_iter))

In [7]:
batch.text

tensor([[  12,   65, 7262,  ...,  116,  175,   10],
        [  17, 3079,    2,  ...,    2,    8,    3],
        [7692,  290, 3699,  ...,  325,    2,  299],
        ...,
        [2947,   12,  108,  ...,  852, 5564,    1],
        [2903, 1441,   10,  ..., 4551,  510,    1],
        [4448,  174,  401,  ...,   38,    4,    1]])

#### Création du modèle

In [8]:
class LSTMModele(nn.Module):
    def __init__(self, embedding_dim=50):
        super(LSTMModele, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze = False)# une couche qui ne marche qu'avec les imports qui ne marchent pas...
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim, 4) # 2 car pos neg
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs) # pour faire le lien entre indice et vecteur du mot associé
        outputs, (h_n,c_n) = self.lstm(embeds)
        x = h_n[0]
        x = self.fc(x)
        return x

In [9]:
net = LSTMModele(embedding_dim = 50).to(device)

In [10]:
net

LSTMModele(
  (embeddings): Embedding(8916, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)

In [11]:
criterion = nn.CrossEntropyLoss()
import torch.optim as optim
optimizer = optim.Adam(net.parameters(), lr=0.01)

Puis code de BOUCLE d'APPRENTISSAGE + MESURE DES PERFORMANCES + accuracy_score IDENTIQUES

In [12]:
%%time

nb_epoch = 5
for epoch in range(nb_epoch):
    for i in range(0, train_iter.batch_size):
        try:
            batch = next(iter(train_iter))
            data = batch.text.to(device)
            labels = batch.labels.to(device)
            
            outputs = net(data)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        except:
            print("done")
    print ('epoch : ' + str(epoch))

print('Finished Training')

done
done
done
epoch : 0
done
done
done
done
epoch : 1
done
done
done
epoch : 2
done
epoch : 3
epoch : 4
Finished Training
Wall time: 1min 35s


#### Mesure des performances

In [17]:
import numpy as np
all_labels = []
all_preds = []

# print(type(train_iter.batches))
# print(type(test_iter.batches))

with torch.no_grad():
    for i in range(0, train_iter.batch_size):
        try:
            batch = next(iter(train_iter))
            data = batch.text.to(device)
            labels = batch.labels.to(device)

            outputs = net(data)
            _, predicted = torch.max(outputs.data, 1)

            all_preds.append(predicted.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
        except:
            print("error")

all_labels = np.concatenate(all_labels)
all_preds = np.concatenate(all_preds)

error
error
error


#### Without Word Embeddings

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(all_labels,all_preds)

0.8665096034820109