## Pytorch sur des séquences de texte

### File processing

#### File splitting

In [1]:
import math

filename = 'train_label_final.txt'
lines = open(filename,"r", encoding='UTF-8').readlines()
nb_lines = len(lines)
nb_lines_half = math.ceil(nb_lines*0.7)
# Avec le code actuel, comme on fait nb_lines/2 = nb_lines*0.5, la répartition entre les 2 fichiers est de 50%/50%
# Ainsi, si on veut une répartition de 70%/30% pour jeu d'entrainement/validation, il suffit de faire nb_lines*0.7
print(nb_lines_half)

fout = open("output0.txt","wt", encoding='UTF-8')
for i,line in enumerate(lines):
    
    if((i+1)%nb_lines_half) == 0:
        line = str(line).replace('\n', '')
        
    if (i%nb_lines_half) == 0:
        if fout: fout.close()
        fout = open('output%d.txt' % (i/nb_lines_half), 'w', encoding='UTF-8')
        
    fout.write(line)
    
fout.close()

5883


#### File parsing

In [2]:
import re

In [3]:
path = "./output0.txt"
path2 = "./output1.txt"

In [4]:
def convert_txt_to_csv(path, out):
    f = open(path, "r", encoding="UTF-8")
    lines = f.readlines()
    f.close()
    decisions = ["pos", "neg", "neu", "irr"]

    # Future CSV content !
    content = "text,opinion"

    for l in lines:
        # Removing empty lines
        sl = l.split()
        if len(sl) == 0:
            continue

        # Removing no consensus line
        m = re.match(r"[(](.*),(.*),(consensus)[)]", sl[0])
        if m is None:
            continue

        # Parsing opinion
        try:
            opinion = m.string.split(",")[1]
            if opinion in decisions:
                # Opinion string is converted to an integer value
                index = decisions.index(opinion)

                # Removing opinion
                text = re.sub(r"[(](.*),(.*),(consensus)[)]", "", l)

                # Removing comma since comma is CSV separator
                text = text.replace(",", " ")

                # Remvoing extra \n
                text = text.replace("\n", " ")

                # Removing extra space
                text = ' '.join(text.split())

                content = content + "\n" + text + "," + str(index) 
            else:
                # Found opinion is... Wrong !
                continue
        except:
            # Opinion is not found on the line
            continue

    # Writing CSV content to out file
    csv = open(out, "w", encoding="UTF-8")
    csv.write(content)
    csv.close()

In [5]:
convert_txt_to_csv(path, "dataset.csv")
convert_txt_to_csv(path2, "dataset2.csv")

#### Import des librairies

In [6]:
import torchtext # pip3 install torchtext
import torch
import torch.nn as nn
import torch.optim as optim
#from torchtext.data import Field, TabularDataset, BucketIterator
from torchtext.legacy.data import Field, TabularDataset, BucketIterator

In [7]:
# Commandes utiles en cas de lib manquantes :

# !pip3 install -U spacy                       # spaCy lib
# !pip3 install torch torchvision torchaudio   # PyTorch (see <pytorch.org/get-started/locally> for GPU support)
# !python3 -m spacy download en_core_web_trf    # download spaCy trained pipeline

# et pour certains :
# pip install transformers -U

# et pour tester :
# !python3 -m spacy download xx_ent_wiki_sm    # download spaCy trained pipeline

# A partir d'ici, changer les noms des titres... plus ou moins

#### Tokenization

In [8]:
import spacy
# TODO: find a way to merge EN & FR language pipelines
spacy_en = spacy.load('en_core_web_trf')
#spacy_en = spacy.load('xx_ent_wiki_sm') # multi language to test --> plus nul. Voir comment utiliser 2 en meme temps plutot

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

#### Définition des prétraitements sur le texte

In [9]:
TEXT = Field(sequential = True, lower = True, include_lengths = False,
            pad_token = "<pad>", unk_token = "<unk>",
            batch_first = True, tokenize = tokenizer)

LABELS = Field(sequential=False, use_vocab=False)

#### Création des datasets

In [10]:
train_dataset, test_dataset = TabularDataset.splits(
    path="./", format="csv", 
    train='dataset.csv', test='dataset2.csv',
    skip_header = True,
    fields=[('text', TEXT), ('labels', LABELS)])

train_dataset[0].text

['will',
 'be',
 'at',
 'the',
 'london',
 '#',
 'microsoft',
 'partner',
 'business',
 'briefing',
 'tomorrow',
 '-',
 'see',
 'some',
 'of',
 'you',
 'there',
 ':)']

#### Gestion des batchs

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset), batch_size=160,
    sort_key = lambda x: len(x.text), device=device,
    sort_within_batch = True, shuffle = True, repeat=False)

#### Gestion du vocabulaire et des word Embeddings

In [12]:
TEXT.build_vocab(train_dataset, min_freq=2, vectors = 'glove.6B.50d')
#TEXT.build_vocab(train_dataset, min_freq=2, vectors = 'glove.6B.100d')
batch = next(iter(train_iter))

In [13]:
batch.text

tensor([[  12,   59,  541,  ...,    0,   64, 1175],
        [  15,  179,    7,  ...,    6,    2,  100],
        [   5,    0,   36,  ...,  127,    2,    5],
        ...,
        [   2,    0,   41,  ...,    6,    1,    1],
        [ 839,  979,  443,  ...,   32,    1,    1],
        [   2,    5,   11,  ...,    0,    1,    1]])

#### Création du modèle

In [15]:
class LSTMModele(nn.Module):
    def __init__(self, embedding_dim=50):
    #def __init__(self, embedding_dim=100):
        super(LSTMModele, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze = False)# une couche qui ne marche qu'avec les imports qui ne marchent pas...
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim, 4) # 4 car {pos,neg,neu,irr} 
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs) # pour faire le lien entre indice et vecteur du mot associé
        outputs, (h_n,c_n) = self.lstm(embeds)
        x = h_n[0]
        x = self.fc(x)
        return x

In [16]:
net = LSTMModele(embedding_dim = 50).to(device)
#net = LSTMModele(embedding_dim = 100).to(device)
# embedding_dim = 50 car la taille de chaque vecteur de représentation des mots par l'embedding utilisé à partir de
# la base du fichier "glove.6B.50d", est de taille 50.

In [17]:
net

LSTMModele(
  (embeddings): Embedding(1981, 100)
  (lstm): LSTM(100, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)

In [18]:
criterion = nn.CrossEntropyLoss()
import torch.optim as optim
optimizer = optim.Adam(net.parameters(), lr=0.01)

#### Boucle d'apprentissage

In [19]:
%%time

nb_epoch = 200
for epoch in range(nb_epoch):
    for i in range(0, train_iter.batch_size):
        try:
            batch = next(iter(train_iter))
            data = batch.text.to(device)
            labels = batch.labels.to(device)
            
            outputs = net(data)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        except:
            print("done")
    print ('epoch : ' + str(epoch))

print('End of training')

epoch : 0
epoch : 1
epoch : 2
epoch : 3
epoch : 4
epoch : 5
epoch : 6
epoch : 7
epoch : 8
epoch : 9
epoch : 10
epoch : 11
epoch : 12
epoch : 13
epoch : 14
epoch : 15
epoch : 16
epoch : 17
epoch : 18
epoch : 19
epoch : 20
epoch : 21
epoch : 22
epoch : 23
epoch : 24
epoch : 25
epoch : 26
epoch : 27
epoch : 28
epoch : 29
epoch : 30
epoch : 31
epoch : 32
epoch : 33
epoch : 34
epoch : 35
epoch : 36
epoch : 37
epoch : 38
epoch : 39
epoch : 40
epoch : 41
epoch : 42
epoch : 43
epoch : 44
epoch : 45
epoch : 46
epoch : 47
epoch : 48
epoch : 49
epoch : 50
epoch : 51
epoch : 52
epoch : 53
epoch : 54
epoch : 55
epoch : 56
epoch : 57
epoch : 58
epoch : 59
epoch : 60
epoch : 61
epoch : 62
epoch : 63
epoch : 64
epoch : 65
epoch : 66
epoch : 67
epoch : 68
epoch : 69
epoch : 70
epoch : 71
epoch : 72
epoch : 73
epoch : 74
epoch : 75
epoch : 76
epoch : 77
epoch : 78
epoch : 79
epoch : 80
epoch : 81
epoch : 82
epoch : 83
epoch : 84
epoch : 85
epoch : 86
epoch : 87
epoch : 88
epoch : 89
epoch : 90
epoch : 9

#### Mesure des performances

In [23]:
import numpy as np
all_labels = []
all_preds = []

# print(type(train_iter.batches))
# print(type(test_iter.batches))

with torch.no_grad():
    for i in range(0, test_iter.batch_size):
        try:
            batch = next(iter(test_iter))
            data = batch.text.to(device)
            labels = batch.labels.to(device)

            outputs = net(data)
            _, predicted = torch.max(outputs.data, 1)

            all_preds.append(predicted.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
        except:
            print("error")

all_labels = np.concatenate(all_labels)
all_preds = np.concatenate(all_preds)

#### Scores

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(all_labels,all_preds)

0.6375

0.675 --> glove50d
0.64 --> glove100d mdr