--- 

# <center> Project: NLP ENSAE 
## <center> Intents Classification for Neural Text Generation

<center>Work done by : 

##### <center> Ali HAIDAR email : ali.haidar@polytechnique.edu
##### <center> François Bertholom   email : 

---

In [45]:
import numpy as np 
import pandas as pd
from datasets import load_dataset
from keras_preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert import BertModel
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils import clip_grad_norm_

## Load data 

In [4]:
# Fetch the 'dyda_da' configuration of the 'silicone' dataset from the Hugging Face hub
# (or from the local cache when it has already been downloaded).
dataset = load_dataset(path='silicone', name='dyda_da')

Found cached dataset silicone (C:/Users/aliha/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Materialise the three dataset splits as pandas DataFrames, in the order
# train / validation / test.
train, val, test = (
    pd.DataFrame(data=dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

### Building a BERT-based classifier that takes an utterance as input and predicts its label

In [36]:
# For every split: the model input is the 'Utterance' column (kept as a pandas
# Series) and the target is the 'Label' column converted to a NumPy array.
X_train, y_train = train['Utterance'], np.array(train['Label'])
X_val, y_val = val['Utterance'], np.array(val['Label'])
X_test, y_test = test['Utterance'], np.array(test['Label'])

In [37]:
# Number of distinct label values present in the training targets.
n_classes = np.unique(y_train).size
n_classes

4

In [7]:
# Tokenizer for the 'bert-base-uncased' checkpoint; do_lower_case=True asks it to
# lower-case the text before splitting it into tokens.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [8]:
def tokenize(text, tokenizer, max_len=512):
    """Encode utterances as fixed-length rows of BERT token ids.

    Every utterance is tokenized, truncated so that it still fits in
    ``max_len`` once the two special tokens are added, wrapped as
    ``[CLS] ... [SEP]``, mapped to vocabulary ids and finally padded at the
    end ("post" padding) up to ``max_len``.

    Parameters
    ----------
    text : iterable of str
        Utterances to encode.
    tokenizer : object
        Tokenizer exposing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
    max_len : int, default 512
        Length of every returned row, special tokens included. The default
        keeps the previous behaviour (512, the maximum BERT input size);
        a smaller value avoids padding short utterances all the way to 512.

    Returns
    -------
    numpy.ndarray
        Integer array produced by ``pad_sequences`` with one row per utterance.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2, i.e. too short to hold ``[CLS]``
        and ``[SEP]``.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    # Two positions are reserved for the [CLS] and [SEP] markers.
    n_content_tokens = max_len - 2
    tokens = [
        ["[CLS]"] + tokenizer.tokenize(utterance)[:n_content_tokens] + ["[SEP]"]
        for utterance in text
    ]
    tokens_ids = [tokenizer.convert_tokens_to_ids(sequence) for sequence in tokens]
    # Pad (and, defensively, truncate) at the end so every row has exactly max_len ids.
    return pad_sequences(tokens_ids, maxlen=max_len, truncating="post", padding="post", dtype="int")
    

In [9]:
# Encode the three splits with the same tokenizer, in train / validation / test order.
train_tokens_ids, val_tokens_ids, test_tokens_ids = (
    tokenize(utterances, tokenizer)
    for utterances in (X_train, X_val, X_test)
)

In [28]:
# Attention masks: 1.0 wherever the token id is strictly positive, 0.0 elsewhere
# (i.e. on the positions filled by padding).
# One vectorised comparison per split replaces the nested Python comprehension,
# which made one float() call per token slot. float32 is the dtype torch.tensor()
# gave the former nested lists of Python floats, so the tensors built from these
# masks are unchanged.
train_masks = (np.asarray(train_tokens_ids) > 0).astype(np.float32)
val_masks = (np.asarray(val_tokens_ids) > 0).astype(np.float32)
test_masks = (np.asarray(test_tokens_ids) > 0).astype(np.float32)

In [13]:
# Stand-alone pretrained encoder (downloads the weights on first use).
# NOTE(review): no cell visible here reads `bert`; BertMultiClassifier loads its
# own copy of the same checkpoint — confirm whether this load is still needed.
bert = BertModel.from_pretrained(
    'bert-base-uncased'
)

100%|██████████| 407873900/407873900 [08:07<00:00, 837229.00B/s] 


In [53]:
# Encoder/decoder classifier: BERT encodes the utterance, a linear layer decodes the class.
class BertMultiClassifier(nn.Module):
    """Multi-class utterance classifier built on top of a pretrained BERT encoder.

    The pooled BERT output goes through dropout and a linear layer with one
    unit per class. By default ``forward`` returns class probabilities
    (softmax over the class dimension); with ``return_logits=True`` it returns
    the raw scores instead, which is what ``nn.CrossEntropyLoss`` expects
    because that loss applies log-softmax itself.

    Parameters
    ----------
    n_classes : int
        Number of output classes.
    dropout : float, default 0.1
        Dropout probability applied to the pooled BERT output.
    """

    def __init__(self, n_classes, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.n_classes = n_classes
        self.dropout = nn.Dropout(dropout)
        # 768 is the width of the pooled output consumed by the linear head.
        self.linear = nn.Linear(768, n_classes)
        # Explicit dim: normalise across classes for each row of the batch.
        # Leaving dim unset relies on a deprecated implicit choice.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, tokens, masks=None, return_logits=False):
        """Score a batch of token-id sequences.

        Parameters
        ----------
        tokens : torch.Tensor
            Batch of token ids passed to the BERT encoder.
        masks : torch.Tensor, optional
            Attention mask forwarded to the encoder as ``attention_mask``.
        return_logits : bool, default False
            When True, return the unnormalised class scores instead of
            probabilities.

        Returns
        -------
        torch.Tensor
            ``(batch, n_classes)`` probabilities (each row sums to 1), or raw
            scores when ``return_logits`` is True.
        """
        # Only the pooled output of the last layer is needed for classification.
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        logits = self.linear(self.dropout(pooled_output))
        if return_logits:
            return logits
        return self.softmax(logits)

In [27]:
# Run on the GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [58]:
# One output unit per label found in the training set (n_classes is computed
# from y_train earlier in the notebook) instead of a hard-coded 4.
model_clf = BertMultiClassifier(n_classes=n_classes)
model_clf = model_clf.to(device)

BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 3e-6

# Token ids index an embedding table: force 64-bit integers so the dtype does not
# depend on the platform's default NumPy integer width.
train_tokens_tensor = torch.tensor(train_tokens_ids, dtype=torch.long)
# Class indices must be integer (long) tensors for the classification losses;
# the (N, 1) column shape is kept so downstream cells can still squeeze it.
train_y_tensor = torch.tensor(y_train.reshape(-1, 1), dtype=torch.long)

test_tokens_tensor = torch.tensor(test_tokens_ids, dtype=torch.long)
test_y_tensor = torch.tensor(y_test.reshape(-1, 1), dtype=torch.long)

# Explicit float32 so the masks have the same dtype whether they were built as
# nested Python lists or as NumPy arrays.
train_masks_tensor = torch.tensor(train_masks, dtype=torch.float)
test_masks_tensor = torch.tensor(test_masks, dtype=torch.float)


# Training batches are drawn in random order.
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# Test batches keep the dataset order so predictions line up with y_test.
test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

# NOTE(review): this parameter group (linear head only) is NOT passed to the
# optimizer below; it is kept only so the names stay defined. Pass
# `optimizer_grouped_parameters` to Adam if the intent is to train the head alone.
param_optimizer = list(model_clf.linear.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# The optimizer updates every parameter of the model (encoder and head).
optimizer = Adam(model_clf.parameters(), lr=LEARNING_RATE)


In [61]:
# Fine-tune the classifier for EPOCHS passes over the training set.
#
# The model returns class probabilities (softmax output). nn.CrossEntropyLoss
# would apply a second softmax on top of them, so the cross-entropy is computed
# as the negative log-likelihood of log(probabilities) instead.
loss_func = nn.NLLLoss()
n_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    model_clf.train()
    train_loss = 0.0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        # (batch, 1) -> (batch,) integer class indices. reshape(-1) keeps the batch
        # dimension even for a final batch of size 1, unlike squeeze().
        targets = labels.reshape(-1).long()

        proba = model_clf(token_ids, masks)
        # Clamp before the log so a probability of exactly 0 cannot produce -inf.
        batch_loss = loss_func(torch.log(proba.clamp(min=1e-12)), targets)
        train_loss += batch_loss.item()

        model_clf.zero_grad()
        batch_loss.backward()
        # Cap the global gradient norm to keep the updates stable.
        clip_grad_norm_(parameters=model_clf.parameters(), max_norm=1.0)
        optimizer.step()

        # Single self-overwriting progress line. `clear_output` is not imported by
        # the notebook's import cell, so the previous call raised a NameError.
        print(
            "\rEpoch: {0}/{1} - batch {2}/{3} - loss: {4:.4f} ".format(
                epoch_num + 1, EPOCHS, step_num + 1, n_batches, train_loss / (step_num + 1)
            ),
            end="",
            flush=True,
        )
    # Finish the progress line so the next epoch starts on a fresh one.
    print()

In [None]:
# Evaluate the fine-tuned classifier on the test set.
model_clf.eval()
bert_predicted = []  # predicted class index for every test utterance
all_logits = []      # model output row (one score per class) for every test utterance
test_loss = 0.0
n_correct = 0
# The model outputs probabilities, so the cross-entropy is the NLL of their log.
# Summed here and averaged over the number of examples after the loop.
loss_func = nn.NLLLoss(reduction="sum")
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        # (batch, 1) -> (batch,) integer class indices, safe for a batch of size 1.
        targets = labels.reshape(-1).long()

        logits = model_clf(token_ids, masks)
        test_loss += loss_func(torch.log(logits.clamp(min=1e-12)), targets).item()

        # Multi-class decision: pick the highest-scoring class for each row
        # (a 0.5 threshold on column 0 only makes sense for a binary problem).
        predicted = logits.argmax(dim=1)
        n_correct += (predicted == targets).sum().item()

        bert_predicted += predicted.cpu().tolist()
        all_logits += list(logits.cpu().detach().numpy())

n_test = len(bert_predicted)
# max(..., 1) guards against an empty test loader.
test_loss = test_loss / max(n_test, 1)
test_accuracy = n_correct / max(n_test, 1)
print("test loss: {0:.4f} - test accuracy: {1:.4f}".format(test_loss, test_accuracy))