--- 

# <center> Project: NLP ENSAE 
## <center> Intents Classification for Neural Text Generation

<center>Work done by : 

##### <center> Ali HAIDAR email : ali.haidar@polytechnique.edu
##### <center> François Bertholom   email : 

---

In [24]:
import numpy as np 
import pandas as pd
from datasets import load_dataset
from keras_preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert import BertModel
import torch
from torch import nn
from torch.optim import Adam

## Load data 

In [4]:
# Fetch the 'dyda_da' configuration of the 'silicone' benchmark (all splits).
dataset = load_dataset(path='silicone', name='dyda_da')

Found cached dataset silicone (C:/Users/aliha/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Materialise every split of the dataset as its own pandas DataFrame.
train, val, test = (
    pd.DataFrame(data=dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

### Building a BERT-based classifier that takes an utterance as input and predicts its label

In [6]:
# For each split, separate the model input (utterance text) from the target (label).
X_train, y_train = train['Utterance'], train['Label']
X_val, y_val = val['Utterance'], val['Label']
X_test, y_test = test['Utterance'], test['Label']

In [7]:
# Tokenizer matching the 'bert-base-uncased' checkpoint used by the model below;
# do_lower_case=True is requested explicitly to go with the uncased vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
def tokenize(text, tokenizer, max_len=512):
    """Turn raw utterances into a fixed-width array of BERT token ids.

    Every utterance is tokenized, truncated so that it still fits once the
    "[CLS]" and "[SEP]" markers are added, converted to vocabulary ids and
    finally right-padded up to ``max_len``.

    Parameters
    ----------
    text : iterable of str
        The utterances to encode.
    tokenizer : object
        Must expose ``tokenize(str) -> list[str]`` and
        ``convert_tokens_to_ids(list[str]) -> list[int]``.
    max_len : int, default 512
        Width of the returned array, special tokens included. The default is
        the maximum input size of BERT; a smaller value saves memory when the
        utterances are short.

    Returns
    -------
    Array of token ids with one row per utterance and ``max_len`` columns,
    as produced by ``pad_sequences``.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2 (no room for "[CLS]" and "[SEP]").
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    # Two positions are reserved for the special tokens.
    body_len = max_len - 2
    tokens = [
        ["[CLS]"] + tokenizer.tokenize(t)[:body_len] + ["[SEP]"]
        for t in text
    ]
    tokens_ids = [tokenizer.convert_tokens_to_ids(seq) for seq in tokens]
    # Pad (and, defensively, truncate) on the right so every row has max_len ids.
    return pad_sequences(
        tokens_ids, maxlen=max_len, truncating="post", padding="post", dtype="int"
    )
    

In [9]:
# Encode the three splits with the same tokenizer, in train / val / test order.
train_tokens_ids, val_tokens_ids, test_tokens_ids = (
    tokenize(utterances, tokenizer)
    for utterances in (X_train, X_val, X_test)
)

In [13]:
# Stand-alone copy of the pretrained 'bert-base-uncased' encoder.
# NOTE(review): BertMultiClassifier below loads its own encoder and does not
# reference this variable — confirm it is needed before keeping the extra copy.
bert = BertModel.from_pretrained('bert-base-uncased')

100%|██████████| 407873900/407873900 [08:07<00:00, 837229.00B/s] 


In [None]:
class BertMultiClassifier(nn.Module):
    """Multi-class utterance classifier: BERT encoder -> dropout -> linear -> softmax.

    Parameters
    ----------
    n_classes : int
        Number of target labels (size of the output distribution).
    dropout : float, default 0.1
        Dropout probability applied to BERT's pooled output before the
        linear layer.
    """

    def __init__(self, n_classes, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.n_classes = n_classes
        # Previously missing: forward() relies on this layer.
        self.dropout = nn.Dropout(dropout)
        # 768 is the width of the pooled output fed to the classification head.
        self.linear = nn.Linear(768, n_classes)
        # Normalise over the class axis (last dim of the linear output).
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, tokens, attention_mask=None):
        """Return class probabilities for a batch of token-id sequences.

        Parameters
        ----------
        tokens : tensor of token ids, one row per utterance.
        attention_mask : optional tensor, forwarded unchanged to the encoder.
            When None the encoder's own default is used; pass a mask (e.g.
            ``tokens > 0``) to keep padded positions out of the attention.

        Returns
        -------
        Tensor with one row per utterance and ``n_classes`` columns; each row
        is a softmax distribution.

        NOTE: the output is already softmax-normalised. Losses that expect raw
        logits (such as ``nn.CrossEntropyLoss``) should not be fed this output
        directly.
        """
        # Only the pooled representation is needed, so skip the per-layer outputs.
        _, pooled_output = self.bert(
            tokens,
            attention_mask=attention_mask,
            output_all_encoded_layers=False,
        )
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.softmax(linear_output)
        return proba