--- 

# <center> Project: NLP ENSAE 
## <center> Intents Classification for Neural Text Generation

<center>Work done by : 

##### <center> Ali HAIDAR email : ali.haidar@polytechnique.edu
##### <center> François Bertholom   email : 

---

In [36]:
import numpy as np 
import pandas as pd
from datasets import load_dataset
from keras_preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert import BertModel
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output


from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate,Flatten
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.preprocessing import text, sequence
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import PolynomialDecay


import process
import models

## Load data 

In [2]:
# Load the 'dyda_da' configuration of the 'silicone' dataset; the returned
# object is indexed by split name ('train', 'validation', 'test') below.
dataset = load_dataset('silicone','dyda_da')

Found cached dataset silicone (C:/Users/aliha/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Materialise every split of the loaded dataset as its own pandas DataFrame.
train, val, test = (
    pd.DataFrame(data=dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

### Building a BERT-based classifier that takes a context as input and predicts the label of each utterance

In [39]:
# NOTE(review): the context-based experiment below is disabled by wrapping it
# in a string literal, so none of it is executed (the cell only evaluates to
# the string itself). Its `process.tokenize(...)` calls pass a single argument,
# unlike the two-argument `process.tokenize(X, tokenizer)` calls used in
# `generate_result` — confirm the helper signatures before re-enabling.
'''
sizeOfTheContext = 5

train_context, test_context = process.context(train, test, sizeOfTheContext)

X_train = train_context['Utterance']
y_train = (train_context['Label'])

X_test = test_context['Utterance']
y_test = test_context['Label']

y_train = np.array([np.array(i) for i in y_train])
y_test =  np.array([np.array(i) for i in y_test])

train_tokens_ids = process.tokenize(X_train)
test_tokens_ids = process.tokenize(X_test)

train_masks = process.mask(train_tokens_ids)
test_mask = process.mask(test_tokens_ids)
train_masks = np.array(train_masks)
test_mask = np.array(test_mask)

y_train_masks  = process.mask(y_train)
y_test_masks  = process.mask(y_test)
y_train_masks = np.array(y_train_masks)
y_test_masks = np.array(y_test_masks)
'''

### Building a BERT-based classifier that takes a single utterance as input and predicts its label

In [2]:
def get_bert_embed_matrix():
    """Export an embedding weight matrix from pretrained 'bert-base-uncased'.

    The lookup is positional: it takes the first child module of the loaded
    model, then that module's first child, and returns the latter's ``weight``
    tensor converted to a NumPy array.
    NOTE(review): presumably this is the word/token embedding table — this
    relies on the child ordering of the library's BertModel; confirm.
    """
    pretrained = BertModel.from_pretrained('bert-base-uncased')
    # First child of the model, then first child of that sub-module.
    first_child = next(iter(pretrained.children()))
    first_grandchild = next(iter(first_child.children()))
    return first_grandchild.weight.data.numpy()

In [None]:
def generate_result(dataset, model, embedding_matrix=None):
    """Train one classifier on a dataset and return its test accuracy.

    Parameters
    ----------
    dataset : mapping
        Must provide the splits 'train', 'validation' and 'test'; each split
        is converted to a DataFrame and must expose the columns 'Utterance'
        and 'Label'.
    model : object
        Factory exposing ``build_model(embedding_matrix, n_classes)``. The
        object it returns is used through ``compile``, ``summary``, ``fit``
        and ``predict``.
    embedding_matrix : optional
        Pre-computed embedding matrix forwarded to ``model.build_model``.
        When omitted (``None``), it is obtained from
        ``get_bert_embed_matrix()``; passing it in avoids reloading the
        pretrained weights on every call.

    Returns
    -------
    float
        Percentage (0-100) of test utterances whose predicted label equals
        the reference label.

    Notes
    -----
    Reads the module-level ``BATCH_SIZE`` and ``EPOCHS`` at call time and
    writes a checkpoint file named '.mdl_wts.hdf5' in the working directory.
    """
    label = 'Label'

    # Rows with missing values are dropped from the training and validation
    # splits only; the test split is evaluated as-is.
    train = pd.DataFrame(data=dataset['train']).dropna()
    val = pd.DataFrame(data=dataset['validation']).dropna()
    test = pd.DataFrame(data=dataset['test'])

    X_train, y_train = train['Utterance'], np.array(train[label])
    X_val, y_val = val['Utterance'], np.array(val[label])
    X_test, y_test = test['Utterance'], np.array(test[label])

    # The number of output classes is taken from the training labels.
    n_classes = len(np.unique(y_train))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    train_tokens_ids = process.tokenize(X_train, tokenizer)
    val_tokens_ids = process.tokenize(X_val, tokenizer)
    test_tokens_ids = process.tokenize(X_test, tokenizer)

    # Only load the pretrained embeddings when the caller did not supply them.
    if embedding_matrix is None:
        embedding_matrix = get_bert_embed_matrix()

    # Use a separate name for the built network so the `model` argument
    # (the factory) is not silently rebound.
    classifier = model.build_model(embedding_matrix, n_classes)

    earlyStopping = EarlyStopping(monitor='val_loss', patience=6, verbose=1, mode='min')
    # NOTE(review): the best checkpoint is saved but never reloaded in this
    # function, so the predictions below use the weights left in memory when
    # training stops — confirm whether that is intended.
    mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')

    # Total number of optimizer steps over the whole run; clamped to 1 so the
    # decay schedule never receives decay_steps == 0 on very small datasets.
    num_train_steps = max(1, (len(train_tokens_ids) // BATCH_SIZE) * EPOCHS)

    lr_scheduler = PolynomialDecay(
        initial_learning_rate=5e-5,
        end_learning_rate=0.0,
        decay_steps=num_train_steps,
    )
    opt = Adam(learning_rate=lr_scheduler, clipnorm=1)
    classifier.compile(loss='sparse_categorical_crossentropy', optimizer=opt)

    # summary() prints by itself; wrapping it in print() only added "None".
    classifier.summary()

    classifier.fit(
        train_tokens_ids,
        y_train,
        validation_data=(val_tokens_ids, y_val),
        validation_batch_size=512,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=1,
        callbacks=[earlyStopping, mcp_save],
    )

    # Predicted class = index of the highest score for each test utterance.
    bert_predicted = np.argmax(classifier.predict(test_tokens_ids, batch_size=128), axis=1)
    acc = (np.sum(bert_predicted == y_test) / len(y_test)) * 100
    return acc


In [None]:
# Training hyper-parameters; generate_result reads these as globals.
BATCH_SIZE = 32
EPOCHS = 30

# 'silicone' configurations to evaluate. The same list drives both the
# result-table columns and the evaluation loop so they cannot drift apart.
DATASET_NAMES = ['dyda_da', 'dyda_e', 'maptask', 'meld_e', 'meld_s',
                 'mrda', 'oasis', 'sem', 'swda', 'iemocap']

# Computed once and handed to every generate_result call.
embedding_matrix = get_bert_embed_matrix()
results = pd.DataFrame(columns=['model'] + DATASET_NAMES)

# Keep this list under a name distinct from the imported `models` module:
# rebinding `models` to a list made this cell fail with AttributeError on any
# re-run, because `models.BertMLP1Layer` no longer resolved to the module.
candidate_models = [
    models.BertMLP1Layer(),
    models.BertMLP2Layers(),
    models.BertLstm(),
    models.BertDoubleLstm(),
]

# One row per model: its class name followed by the accuracy on each dataset.
for model in candidate_models:
    res = [model.__class__.__name__]
    for dataset_name in DATASET_NAMES:
        dataset = load_dataset('silicone', dataset_name)
        acc = generate_result(dataset, model, embedding_matrix)
        print("Accuracy on " + dataset_name + " :", acc)
        res.append(acc)
    results.loc[len(results)] = res