In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re
import os 

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [None]:
# Run on the GPU when CUDA reports one as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
def remove_punctuation(data: pd.DataFrame) -> pd.DataFrame:
    """Strip punctuation from the ``utt`` column.

    Every character that is neither a regex word character nor whitespace is
    deleted. Underscores count as word characters and are therefore kept.

    Args:
        data: Frame with a string column named ``utt``.

    Returns:
        A new frame with the cleaned ``utt`` column; ``data`` itself is left
        untouched so re-running a cell never sees half-processed input.
    """
    # `assign` builds a new frame instead of writing into the caller's object.
    return data.assign(utt=data["utt"].str.replace(r"[^\w\s]", "", regex=True))

def lowercase(data: pd.DataFrame) -> pd.DataFrame:
    """Lowercase the ``utt`` column.

    Args:
        data: Frame with a string column named ``utt``.

    Returns:
        A new frame whose ``utt`` values are lowercased; ``data`` itself is
        left untouched.
    """
    # `assign` builds a new frame instead of writing into the caller's object.
    return data.assign(utt=data["utt"].str.lower())

def drop_cols(data: pd.DataFrame) -> pd.DataFrame:
    """Return ``data`` without the worker_id, slot_method and judgments columns.

    A new frame is returned and the input is not modified. pandas raises
    ``KeyError`` if any of the three columns is absent.
    """
    unwanted = ["worker_id", "slot_method", "judgments"]
    return data.drop(columns=unwanted)

def encode_labels(data: pd.DataFrame) -> "tuple[pd.DataFrame, LabelEncoder]":
    """Replace the ``intent`` column with integer class ids.

    Args:
        data: Frame with an ``intent`` column holding the class labels.

    Returns:
        A ``(frame, encoder)`` pair: a new frame whose ``intent`` column holds
        the encoded ids, and the fitted ``LabelEncoder`` needed to map ids back
        to the original labels (see ``decode_labels``). ``data`` itself is left
        untouched.
    """
    le = LabelEncoder()
    # fit_transform learns the label set and encodes the column in one step.
    encoded = data.assign(intent=le.fit_transform(data["intent"]))
    return encoded, le

def decode_labels(data: np.ndarray, le: LabelEncoder) -> np.ndarray:
    """Map encoded class ids back to their original labels.

    ``le`` is expected to be the fitted encoder that produced the ids in
    ``data``; the result is whatever ``le.inverse_transform`` returns for them.
    """
    return le.inverse_transform(data)

In [None]:
from toolz.functoolz import pipe

# Reduce each locale code to the part before its first "-".
df["locale"] = df["locale"].map(lambda code: code.partition("-")[0])

# Cleaning steps, applied to the frame in this order.
params = [remove_punctuation, drop_cols, lowercase]

df = pipe(df, *params)

# Encode the intent column and keep the fitted encoder for decoding later.
df, encoder = encode_labels(df)

print("Finished preprocessing dataset.\n\n")

In [None]:
# Keep only the rows whose partition column is marked "test".
test_df = df[df["partition"] == "test"]

In [None]:
# Each array is wrapped in a one-element list; later cells index element 0.
utterances = [test_df["utt"].values]
labels = [test_df["intent"].values]

# Number of distinct intent values across the whole frame, not just the test rows.
num_labels = df["intent"].nunique(dropna=False)

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

# Identifiers of the base models; `idx` selects one of them further down.
models = [
    "xlm-roberta-base",
    "microsoft/mdeberta-v3-base",
]

In [None]:
# NOTE(review): this hard-coded value overwrites the num_labels computed from
# df['intent'] in an earlier cell. Presumably 60 is the size of the fine-tuned
# classification head — confirm, and consider asserting the two values agree
# instead of silently overriding.
num_labels = 60

In [None]:
# Maps each base-model identifier listed in `models` to the local path of its
# fine-tuned checkpoint (presumably written by a separate fine-tuning run —
# verify the paths exist before running the cells below).
models_finetuned = {
    "xlm-roberta-base": "./models/xlm-roberta-base-finetuned/xlm-roberta-base-finetuned",
    "microsoft/mdeberta-v3-base": "./models/mdeberta-v3-finetuned/mdeberta-MASSIVE-finetuned"
}

In [None]:
# Position in `models` of the model used by the cells below
# (0 -> "xlm-roberta-base", 1 -> "microsoft/mdeberta-v3-base").
idx = 0

In [None]:
# Load the fine-tuned classifier itself. Binding `model` to the checkpoint path
# alone would leave the next cell wrapping a plain string in DataParallel.
# NOTE(review): assumes the checkpoint path is a directory written by
# `save_pretrained` (config + weights) — confirm; a raw `torch.save` file would
# need `torch.load` / `load_state_dict` instead.
model = AutoModelForSequenceClassification.from_pretrained(models_finetuned[models[idx]])

# The tokenizer is taken from the base model identifier, not the checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(models[idx])

In [None]:
# Wrap the model in DataParallel, then move the wrapper to the selected device.
model = nn.DataParallel(model).to(device)

In [None]:
input_ids = []
attention_masks = []

for utt_ in utterances:
    # Calling the tokenizer on the whole list encodes every utterance of the
    # split in one batched pass, replacing one `encode_plus` call per sentence
    # followed by `torch.cat`. For each utterance the tokenizer will:
    #   (1) Tokenize the text.
    #   (2) Add the model's own special tokens (not necessarily [CLS]/[SEP]).
    #   (3) Map tokens to their IDs.
    #   (4) Pad or truncate to `max_length`.
    #   (5) Build the attention mask that separates real tokens from padding.
    encoded = tokenizer(
        list(utt_),                    # Plain list of sentences to encode.
        add_special_tokens=True,       # Add the model-specific special tokens.
        max_length=128,                # Pad & truncate all sentences.
        truncation=True,
        padding="max_length",
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors="pt",           # Return pytorch tensors.
    )

    # One (num_utterances, max_length) tensor per split.
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

# Convert each label array into a tensor.
labels = [torch.tensor(lab) for lab in labels]

# Print sentence 0, now as a list of IDs.
print('Original: ', utterances[0][0])
print('Token IDs:', input_ids[0][0])

In [None]:
# Bundle element 0 of the token-ID, attention-mask and label lists into one dataset.
test_dataset = TensorDataset(
    *(per_split[0] for per_split in (input_ids, attention_masks, labels))
)

In [None]:
# Number of test samples handed to the model per batch.
batch_size = 32

# Create the DataLoader for the test set. Nothing is trained here, so the
# samples are read in their stored order: a SequentialSampler keeps batch i
# aligned with rows [i * batch_size, (i + 1) * batch_size) of `test_dataset`
# and makes every run iterate identically. The previous RandomSampler was
# unseeded and shuffled the order differently on each run.
test_dataloader = DataLoader(
            test_dataset,  # The test samples.
            sampler = SequentialSampler(test_dataset), # Keep dataset order
            batch_size = batch_size # Evaluate with this batch size.
        )