In [1]:
!pip install torcheval torchmetrics
import numpy as np
import pandas as pd
import os 
import torch
import random
import time
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from transformers import get_linear_schedule_with_warmup
from torcheval.metrics.functional import multiclass_accuracy, multiclass_f1_score
from tqdm.notebook import tqdm
import json
import torchmetrics
from torchmetrics.classification import MulticlassAccuracy

Collecting torcheval
  Downloading torcheval-0.0.6-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.4/158.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torchtnt>=0.0.5 (from torcheval)
  Downloading torchtnt-0.1.0-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.9/87.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyre-extensions (from torchtnt>=0.0.5->torcheval)
  Downloading pyre_extensions-0.0.30-py3-none-any.whl (12 kB)
Installing collected packages: pyre-extensions, torchtnt, torcheval
Successfully installed pyre-extensions-0.0.30 torcheval-0.0.6 torchtnt-0.1.0
[0m

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Check if GPU is available to evaluate the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Preprocessing helper functions
def remove_punctuation(data: pd.DataFrame) -> pd.DataFrame:
    """Remove punctuation from text"""
    data["utt"] = data["utt"].str.replace(r"[^\w\s]","", regex=True)
    return data

def lowercase(data: pd.DataFrame) -> pd.DataFrame:
    """Lowercase text"""
    data["utt"] = data["utt"].str.lower()
    return data

def drop_cols(data: pd.DataFrame) -> pd.DataFrame:
    drop = ["worker_id", "slot_method", "judgments"]
    return data.drop(drop, axis=1)

def encode_labels(data: pd.DataFrame) -> pd.DataFrame:
    """Encode labels"""
    le = LabelEncoder()
    le.fit(data['intent'])
    data['intent'] = le.transform(data['intent'])
    return data, le

def decode_labels(data: np.ndarray, le: LabelEncoder) -> np.ndarray:
    """Decode labels"""
    data = le.inverse_transform(data)
    return data

In [4]:
# Assemble the data
df = pd.DataFrame()

for json_file in os.listdir('/kaggle/input/massive-dataset-v1-nlpnlu'):
    if json_file.endswith('.jsonl'):
        df = pd.concat([df, pd.read_json('/kaggle/input/massive-dataset-v1-nlpnlu/' + json_file, lines=True)], ignore_index=True)
        print(f"Added {json_file} to dataframe")

Added ko-KR.jsonl to dataframe
Added zh-TW.jsonl to dataframe
Added ru-RU.jsonl to dataframe
Added th-TH.jsonl to dataframe
Added te-IN.jsonl to dataframe
Added am-ET.jsonl to dataframe
Added jv-ID.jsonl to dataframe
Added cy-GB.jsonl to dataframe
Added hi-IN.jsonl to dataframe
Added fi-FI.jsonl to dataframe
Added mn-MN.jsonl to dataframe
Added ur-PK.jsonl to dataframe
Added km-KH.jsonl to dataframe
Added kn-IN.jsonl to dataframe
Added sl-SL.jsonl to dataframe
Added ro-RO.jsonl to dataframe
Added ml-IN.jsonl to dataframe
Added he-IL.jsonl to dataframe
Added en-US.jsonl to dataframe
Added es-ES.jsonl to dataframe
Added zh-CN.jsonl to dataframe
Added da-DK.jsonl to dataframe
Added nl-NL.jsonl to dataframe
Added ar-SA.jsonl to dataframe
Added sv-SE.jsonl to dataframe
Added tl-PH.jsonl to dataframe
Added is-IS.jsonl to dataframe
Added fr-FR.jsonl to dataframe
Added my-MM.jsonl to dataframe
Added nb-NO.jsonl to dataframe
Added id-ID.jsonl to dataframe
Added az-AZ.jsonl to dataframe
Added af

In [5]:
from toolz.functoolz import pipe
# Preprocess the data using a pipeline
df["locale"] = df["locale"].apply(lambda x: x.split("-")[0])

params = [
    remove_punctuation,
    drop_cols,
    lowercase,
]

df = pipe(
    df,
    *params
)

df, encoder = encode_labels(df)

print(f"Finished preprocessing dataset.\n\n")

Finished preprocessing dataset.




In [6]:
test_df = df.loc[df['partition'] == 'test']

In [7]:
utterances = test_df['utt'].values
langs = test_df['locale'].values
labels = test_df['intent'].values

In [8]:
from more_itertools import locate

lang_to_index_list = {}
for lang in df['locale'].unique():
    lang_to_index_list[lang] = list(locate(langs, lambda x: x == lang))


In [9]:
lang_to_index_list.keys()

dict_keys(['ko', 'zh', 'ru', 'th', 'te', 'am', 'jv', 'cy', 'hi', 'fi', 'mn', 'ur', 'km', 'kn', 'sl', 'ro', 'ml', 'he', 'en', 'es', 'da', 'nl', 'ar', 'sv', 'tl', 'is', 'fr', 'my', 'nb', 'id', 'az', 'af', 'fa', 'ta', 'de', 'sq', 'pt', 'hu', 'pl', 'lv', 'ja', 'vi', 'bn', 'sw', 'it', 'el', 'ms', 'hy', 'ka', 'tr'])

In [10]:
def tokenize(tokenizer):
    input_ids = []
    attention_masks = []

    for utt in utterances:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            utt,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = 128,           # Pad & truncate all sentences.
                            truncation = True,
                            padding = 'max_length',
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the list.    
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])


    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels_ = torch.tensor(labels)

    # Print sentence 0, now as a list of IDs.
    print('Original: ', utterances[0])
    print('Token IDs:', input_ids[0])
    return input_ids, attention_masks, labels_

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

models = ["xlm-roberta-base", "microsoft/mdeberta-v3-base"]

models_finetuned = {
    "xlm-roberta-base": "/kaggle/input/models/xlm-roberta-MASSIVE-finetuned",
    "microsoft/mdeberta-v3-base": "/kaggle/input/models/mdeberta-MASSIVE-finetuned"
}

In [30]:
def seed_everything():
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    training_stats = []
    
def load_model_tokenizer(idx: int):
    model = torch.load(models_finetuned[models[idx]])
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(models[idx])
    return (model, tokenizer)

def test_model(idx: int):
    print("Seeding...")
    seed_everything()
    print("Seeded. Loading model and tokenizer...")
    model, tokenizer = load_model_tokenizer(idx)
    
    print("Loaded. Tokenizing...")
    input_ids_, attention_masks_, labels_ = tokenize(tokenizer)
    
    print("Tokenized. Generating dataloader...")
    dataset_ = TensorDataset(input_ids_, attention_masks_, labels_)
    dataloader = DataLoader(
            dataset_,
            sampler = SequentialSampler(dataset_),
            batch_size = 32)
    
    # Put in eval mode so weird things don't happen
    model.eval()
    
    metrics = {}
    print("Starting prediction")
    # Testing the data in batches
    batches_tqdm = tqdm(enumerate(dataloader), desc=f"Evaluation {models[idx]}", total=len(dataloader))
    
    metric = MulticlassAccuracy(num_classes=60).to(device)
    preds = np.array([])
    targets = np.array([])

    for _, batch in batches_tqdm:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            output= model(input_ids = b_input_ids, attention_mask = b_input_mask,labels = b_labels)

        logits = output.logits
        acc = metric(torch.argmax(logits, dim=1), b_labels)        
        preds = np.append(preds, torch.argmax(logits, dim=1).cpu().numpy())
        targets = np.append(targets, b_labels.cpu().numpy())

    accuracy = metric.compute()
    print(f"Accuracy = {accuracy.item()}")
    metrics['all'] = accuracy.item()
  

    for lang in lang_to_index_list.keys():
        metric = MulticlassAccuracy(num_classes=60)
        preds_ = torch.index_select(torch.tensor(preds), 0, torch.LongTensor(lang_to_index_list[lang]))
        targets_ = torch.index_select(torch.tensor(targets), 0, torch.LongTensor(lang_to_index_list[lang]))
        acc = metric(preds_, targets_)
        metrics[lang] = acc.item()
        print(f"For language {lang}, acc = {acc.item()}:\n {preds_}\n {targets_}\n")

    with open(f"metrics-{models[idx].replace('/','-')}.json", "w") as f:
        json.dump(metrics, f)

In [31]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [32]:
test_model(1)

Seeding...
Seeded. Loading model and tokenizer...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded. Tokenizing...
Original:  이번 주 오전 다섯 시 에 깨워줘
Token IDs: tensor([[     1,    260,  26229,   3920,   4559,   3228,   3730, 155166,   6464,
            260,    874,    260,  56580,  11110, 159035,      2,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      

Evaluation microsoft/mdeberta-v3-base:   0%|          | 0/4740 [00:00<?, ?it/s]

Accuracy = 0.8033444285392761
For language ko, acc = 0.8336948156356812:
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)

For language zh, acc = 0.8262818455696106:
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)

For language ru, acc = 0.8587443828582764:
 tensor([ 2.,  3., 46.,  ..., 15., 15., 15.], dtype=torch.float64)
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)

For language th, acc = 0.766815185546875:
 tensor([ 2.,  4., 23.,  ..., 15., 15., 17.], dtype=torch.float64)
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)

For language te, acc = 0.6881203055381775:
 tensor([ 2.,  4., 11.,  ..., 15., 15., 17.], dtype=torch.float64)
 tensor([ 2.,  4., 23.,  ..., 15., 15., 15.], dtype=torch.float64)

For language am, acc = 0.7904495000839233:
 tensor([ 2.,  4., 30.,  ..., 15., 15.

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
training_stats