In [2]:
# Cite: https://github.com/AI4Bharat/indic-bert

In [3]:
base_dir = '/home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/'

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=3)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.weight', 'predictions.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.bias', 'sop_classifier.classifier.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

In [5]:
# Data
kn_base_dir = base_dir + 'indicnlp-news-articles/kn/'
train_file = kn_base_dir + 'kn-train.csv'
test_file = kn_base_dir + 'kn-test.csv'
valid_file = kn_base_dir + 'kn-valid.csv'

In [6]:
from datasets import load_dataset, ClassLabel
dataset = load_dataset('csv', column_names=['label', 'sentence'], data_files={'train': train_file, 'test': test_file, 'validation': valid_file})
dataset

Using custom data configuration default-b318d9e5241e7c69
Reusing dataset csv (/home/jupyter-admin/.cache/huggingface/datasets/csv/default-b318d9e5241e7c69/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


DatasetDict({
    train: Dataset({
        features: ['label', 'sentence'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['label', 'sentence'],
        num_rows: 3000
    })
    validation: Dataset({
        features: ['label', 'sentence'],
        num_rows: 3000
    })
})

In [7]:
dataset_attributes = [k for k,v in dataset.items()]

In [8]:
# Class labeling & update the dataset
class_names = {l for l in dataset["train"]["label"]}
class_dict = {c:i for i,c in enumerate(class_names)}
print(class_dict)

def class_to_id(row):
    row["label"] = class_dict[row["label"]]
    return row

for attr in dataset_attributes:
    ds_attr = dataset[attr]
    # Overwrite the label column after converting string to int
    ds_attr = ds_attr.map(class_to_id)
    # Type cast the lable column
    new_features = ds_attr.features.copy()
    new_features["label"] = ClassLabel(names=list(class_names))
    ds_attr = ds_attr.cast(new_features)
    dataset[attr] = ds_attr

{'sports': 0, 'entertainment': 1, 'lifestyle': 2}


HBox(children=(FloatProgress(value=0.0, max=24000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [9]:
dataset['train'].features

{'label': ClassLabel(num_classes=3, names=['sports', 'entertainment', 'lifestyle'], names_file=None, id=None),
 'sentence': Value(dtype='string', id=None)}

In [10]:
# Tokenization
MAX_LENGTH=512
def tokenize_function(row):
    return tokenizer(row['sentence'], truncation=True, max_length=MAX_LENGTH)
                         
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'sentence', 'token_type_ids'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'sentence', 'token_type_ids'],
        num_rows: 3000
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'sentence', 'token_type_ids'],
        num_rows: 3000
    })
})

In [11]:
# Drop the unwanted columns
for attr in dataset_attributes:
    ds_attr = tokenized_datasets[attr]
    ds_attr = ds_attr.remove_columns(["sentence"])
    tokenized_datasets[attr] = ds_attr

In [12]:
# Data Collator
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# (Optional) Check the dimensions
for attr in dataset_attributes:
    print(attr)
    samps = tokenized_datasets[attr][:5]
    batch = data_collator(samps)
    display({k: v.shape for k, v in batch.items()})

train


{'attention_mask': torch.Size([5, 506]),
 'input_ids': torch.Size([5, 506]),
 'token_type_ids': torch.Size([5, 506]),
 'labels': torch.Size([5])}

test


{'attention_mask': torch.Size([5, 512]),
 'input_ids': torch.Size([5, 512]),
 'token_type_ids': torch.Size([5, 512]),
 'labels': torch.Size([5])}

validation


{'attention_mask': torch.Size([5, 512]),
 'input_ids': torch.Size([5, 512]),
 'token_type_ids': torch.Size([5, 512]),
 'labels': torch.Size([5])}

In [14]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [15]:
# Fine-tuning on Classification objective
from transformers import TrainingArguments, Trainer
CLS_MODEL_DIR = base_dir + 'model_cp_01'
TRAINING_STEPS=400000
EVALUATION_STEPS=10000
CHECKPOINT_SAVE_STEPS=5000
training_args = TrainingArguments(CLS_MODEL_DIR)
training_args = TrainingArguments(CLS_MODEL_DIR, 
                                  max_steps=TRAINING_STEPS,
                                  evaluation_strategy="steps",
                                  eval_steps=EVALUATION_STEPS,
                                  save_steps=CHECKPOINT_SAVE_STEPS)

In [None]:
# Trainer instance
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
import torch
print('GPU Available:{}'.format(torch.cuda.is_available()))
trainer.train()

***** Running training *****
  Num examples = 24000
  Num Epochs = 134
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 400000


GPU Available:True


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10000,0.1177,0.162879,0.964333,0.964283,0.964272,0.964333
20000,0.0964,0.18224,0.957667,0.957525,0.957699,0.957667
30000,0.1017,0.114583,0.971667,0.971778,0.972428,0.971667
40000,0.0635,0.106062,0.975333,0.975322,0.975502,0.975333
50000,0.0476,0.113259,0.976333,0.976334,0.976402,0.976333
60000,0.14,0.175414,0.965333,0.965249,0.966288,0.965333
70000,0.0871,0.139934,0.971,0.971038,0.971097,0.971
80000,0.0702,0.121687,0.971667,0.971725,0.972314,0.971667
90000,0.0708,0.143988,0.973,0.973049,0.973232,0.973
100000,0.0257,0.164783,0.974,0.973909,0.974493,0.974


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Saving model checkpoint to /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_cp_01/checkpoint-25000
Configuration saved in /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_cp_01/checkpoint-25000/config.json
Model weights saved in /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_

Configuration saved in /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_cp_01/checkpoint-95000/config.json
Model weights saved in /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_cp_01/checkpoint-95000/pytorch_model.bin
tokenizer config file saved in /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_cp_01/checkpoint-95000/tokenizer_config.json
Special tokens file saved in /home/sphi/work/darshan/kn-work/indicBERT-runs/classification-task/model_cp_01/checkpoint-95000/special_tokens_map.json
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order 

In [16]:
# Load the model and Predict on the test data
from transformers import pipeline
model = AutoModelForSequenceClassification.from_pretrained(CLS_MODEL_DIR + '/checkpoint-400000')

In [None]:
Below two cells are not working

In [32]:
# # Test data loader
# from torch.utils.data import DataLoader
# test_data_loader = DataLoader(tokenized_datasets["test"], shuffle=False, batch_size=8, collate_fn=data_collator)

In [39]:
# # Get the predictions on Test dataset
# for batch in test_data_loader:
#     predictions = model(**batch)
#     #TODO Accumulate the predictions here
#     print(predictions.loss, predictions.logits.shape)
#     break

# def evaluate(predictions):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

In [15]:
# Sample predictions
classifer_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
# Use the sentences from original test dataset
test_data = [
    "ಹೂಡಿಕೆ ಮಾಡಿದ ಹಣ ಹೆಚ್ಚು ವೇಗವಾಗಿ ಬೆಳೆಯುತ್ತದೆ ಎಂಬುದರಲ್ಲಿ ಅನುಮಾನ ಬೇಡ.",
    "ಹೀಗಾಗಿ ಈಗ ಸಕ್ರಿಯೆ ರಾಜಕೀಯಕ್ಕೆ ಬಂದಿದ್ದಾರೆ ಎಂದು ರಾಹುಲ್ ಹೇಳಿದ್ದಾರೆ.",
    "ಟೆನ್ ಬೈ ಫೋರ‍್ಟೀನ್‌ ಹಾಲ್‌ಗೆ ನಾನು ಕೆಮ್ಮುತ್ತಾ ಪ್ರವೇಶಿಸುತ್ತಿದ್ದಂತೆಯೇ ಅಲ್ಲಿದ್ದ ಎಲ್ಲರೂ ಅಲರ್ಟ್ ಆದರು."
]
result = classifer_pipeline(test_data)
print(result)
print('Labels for reference:', class_dict)

[{'label': 'LABEL_2', 'score': 0.9273571372032166}, {'label': 'LABEL_0', 'score': 0.9998754262924194}, {'label': 'LABEL_1', 'score': 0.9997266530990601}]
Labels for reference: {'lifestyle': 0, 'entertainment': 1, 'sports': 2}
