In [1]:
!pip install transformers
!pip install datasets #needed for loading metric

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 18.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 64.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 83.4 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 5.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

##Import needed library

In [2]:
import transformers 
import torch
print(transformers.__version__) #print th tranformer version
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer   #the API for training model in transormer since PyTorch does not provide a training loop
from sklearn.model_selection import train_test_split

4.10.2


##Loading and split the dataset

In [19]:
import pandas as pd
df = pd.read_csv('new_data.csv', usecols=['label', 'feedback'])#, nrows=100)

#  take a sample for easy pipeline testing
df = df.sample(1000)

#fill missing value has this causes runtime error while fiting the model 
print(df.isna().sum())
df['feedback'].fillna('Nothing', inplace = True)
df.isna().sum()

label       0
feedback    0
dtype: int64


label       0
feedback    0
dtype: int64

#Preprocessing the data

1. Encode the label

2. tokenize the text feature

3. Combine the label and text together and convert them into a Dataset object

In [22]:
# Encode the label
candidate_labels = set(df.label)
for i, l in enumerate(set(df.label)):   # to automate the above
  df.loc[df.label == l, 'label'] = i

# Split the dataset
seed = 0
train_texts, train_labels = list(df.feedback), list(df.label)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=.2, random_state=seed) # create test set
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2, random_state=seed)   # create validation set 

# Define model_checkpoint 
model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = 'valhalla/distilbart-mnli-12-9'

# Tokenize the test feauture 
# (truncation=True, padding=True will ensure that all of our sequences are padded
# to the same length and are truncated to be no longer than model’s maximum input length. 
# This will allow us to feed batches of sequences into the model at the same time)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, num_labels=len(candidate_labels))
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Turn the labels and encodings into a Dataset object (using pytorch). 
class feedbkDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])                                     # encode the label
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = feedbkDataset(train_encodings, train_labels)
val_dataset = feedbkDataset(val_encodings, val_labels)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 

The steps above prepared the datasets in the way that the trainer is expected. Now all we need to do is create a model to fine-tune, define the TrainingArguments/TFTrainingArguments and instantiate a Trainer/TFTrainer.

#Fine-tuning the model

### Define our model

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(candidate_labels))
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(candidate_labels))

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 

### Instantiate a TrainingArguments 
to hold all the hyperparameters we can tune for the Trainer


In [None]:
# metric_name = 'accuracy'
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=5,   # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

### Define a function for evaluating the model
needs to takes predictions and labels (grouped in a namedtuple called EvalPrediction)

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

### Instantiate a trainer

In [None]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
    # compute_metrics=compute_metrics,   # define compute_metrics function
)

### Train the model 
to do the actual fine tunning of the model

In [None]:
trainer.train()

### See the model evaluation metrics 

In [None]:
trainer.evaluate()
   
#predictions = trainer.predict(tokenized_test)["logits"]  #device=0

#Hyperparameter search

Ignore this section for now

The hyperparameter_search method returns a BestRun objects, which contains the value of the objective maximized (by default the sum of all metrics) and the hyperparameters it used for that run.

In [None]:
#Hyperparameter search (The Trainer supports hyperparameter search using optuna or Ray Tune. )
! pip install optuna
! pip install ray[tune]    #run either


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

#needed because the Trainer will run several trainings
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(candidate_labels))

trainer = Trainer(
    model=model_init,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    # compute_metrics=compute_metrics,   # define compute_metrics function
    # tokenizer=tokenizer,                 # Note sure if necessary
)

best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")   #ustomize the search space by passing a hp_space argument
best_run

In [None]:
#to reproduce the best training, just set the hyperparameters in your TrainingArgument before creating a Trainer:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

In [None]:
#EvalPrediction
from datasets import load_metric
metric = load_metric('glue', actual_task)
metric.compute(predictions=predictions, references=labels)

# Possibly needed

In [None]:
from datasets import Dataset        # needed to make touch dataset
train_dt = Dataset.from_dict(                                    # convert to touch dataset
        {
            "text": list(df.feedback),
            "labels": list(df.label),
        }
      )

train_dt2 = train_dt.map(lambda batch: tokenizer(batch["text"], truncation=True, padding=True), batched=True)
# train_dt2.set_format("torch", columns=["input_ids", "attention_mask", "labels"])     # needed to determine which columns and in what data format we want to access dataset elements (not necessary from my observation so far)

trainer2 = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(candidate_labels)),                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dt2,         # training dataset
    eval_dataset=train_dt2,             # evaluation dataset
    # compute_metrics=compute_metrics,   # define compute_metrics function
)
trainer2.train()