In [1]:
import transformers
import pandas as pd
from torch.utils.data import Dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import f1_score

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
config = {
    "max_length": 360,
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",
    
    "output_dir": "./my-model",
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "learning_rate": 3e-5,
    "epochs": 3,
    
    "debug": True,
}

In [3]:
class TextDataset:
    
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        enc = self.tokenizer(
            row["review"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )
        
        return {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
            "label": torch.tensor(row["label"]),
        }

In [4]:
df = pd.read_csv('./IMDB Dataset.csv')

In [5]:
id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

if config["debug"]:
    print("DEBUG MODE!")
    df = df.sample(10_000, random_state=123)

print(df.shape)
df.head()

DEBUG MODE!
(10000, 3)


Unnamed: 0,review,sentiment,label
11872,"This movie was beyond awful, it was a pimple o...",negative,0
40828,As of this writing John Carpenter's 'Halloween...,positive,1
36400,I must admit a slight disappointment with this...,positive,1
5166,Oh dear! The BBC is not about to be knocked of...,negative,0
30273,its a totally average film with a few semi-alr...,negative,0


In [6]:
tokeniser = transformers.AutoTokenizer.from_pretrained(config["model_path"])



In [7]:
train, valid = train_test_split(
    df,
    test_size=0.2,
    random_state=1123,
    shuffle=True,
    stratify=df["label"]
)

In [8]:
train_ds = TextDataset(train, tokeniser, config['max_length'])
valid_ds = TextDataset(valid, tokeniser, config['max_length'])

In [9]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(config['model_path'], num_labels=len(id2label))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def compute_metrics(eval_data):
    logits, labels = eval_data   
    preds = np.argmax(logits, -1)
    return {
        "f1": f1_score(labels, preds)
    }


In [11]:
training_args = transformers.TrainingArguments(
    output_dir=config["output_dir"],
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["valid_batch_size"],
    learning_rate=config["learning_rate"],
    num_train_epochs=config["epochs"],
    load_best_model_at_end=True,
)




In [12]:
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()
trainer.save_state()
trainer.save_model()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                 
 33%|███▎      | 125/375 [01:27<02:23,  1.74it/s]

{'eval_loss': 0.3595399856567383, 'eval_f1': 0.8752515090543259, 'eval_runtime': 9.1994, 'eval_samples_per_second': 217.406, 'eval_steps_per_second': 3.478, 'epoch': 1.0}


                                                 
 67%|██████▋   | 250/375 [02:56<01:15,  1.65it/s]

{'eval_loss': 0.32198113203048706, 'eval_f1': 0.8807247106190237, 'eval_runtime': 9.4739, 'eval_samples_per_second': 211.106, 'eval_steps_per_second': 3.378, 'epoch': 2.0}


                                                 
100%|██████████| 375/375 [04:21<00:00,  1.72it/s]

{'eval_loss': 0.3163885772228241, 'eval_f1': 0.8816658202133062, 'eval_runtime': 8.5724, 'eval_samples_per_second': 233.307, 'eval_steps_per_second': 3.733, 'epoch': 3.0}


100%|██████████| 375/375 [04:21<00:00,  1.43it/s]

{'train_runtime': 261.6531, 'train_samples_per_second': 91.724, 'train_steps_per_second': 1.433, 'train_loss': 0.36464139811197915, 'epoch': 3.0}





In [54]:
df = pd.read_csv("./IMDB Dataset.csv")

id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [55]:
model = transformers.AutoModelForSequenceClassification.from_pretrained("./my-model")

In [56]:
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=1123,
    shuffle=True,
    stratify=df["label"]
)

test_ds = TextDataset(test, tokeniser, config['max_length'])
dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=2,
)

In [91]:
a, b, c = test_ds[0].items()

In [58]:
model.to("cuda")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-1

In [94]:
for idx, batch in enumerate(test_ds):
    
    batch = {key: value.to('cuda') for key, value in batch.items()}
    with torch.no_grad():
        out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    
    if idx == 5:
        break

In [60]:
batch.items()

dict_items([('input_ids', tensor([  101,  1045,  2066,  4393,  5691,  1010,  1045,  2066,  1038,  1011,
         5691,  1010,  1045,  2293,  1038,  4393,  5691,  1012,  2021,  2023,
         2028,  2038,  3053,  2498,  2183,  2005,  2009,  1012,  2070,  1997,
         1996,  3772,  2003,  9202,  1010,  2926,  2011,  1017,  1997,  1996,
         3287,  5260,  1012,  1996,  2466,  2003,  2025,  3327,  5875,  1012,
         2012,  1037,  5816,  2460,  6070,  2781,  2009,  2145,  3849,  2205,
         2146,  1998,  2017,  1005,  2222,  2424,  4426,  3435,  1011,  2830,
         2075,  3243,  1037,  2978,  1012,  2045,  2024,  2019,  9643,  2843,
         1997, 18577,  1011, 11865,  4393,  4491,  1012,  2614,  4658,  1029,
         2009,  3475,  1005,  1056,  2043,  2009,  1005,  1055,  2589,  2006,
         1037,  2659,  5166,  1012,  2009,  4152, 23563,  2200,  2855,  1012,
         2045,  2003,  2070,  3576,  2668,  1998, 13638,  1010,  2498,  2000,
         2131,  7568,  2055,  1012,  2

In [84]:
pipe = transformers.pipeline(
    "text-classification",
    model="./my-model/",
    batch_size=4,
    device='cuda'
)

In [85]:
pipe(["I hate this", "This is wrong", "This is right", "Horrible. An utter waste of time", "Awesome movie", "Amazing, superb, fascinating. Enjoyed every minute of it"])

[{'label': 'LABEL_1', 'score': 0.5533989071846008},
 {'label': 'LABEL_0', 'score': 0.65948086977005},
 {'label': 'LABEL_1', 'score': 0.9100386500358582},
 {'label': 'LABEL_1', 'score': 0.5751544237136841},
 {'label': 'LABEL_1', 'score': 0.9028724431991577},
 {'label': 'LABEL_1', 'score': 0.9086151719093323}]