In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import model_selection, metrics
import torch
import transformers


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [6]:
config = {
    "max_length": 360,
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",
    
    "output_dir": "./my-model",
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "learning_rate": 3e-5,
    "epochs": 3,
    
    "debug": True,
}

In [7]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])
class TextDataset:

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        enc = enc = tokenizer(
            row["text"],
            add_special_tokens=True,
            max_length=config["max_length"],
            padding="max_length",
            truncation=True
        )

        return {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
            "label": torch.tensor(row["label"]),
        }

config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [10]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv').rename(columns={"review": "text"})
df.head()

Unnamed: 0,text,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

df["label"] = df["sentiment"].map(label2id)

if config["debug"]:
    print("DEBUG MODE!")
    df = df.sample(10_000, random_state=123)

print(df.shape)
df.head()

DEBUG MODE!
(10000, 3)


Unnamed: 0,text,sentiment,label
11872,"This movie was beyond awful, it was a pimple o...",negative,0
40828,As of this writing John Carpenter's 'Halloween...,positive,1
36400,I must admit a slight disappointment with this...,positive,1
5166,Oh dear! The BBC is not about to be knocked of...,negative,0
30273,its a totally average film with a few semi-alr...,negative,0


In [12]:
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])



In [13]:
train, valid = model_selection.train_test_split(
    df,
    test_size=0.2,
    random_state=23,
    shuffle=True,
    stratify=df["label"]
)

In [14]:
train_ds = TextDataset(train)
valid_ds = TextDataset(valid)

In [15]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(config["model_path"])

pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def compute_metrics(eval_data):
   
    preds = eval_data.predictions.argmax(-1)
    labels = eval_data.label_ids 
    print(eval_data)
    print(preds)
    print(labels)

    return {
        'accuracy': metrics.accuracy_score(labels, preds),
        'precision': metrics.precision_score(labels, preds),
        'recall': metrics.recall_score(labels, preds),
        'classification_report': metrics.classification_report(labels, preds, target_names=list(id2label.values()), output_dict=True)




    }

training_args = transformers.TrainingArguments(
     output_dir="./results",                      # Directory for storing results
    eval_strategy="steps",                 # Evaluate every few steps
    per_device_train_batch_size=config['train_batch_size'],              # Batch size per device during training
    per_device_eval_batch_size=config['train_batch_size'],               # Batch size per device during evaluation
    num_train_epochs=config['epochs'],                          # Total number of training epochs
    warmup_steps=500,                            # Number of warmup steps for learning rate scheduler
    save_total_limit=2,
    logging_dir=None,                            # Disable logging directory
    logging_strategy="no",
    report_to=[]# Limit the total amount of checkpoints`

)

In [18]:
trainer = transformers.Trainer(
    model=model,                                 # The model to be trained
    args=training_args,                          # The training arguments, defined above
    train_dataset=train_ds,                 # The training dataset
    eval_dataset=valid_ds,                   # The evaluation dataset
    tokenizer=tokenizer,                         # The tokenizer
    compute_metrics=compute_metrics, 
)

In [19]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss


TrainOutput(global_step=189, training_loss=0.5647390860098379, metrics={'train_runtime': 114.9602, 'train_samples_per_second': 208.768, 'train_steps_per_second': 1.644, 'total_flos': 249110795520000.0, 'train_loss': 0.5647390860098379, 'epoch': 3.0})

In [20]:
trainer.save_state()

In [21]:
trainer.save_model()