# Text Classification using TinyBert
* Dataset: <https://www.kaggle.com/columbine/imdb-dataset-sentiment-analysis-in-csv-format>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os 

import torch
import torch.nn as nn

import transformers
import torchflare.callbacks as cbs
import torchflare.metrics as metrics
import torchflare.criterion as crit
from torchflare.experiments import Experiment
from torchflare.datasets import TextDataloader


In [2]:
# Read the training data from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="Train.csv")

In [3]:
# Hold out 10% of the rows for validation. Stratifying on the label column
# asks the splitter to keep the class proportions comparable in both parts;
# the fixed random_state makes the split repeatable.
train_df, valid_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["label"],
    random_state=42,
)

In [4]:
# Tokenizer for the same checkpoint name the model cell loads.
tokenizer = transformers.AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Arguments that are identical for the training and validation loaders.
shared_loader_args = dict(
    input_col="text",
    label_cols="label",
    tokenizer=tokenizer,
    max_len=128,
)

# Only the training loader is shuffled; the validation loader keeps the
# get_loader default.
train_dl = TextDataloader.from_df(df=train_df, **shared_loader_args).get_loader(
    batch_size=16, shuffle=True
)

valid_dl = TextDataloader.from_df(df=valid_df, **shared_loader_args).get_loader(
    batch_size=16
)

In [5]:
class Model(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear output layer.

    Args:
        dropout: Dropout probability applied to the encoder's second output
            before it reaches the linear layer.
        out_features: Number of values the linear layer emits per example.
        pretrained_name: Checkpoint name passed to
            ``transformers.BertModel.from_pretrained``. Defaults to
            ``"prajjwal1/bert-tiny"``, the checkpoint this notebook uses.
    """

    def __init__(self, dropout, out_features, pretrained_name="prajjwal1/bert-tiny"):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks positionally.
        self.bert = transformers.BertModel.from_pretrained(
            pretrained_name, return_dict=False
        )
        self.bert_drop = nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of a literal 128,
        # so the layer still matches if a checkpoint with a different hidden
        # width is loaded.
        self.out = nn.Linear(self.bert.config.hidden_size, out_features)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the encoder and project its second output through the head.

        Args:
            input_ids: Passed positionally to the encoder.
            attention_mask: Forwarded to the encoder as ``attention_mask``.
            token_type_ids: Forwarded to the encoder as ``token_type_ids``.

        Returns:
            The output of the linear layer applied to the dropped-out second
            element of the encoder's output tuple.
        """
        # The first tuple element is not needed; only the second one is used
        # (for BertModel this is presumably the pooled output -- see the
        # transformers documentation).
        _, second_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.out(self.bert_drop(second_output))


In [6]:
# Metrics reported during training: accuracy configured for two classes,
# non-multilabel.
metric_list = [
    metrics.Accuracy(num_classes=2, multilabel=False),
]

# Callbacks: early stopping and checkpointing both monitor "accuracy" in
# "max" mode; a plateau-based LR reducer; and a Neptune logger whose API token
# is read from the NEPTUNE_API_TOKEN environment variable (None when unset).
callbacks = [
    cbs.EarlyStopping(monitor="accuracy", patience=2, mode="max"),
    cbs.ModelCheckpoint(monitor="accuracy", mode="max"),
    cbs.ReduceLROnPlateau(mode="max", patience=2),
    cbs.NeptuneLogger(
        project_dir="notsogenius/dl-experiments",
        api_token=os.getenv("NEPTUNE_API_TOKEN"),
        experiment_name="IMDB_CLASSIFICATION",
        tags=["tiny-bert", "text_classification"],
    ),
]

In [12]:
# Use the GPU when one is available; otherwise fall back to CPU so the cell
# still runs instead of failing on the hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

exp = Experiment(
    num_epochs=3,
    fp16=(device == "cuda"),  # mixed precision is only requested on GPU
    device=device,
    seed=42,
)

# Compiling the experiment
exp.compile_experiment(
    module=Model,
    module_params={"dropout": 0.3, "out_features": 1},
    optimizer="AdamW",
    optimizer_params={"lr": 3e-4},
    # Attach the callbacks list built earlier in the notebook; without this
    # argument the early stopping, checkpointing, LR reduction and Neptune
    # logging configured there are never used.
    callbacks=callbacks,
    # Using BCEWithLogitsFlat so the shapes of outputs and targets do not
    # have to be handled manually.
    criterion=crit.BCEWithLogitsFlat,
    metrics=metric_list,
    main_metric="accuracy",
)

# Training the model.
exp.fit_loader(train_dl=train_dl, valid_dl=valid_dl)


Epoch: 1/3

Epoch: 2/3

Epoch: 3/3


In [13]:
# Display the logs recorded by the experiment; the output below has one row
# per epoch with train/validation loss and accuracy.
exp.get_logs()

Unnamed: 0,Epoch,train_loss,train_accuracy,val_loss,val_accuracy
0,1,0.44003,0.885407,0.390419,0.884267
1,2,0.285289,0.884076,0.371715,0.883471
2,3,0.180547,0.888953,0.44962,0.888294
