## Text Classification using torchflare.
***
* Dataset: https://www.kaggle.com/columbine/imdb-dataset-sentiment-analysis-in-csv-format

In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
import transformers

import torchflare.metrics as metrics
import torchflare.criterion as crit
import torchflare.callbacks as cbs
from torchflare.datasets import SimpleDataloader
from torchflare.experiments import Experiment

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
df = pd.read_csv("Train.csv")

<IPython.core.display.Javascript object>

In [5]:
df.text = df.text.apply(lambda x: x.lower())
train_df, valid_df = train_test_split(df, test_size=0.3)

<IPython.core.display.Javascript object>

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

train_dl = SimpleDataloader.text_data_from_df(
    df=train_df, input_col="text", label_cols="label", tokenizer=tokenizer, max_len=128
).get_loader(batch_size=16, shuffle=True, num_workers=0)

valid_dl = SimpleDataloader.text_data_from_df(
    df=valid_df, input_col="text", label_cols="label", tokenizer=tokenizer, max_len=128
).get_loader(batch_size=16, shuffle=False)

<IPython.core.display.Javascript object>

In [23]:
class Model(torch.nn.Module):
    def __init__(self):

        super(Model, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(
            "prajjwal1/bert-tiny", return_dict=False
        )
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(128, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _, o_2 = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        b_o = self.bert_drop(o_2)
        output = self.out(b_o)
        return output

<IPython.core.display.Javascript object>

In [24]:
model = Model()

<IPython.core.display.Javascript object>

In [25]:
metric_list = [metrics.Accuracy(num_classes=2, multilabel=False, threshold=0.6)]
callbacks = [
    cbs.EarlyStopping(monitor="accuracy", patience=5),
    cbs.ModelCheckpoint(monitor="accuracy"),
]
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

<IPython.core.display.Javascript object>

In [26]:
exp = Experiment(
    num_epochs=3,
    save_dir="./models",
    model_name="text_cls.bin",
    fp16=False,
    using_batch_mixers=False,
    device="cuda",
    compute_train_metrics=True,
    seed=42,
)

<IPython.core.display.Javascript object>

In [27]:
exp.compile_experiment(
    model=model,
    optimizer="AdamW",
    optimizer_params=dict(model_params=optimizer_parameters, lr=3e-4),
    callbacks=callbacks,
    scheduler="ReduceLROnPlateau",
    scheduler_params=dict(mode="max", patience=2),
    criterion=crit.BCEWithLogitsFlat,
    metrics=metric_list,
    main_metric="accuracy",
)

<IPython.core.display.Javascript object>

In [28]:
exp.perform_sanity_check(train_dl)

Sanity Check Completed. Model Forward Pass and Loss Computation Successful
Output Shape : torch.Size([16, 1])
Loss for a batch :0.7377960681915283


<IPython.core.display.Javascript object>

In [29]:
exp.run_experiment(train_dl=train_dl, valid_dl=valid_dl)

Epoch,train_loss,train_accuracy,val_loss,val_accuracy,Time
0,0.46017,0.77157,0.37027,0.83358,01:50
1,0.2893,0.87754,0.39725,0.82,01:58
2,0.17351,0.93443,0.44552,0.82067,01:59


<IPython.core.display.Javascript object>

In [30]:
test_df = pd.read_csv("test.csv")
test_df.text = test_df.text.apply(lambda x: x.lower())

<IPython.core.display.Javascript object>

In [31]:
test_dl = SimpleDataloader.text_data_from_df(
    df=test_df, input_col="text", label_cols=None, tokenizer=tokenizer, max_len=128
).get_loader(batch_size=16, shuffle=True)

<IPython.core.display.Javascript object>

In [32]:
ops = []
for op in exp.infer(path="./models/text_cls.bin", test_loader=test_dl):
    z = torch.sigmoid(op).numpy()
    ops.extend(z)

ops = np.concatenate(ops)

<IPython.core.display.Javascript object>