In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -qq transformers
!pip install -qq torchflare


[K     |████████████████████████████████| 2.5MB 7.6MB/s 
[K     |████████████████████████████████| 901kB 51.2MB/s 
[K     |████████████████████████████████| 3.3MB 49.7MB/s 
[K     |████████████████████████████████| 92kB 5.7MB/s 
[K     |████████████████████████████████| 735.5MB 24kB/s 
[K     |████████████████████████████████| 51.0MB 116kB/s 
[K     |████████████████████████████████| 17.3MB 333kB/s 
[K     |████████████████████████████████| 15.3MB 341kB/s 
[K     |████████████████████████████████| 10.3MB 176kB/s 
[K     |████████████████████████████████| 71kB 10.9MB/s 
[K     |████████████████████████████████| 3.0MB 50.9MB/s 
[K     |████████████████████████████████| 102kB 9.6MB/s 
[K     |████████████████████████████████| 9.9MB 45.6MB/s 
[K     |████████████████████████████████| 22.3MB 1.3MB/s 
[K     |████████████████████████████████| 38.2MB 88kB/s 
[31mERROR: torchtext 0.10.0 has requirement torch==1.9.0, but you'll have torch 1.8.0 which is incompatible.[0m
[31mER

In [17]:
import torchflare
import re
import torchflare.callbacks as cbs
import pandas as pd
import transformers
import torch
import sklearn.metrics as skm
from torchflare.experiments import Experiment, ModelConfig
import torchflare.criterion as crit
from torchflare.datasets import TextDataloader
from torchflare.metrics import MetricMeter, _BaseMetric
import pickle

In [2]:
def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", string)

In [3]:
def apply_preprocessing(df):
    df.text = df.text.apply(lambda x: x.lower())
    df.text = df.text.apply(
        lambda x: re.sub(r"^https?:\/\/.*[\r\n]*", "", x, flags=re.MULTILINE)
    )
    df.text = df.text.apply(lambda x: remove_emoji(x))
    return df 

In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/Hahakathon/train.csv")
valid_df = pd.read_csv("/content/drive/MyDrive/Hahakathon/dev.csv")

In [43]:
#Change Backbone name here.
BACKBONE_NAME = "prajjwal1/bert-tiny" 

In [44]:
tokenizer = transformers.AutoTokenizer.from_pretrained(BACKBONE_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=285.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [45]:
train_dl = TextDataloader.from_df(
    df=train_df,
    input_col="text",
    label_cols="is_humor",
    tokenizer=tokenizer,
    max_len=256,
).get_loader(batch_size=16, shuffle=True)

valid_dl = TextDataloader.from_df(
    df=valid_df,
    input_col="text",
    label_cols="is_humor",
    tokenizer=tokenizer,
    max_len=256,
).get_loader(batch_size=16, shuffle = False)

In [48]:
class SklearnF1(MetricMeter, _BaseMetric):
    def __init__(self):

        self.f1 = skm.f1_score
        self._outputs = None
        self._targets = None
        self.reset()

    def handle(self):
        return self.f1.__name__.lower()

    def accumulate(self, outputs: torch.Tensor, targets: torch.Tensor):
        """Method to accumulate the outputs and targets.
        Args:
            outputs(torch.Tensor) : raw logits from the network.
            targets(torch.Tensor) : Ground truth targets
        """
        outputs, targets = self.detach_tensor(outputs), self.detach_tensor(targets)
        outputs = torch.argmax(outputs, dim=1)
        # print(outputs)
        self._outputs.append(outputs)
        self._targets.append(targets)

    def reset(self):
        """Resets the accumulation lists."""
        self._outputs = []
        self._targets = []

    @property
    def value(self):

        outputs = torch.cat(self._outputs)
        targets = torch.cat(self._targets)
        f1_score = self.f1(targets.numpy(), outputs.numpy())
        return torch.tensor(f1_score)

In [64]:
class BackBoneModel(torch.nn.Module):
    def __init__(self, out_features, model_path):
        super(BackBoneModel, self).__init__()
        self.model = transformers.AutoModel.from_pretrained(
            model_path, return_dict=False
        )
        in_features = self.model.pooler.dense.out_features
        self.linear = torch.nn.Linear(in_features, out_features)
        #self.dropout = torch.nn.Dropout(0.2)

    def forward(self, x):

        _, o_2 = self.model(**x)
        #op = op.last_hidden_state[:, 0]
        #op = self.dropout(o_2)
        op = self.linear(o_2)
        return op

In [65]:
@cbs.on_experiment_end(order = cbs.CallbackOrder.EXTERNAL)
def save_pickle_experiment(experiment : "Experiment"):
    prefix = BACKBONE_NAME.split("/")[1]
    pickle.dump(experiment.history , open(f"{prefix}-experiment.pkl" , "wb"))

In [66]:
config = ModelConfig(
    nn_module=BackBoneModel,
    module_params={"out_features": 2, "model_path": BACKBONE_NAME},
    optimizer="AdamW",
    optimizer_params={"lr": 1e-5, "weight_decay":1e-3},
    criterion="cross_entropy",
)
callbacks = [
    cbs.ModelCheckpoint(
        file_name="bert-tiny.bin",
        save_dir="./",
        mode="max",
        monitor="val_f1_score",
    ),
    cbs.CosineAnnealingWarmRestarts(T_0=1),
    save_pickle_experiment
]
metric_list = [SklearnF1()]

In [67]:
bert_exp = Experiment(num_epochs=5, seed=42, fp16=True, device="cuda")

In [68]:
bert_exp.compile_experiment(model_config=config, callbacks=callbacks, metrics=metric_list)

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
bert_exp.fit_loader(train_dl, valid_dl)


Epoch: 1/5

Epoch: 2/5

Epoch: 3/5

Epoch: 4/5

Epoch: 5/5


In [70]:
test_df = pd.read_csv("/content/drive/MyDrive/Hahakathon/gold-test-27446.csv")

In [71]:
test_dl = TextDataloader.from_df(
    df=test_df,
    input_col="text",
    label_cols=None,
    tokenizer=tokenizer,
    max_len=256,
).get_loader(batch_size=16, shuffle = False)

In [72]:
outputs = []
for op in bert_exp.predict_on_loader(test_dl = test_dl, path_to_model= "/content/bert-small.bin", device = "cuda"):
    outputs.extend(torch.argmax(op , dim = 1))

In [73]:
test_f1 = skm.f1_score(test_df.is_humor.values , outputs)

In [74]:
test_f1

0.8717532467532467