# Acquire the data
- This notebook is supposed to be run using Google Colab
- The notebook will be accessing the datasets from Google Drive
- To get started:
  - Download the data from https://zindi.africa/hackathons/to-vaccinate-or-not-to-vaccinate-its-not-a-question/data
  - Create a folder in which you would like to add your data e.g: `vaccinate`
  - Upload the data to a sub-folder called `data`, e.g. `vaccinate/data`
  - Create two sub-folders called `models` and `submissions` e.g: `vaccinate/models` and `vaccinate/submissions`
  - Add the pre-trained model to the `models` directory, or fine-tune the model and save it there (this notebook loads the weights from `models/bert-large-cased-0`)
- Now, you are ready to run the notebook

In [1]:
from google.colab import drive
# Mount Google Drive at /drive/ so the project folder can be reached as a regular path.
# Colab prompts for an authorization code when this cell runs.
drive.mount('/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /drive/


In [2]:
% cd "/drive/My Drive/vaccinate"

/drive/My Drive/vaccinate


In [3]:
# The version is pinned on purpose: the tokenizer calls in this notebook pass
# `pad_to_max_length`, so confirm compatibility before upgrading transformers.
! pip install transformers==2.8.0

Collecting transformers==2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 23.0MB/s eta 0:00:01[K     |█▏                              | 20kB 30.1MB/s eta 0:00:01[K     |█▊                              | 30kB 27.6MB/s eta 0:00:01[K     |██▎                             | 40kB 14.0MB/s eta 0:00:01[K     |███                             | 51kB 11.8MB/s eta 0:00:01[K     |███▌                            | 61kB 11.9MB/s eta 0:00:01[K     |████                            | 71kB 11.2MB/s eta 0:00:01[K     |████▋                           | 81kB 11.6MB/s eta 0:00:01[K     |█████▎                          | 92kB 11.6MB/s eta 0:00:01[K     |█████▉                          | 102kB 11.7MB/s eta 0:00:01[K     |██████▍                         | 112kB 11.7MB/s eta 0:00:01[K     |███████                    

In [0]:
import torch
import random
import numpy as np
seed = 42


def set_seed(seed):
    """Seed every random number generator used in this notebook.

    The same integer is handed, in order, to Python's ``random`` module,
    NumPy's global generator, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(seed)

# Load the models

In [0]:
import numpy as np
from scipy.special import softmax
from torch import nn, optim, tensor, argmax
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

class CustomDataset(Dataset):
    """Expose a pandas DataFrame of tweets through the interface PyTorch's DataLoader consumes.

    Every item is a ``(token_ids, label)`` pair, where ``token_ids`` is a tensor
    holding the tokenizer's encoding of the row's ``safe_text`` column.
    """

    def __init__(self, dataframe, model_name, max_seq_length=50, pad_to_max_length=True, is_test=False):
        self.len = len(dataframe)
        self.data = dataframe
        self.max_seq_length = max_seq_length
        self.pad_to_max_length = pad_to_max_length
        # The tokenizer is looked up by the same name as the model it feeds.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.is_test = is_test

    def encode(self, sentence):
        """Return the token ids of ``sentence``.

        The configured ``max_seq_length`` and ``pad_to_max_length`` values are
        passed straight through to the tokenizer.
        """
        # [CLS] and [SEP] are added by the tokenizer by default.
        encoding_options = {
            'max_length': self.max_seq_length,
            'pad_to_max_length': self.pad_to_max_length,
        }
        return self.tokenizer.encode(sentence, **encoding_options)

    def __getitem__(self, index):
        """Return ``(token id tensor, label)`` for the row at position ``index`` (DataLoader hook)."""
        row = self.data.iloc[index]
        text = row['safe_text']
        # Test rows get the placeholder label -1. Otherwise the labels are shifted
        # from [-1, 0, 1] to [0, 1, 2], because PyTorch doesn't support -1 as a label.
        label = -1 if self.is_test else 1 + int(row['label'])
        return tensor(self.encode(text)), label

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame (DataLoader hook)."""
        return self.len

class BertModel:
    """Wrapper around a Hugging Face sequence-classification model.

    Provides fine-tuning (`fit`), hard predictions (`predict`), expected-value
    predictions (`predict_prob`) and saving of the weights (`save_model`).

    The three output classes [0, 1, 2] stand for the dataset labels [-1, 0, 1];
    the shift is applied because PyTorch doesn't support -1 as a label.
    """

    def __init__(self, model_name, config):
        """Load the pre-trained model and tokenizer registered under `model_name`.

        Args:
            model_name: Name passed to the `from_pretrained` loaders.
            config: Model configuration forwarded to the model loader.
        """
        self.model_name = model_name
        # Use the GPU when one is available, otherwise fall back to the CPU so the
        # class also works on machines without CUDA.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, config=config).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def get_params(self):
        """Return the model parameters (handed to the optimizer in `fit`)."""
        return self.model.parameters()

    def _inference_dataloader(self, df, params=None):
        """Build the label-free DataLoader shared by `predict` and `predict_prob`."""
        if not params:
            params = {'batch_size': 100,
                      'shuffle': False,
                      'drop_last': False,
                      'num_workers': 1}
        return DataLoader(
            CustomDataset(df, self.model_name, max_seq_length=50, pad_to_max_length=True, is_test=True),
            **params)

    def fit(self, train_df, epochs=4, learning_rate=1e-05, weight_decay=1e-5, params=None):
        """Fine-tune the model on `train_df` with Adam and cross-entropy loss.

        Args:
            train_df: DataFrame with a `safe_text` column and a `label` column.
            epochs: Number of passes over `train_df`.
            learning_rate: Learning rate given to Adam.
            weight_decay: Weight decay given to Adam.
            params: Keyword arguments for the DataLoader; defaults to batches
                of 30 without shuffling.

        Prints the average loss of the batches seen since the previous report
        whenever the batch number is a multiple of 100.
        """
        if not params:
            # NOTE(review): shuffling is off by default, so every epoch visits the
            # batches in the same order - confirm this is intended.
            params = {'batch_size': 30,
                      'shuffle': False,
                      'drop_last': False,
                      'num_workers': 1}

        train_dataloader = DataLoader(
            CustomDataset(train_df, self.model_name, max_seq_length=50, pad_to_max_length=True),
            **params)

        # nn.MSELoss() was also used but didn't improve the results
        loss_function = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=self.get_params(), lr=learning_rate, weight_decay=weight_decay)

        self.model.train()
        report_every = 100

        for epoch in range(epochs):
            # Restart the running window every epoch so losses left over from the
            # previous epoch never leak into the first report of the next one.
            running_loss = 0.0
            batches_in_window = 0
            for batch_number, (tokens, labels) in enumerate(train_dataloader):
                tokens = tokens.to(self.device)
                labels = labels.to(self.device)

                optimizer.zero_grad()
                # Forward pass (only token ids are passed; no attention mask is supplied)
                outputs = self.model(tokens)[0]
                loss = loss_function(outputs, labels)
                # Backward pass
                loss.backward()
                optimizer.step()

                # .item() yields a plain float; adding the tensor itself would keep
                # autograd history alive across iterations.
                running_loss += loss.item()
                batches_in_window += 1
                if batch_number % report_every == 0:
                    # Divide by the number of batches actually accumulated (1 at batch 0).
                    average_loss = running_loss / batches_in_window
                    print(f'Epoch {epoch}: Batch {batch_number} -> Average loss {average_loss}')
                    running_loss = 0.0
                    batches_in_window = 0

    def predict(self, df, params=None):
        """Return the most probable label in [-1, 0, 1] for every row of `df`.

        Args:
            df: DataFrame with a `safe_text` column.
            params: Keyword arguments for the DataLoader; defaults to batches
                of 100 without shuffling.

        Returns:
            A flat list with one integer label per row.
        """
        dataloader = self._inference_dataloader(df, params)
        predictions = []
        self.model.eval()
        with torch.no_grad():
            for tokens, _ in dataloader:
                outputs = self.model(tokens.to(self.device))[0]
                predictions.extend(argmax(outputs, dim=1).tolist())
        # Pytorch doesn't support using -1 as a label
        # Thus, labels are mapped to [0, 1, 2] instead of [-1, 0, 1] on training the model
        # They are remapped again to [-1, 0, 1] by subtracting one
        return [p - 1 for p in predictions]

    def predict_prob(self, df, params=None):
        """Return the expected label value for every row of `df`.

        If the probabilities of labels [-1, 0, 1] are [P-1, P0, P1], the score
        is (-1 * P-1) + (0 * P0) + (1 * P1). This is used since the evaluation
        criterion was least squared error and not F1 score.

        Args:
            df: DataFrame with a `safe_text` column.
            params: Keyword arguments for the DataLoader; defaults to batches
                of 100 without shuffling.

        Returns:
            A list with one inner list of float scores per batch; flatten it to
            obtain one score per row.
        """
        dataloader = self._inference_dataloader(df, params)
        predictions = []
        self.model.eval()
        # Column vector of label values, so probabilities @ label_scores is the expectation.
        label_scores = np.array([[-1, 0, 1]]).T
        with torch.no_grad():
            for tokens, _ in dataloader:
                outputs = self.model(tokens.to(self.device))[0]
                probabilities = softmax(outputs.cpu().detach().numpy(), axis=1)

                # (batch, 3) @ (3, 1) -> (batch, 1); transposed into a single row per batch
                scores = np.matmul(probabilities, label_scores).T.tolist()
                predictions.extend(scores)
        return predictions

    def save_model(self, directory):
        """Save the model's state dict to `<directory>/<model_name>`.

        Missing parent directories are created first.
        """
        # Imported here because the notebook never imports Path in any cell.
        from pathlib import Path

        target = Path(directory, self.model_name)
        target.parent.mkdir(parents=True, exist_ok=True)
        torch.save(self.model.state_dict(), str(target))

In [0]:
# Three output classes: the dataset labels [-1, 0, 1] are shifted to [0, 1, 2]
# before training, so ids and labels coincide here.
labels = list(range(3))
label_map = dict(zip(labels, labels))
num_labels = len(labels)
config = AutoConfig.from_pretrained(
    'bert-large-cased',
    num_labels=num_labels,
    id2label=label_map,
    label2id={label: index for index, label in enumerate(labels)},
)
full_model = BertModel('bert-large-cased', config)

HBox(children=(IntProgress(value=0, description='Downloading', max=625, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=1338740706, style=ProgressStyle(description…




In [0]:
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint saved on a GPU be read on any machine;
# load_state_dict then copies the values onto whatever device the model lives on.
state_dict = torch.load('models/bert-large-cased-0', map_location='cpu')

In [8]:
# Copy the checkpoint weights into the wrapped model; the cell output reports
# whether every key in the state dict matched.
full_model.model.load_state_dict(state_dict)

<All keys matched successfully>

# Load the datasets

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [0]:
# Drop rows with missing values, then split into train/dev sets with a fixed
# random_state so the split is the same on every run.
# NOTE(review): train_df and dev_df are not used by the later cells shown in this
# notebook, which only score the test set with the loaded checkpoint.
train_df, dev_df = train_test_split(pd.read_csv('data/Train.csv').dropna(), random_state=42)

In [0]:
# Tweets to score; joined to the sample submission on `tweet_id` further down.
test_df = pd.read_csv('data/Test.csv')

# Generate final scores

In [0]:
sample_submission = pd.read_csv('data/SampleSubmission.csv')
# Left join keeps every row (and the row order) of the sample submission.
submission_df = sample_submission.merge(test_df, how='left', on='tweet_id')
# Replace missing tweet text with an empty string so every row can be tokenized.
# The result is assigned back: calling fillna(inplace=True) on a column selection
# is chained assignment and does not update the frame under pandas Copy-on-Write.
submission_df['safe_text'] = submission_df['safe_text'].fillna('')

In [0]:
# predict_prob returns one inner list of scores per batch; flatten them in order
# so there is one score per row.
submission_pred = [
    score
    for batch_scores in full_model.predict_prob(submission_df)
    for score in batch_scores
]

In [0]:
# Store the expected-value scores in the submission's `target` column.
submission_df['target'] = submission_pred

In [0]:
# Write only the id and score columns, without the DataFrame index.
submission_df[['tweet_id', 'target']].to_csv('submissions/submission.csv', index=False)