In [None]:
!pip install transformers
!pip install psycopg2-binary



In [None]:
import os

import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.sequence import pad_sequences
import psycopg2

In [None]:
def get_df(db_url) -> pd.DataFrame:
    """Load every row of the ``training`` table into a DataFrame.

    Args:
        db_url: Connection string passed straight to ``psycopg2.connect``.

    Returns:
        A DataFrame whose columns are named after the columns reported by
        the cursor for ``SELECT * FROM training``.

    The cursor and the connection are closed even when the query fails, so
    an error does not leave a database session open.
    """
    conn = psycopg2.connect(db_url)
    try:
        curs = conn.cursor()
        try:
            curs.execute("SELECT * FROM training;")
            # The first item of each description entry is the column name.
            cols = [k[0] for k in curs.description]
            rows = curs.fetchall()
        finally:
            curs.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)

In [None]:
# Credentials must not live in the notebook: supply the Postgres URL through
# the DATABASE_URL environment variable instead.
# NOTE: any password that was ever committed in this cell should be treated
# as leaked and rotated.
db_url = os.environ.get("DATABASE_URL")
if not db_url:
    raise RuntimeError("Set DATABASE_URL to the Postgres connection string.")
df = get_df(db_url)
df

Unnamed: 0,id,tweets,labels
0,6,A New Mexico State Police officer killed two p...,5
1,7,I found some police badge stickers at work and...,0
2,8,"According to media reports, police said they a...",1
3,9,The male who has the knife is now running nort...,0
4,10,Changing the assignments of already existing c...,0
...,...,...,...
6102,6108,In Detroit they are still having police treati...,2
6103,6109,#PoliceBrutality the neighbors would not have ...,2
6104,6110,Police cars revolving lightPolice violence has...,3
6105,6111,The police used unnecessary violence against d...,2


In [None]:
# index=False keeps the positional index out of the file, so it does not come
# back as a stray "Unnamed: 0" column when the CSV is read again.
df.to_csv('training.csv', index=False)

In [None]:
def bert_trainer(
    path: str,
    output_dir: str,
    epochs: int,
    max_len: int = 280,
    batch_size: int = 32,
    num_labels: int = 12,
):
    """Fine-tune ``bert-base-uncased`` for sequence classification and save it.

    Reads a CSV with a ``tweets`` column (text) and a ``labels`` column
    (class ids), trains on every row (there is no validation split), prints
    the average loss per epoch, and writes the fine-tuned model and the
    tokenizer to ``output_dir``.

    Args:
        path: CSV file to read; must contain ``tweets`` and ``labels``.
        output_dir: Directory that receives the saved model and tokenizer.
            Created if it does not exist.
        epochs: Number of full passes over the training data.
        max_len: Token length every example is truncated or padded to.
        batch_size: Number of examples per optimisation step.
        num_labels: Number of output classes of the classification head.

    Returns:
        None. The results are the files written to ``output_dir``.
    """
    if torch.cuda.is_available():
        print("CUDA Active")
        device = torch.device("cuda")
    else:
        print("CPU Active")
        device = torch.device("cpu")

    df = pd.read_csv(path)
    sentences = df["tweets"].values
    labels = df["labels"].values

    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased',
        do_lower_case=True,
    )
    # Let the tokenizer do the truncation so over-long texts keep their
    # closing [SEP] token; slicing the id list afterwards would cut it off.
    inputs = [
        tokenizer.encode(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
        )
        for sent in sentences
    ]
    inputs_ids = pad_sequences(
        inputs,
        maxlen=max_len,
        dtype="long",
        value=0,
        truncating="post",
        padding="post",
    )

    train_inputs = torch.tensor(inputs_ids)
    train_labels = torch.tensor(labels)
    # 0 is the padding value used above, so any non-zero id is a real token.
    train_masks = (train_inputs != 0).long()

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(
        train_data,
        sampler=RandomSampler(train_data),
        batch_size=batch_size,
    )

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    # Same device the batches are moved to below.
    model.to(device)

    # NOTE(review): newer transformers releases deprecate transformers.AdamW
    # in favour of torch.optim.AdamW - confirm before upgrading the dependency.
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    # Linear decay of the learning rate over the whole run, no warm-up.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    print('\nTraining...')
    for epoch_i in range(1, epochs + 1):
        print(f"\nEpoch: {epoch_i}")
        total_loss = 0
        model.train()
        for batch in train_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            # With labels supplied, the first output is the training loss.
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimiser step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average Loss: {avg_train_loss}")

    os.makedirs(output_dir, exist_ok=True)
    print(f"\nSaving model to {output_dir}")
    # Unwrap the model if it has been wrapped in an object exposing `.module`.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Finished!")

In [None]:
# Fine-tune on the exported CSV; the model and tokenizer are written to ./saved_model.
bert_trainer("training.csv", 'saved_model', epochs=50)

CUDA Active


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Training...

Epoch: 1
Average Loss: 0.9427920916941778

Epoch: 2
Average Loss: 0.5130270653994295

Epoch: 3
Average Loss: 0.3999939202404147

Epoch: 4
Average Loss: 0.31254229343530393

Epoch: 5
Average Loss: 0.23405990437531346

Epoch: 6
Average Loss: 0.19103372527081616

Epoch: 7
Average Loss: 0.15396156801759261

Epoch: 8
Average Loss: 0.13505141779351296

Epoch: 9
Average Loss: 0.12124714727820682

Epoch: 10
Average Loss: 0.11290303208860349

Epoch: 11
Average Loss: 0.09822823303746771

Epoch: 12
Average Loss: 0.08058960462739954

Epoch: 13
Average Loss: 0.07493957937256243

Epoch: 14
Average Loss: 0.0720008559395798

Epoch: 15
Average Loss: 0.06488169519918244

Epoch: 16
Average Loss: 0.06352500006476057

Epoch: 17
Average Loss: 0.059061573447137874

Epoch: 18
Average Loss: 0.052373448522098326

Epoch: 19
Average Loss: 0.05355791257589529

Epoch: 20
Average Loss: 0.048830773622889796

Epoch: 21
Average Loss: 0.04614090703303453

Epoch: 22
Average Loss: 0.04646028986740711

Epoch:

In [None]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification


class FrankenBert:
    """
    Wraps a saved BertForSequenceClassification model and its BertTokenizer
    to classify one piece of text into one of the model's classes.
    """

    def __init__(self, path: str):
        """
        Loads model and tokenizer from the saved model directory (path) and
        moves the model to the GPU when one is available, otherwise the CPU.
        """
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = BertForSequenceClassification.from_pretrained(path)
        self.tokenizer = BertTokenizer.from_pretrained(path)
        self.model.to(self.device)
        # This class only runs inference, so make sure dropout is disabled.
        self.model.eval()

    def predict(self, text: str):
        """
        Classifies a single text with the saved model.

        Returns a string of the form "Rank: <class index>, <confidence>%",
        where the confidence is the softmax probability of the chosen class.
        """
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=280,
            return_tensors='pt',
        ).to(self.device)
        # No gradients are needed for inference; skipping them avoids
        # building the autograd graph.
        with torch.no_grad():
            output = self.model(**inputs)
        # The first output holds the logits; row 0 is the single input text.
        probabilities = output[0].softmax(1)[0].cpu().numpy()
        result = int(np.argmax(probabilities))
        confidence = probabilities[result]
        return f"Rank: {result}, {100 * confidence:.2f}%"

In [None]:
# Load the fine-tuned model and tokenizer from the saved_model directory.
model = FrankenBert('saved_model')

In [None]:
model.predict("Mickey Mouse is in the house")

'Rank: 0, 99.99%'

In [None]:
model.predict("Cops gave me a speeding ticket for walking too fast")

'Rank: 1, 99.98%'

In [None]:
model.predict("The cops showed up but didn't do anything")

'Rank: 1, 99.95%'

In [None]:
model.predict("Cops held that guy with a neck hold")

'Rank: 2, 95.76%'

In [None]:
model.predict("Cops punched me and pushed me to the ground")

'Rank: 3, 99.87%'

In [None]:
model.predict("Cops hit her with a baton")

'Rank: 3, 99.81%'

In [None]:
model.predict("Cops sprayed my mom with pepper spray")

'Rank: 4, 99.93%'

In [None]:
model.predict("Cops shot rubber bullets at the crowd")

'Rank: 5, 67.64%'

In [None]:
model.predict("Police used tear gas on a pedestrian for no reason")

'Rank: 4, 99.95%'

In [None]:
model.predict("Cops killed that women")

'Rank: 5, 93.78%'

In [None]:
model.predict("Yesterday I saw a policeman hit a poor person behind my house. I wonder whats going on")

'Rank: 1, 97.10%'

In [None]:
model.predict("Man ran up to me and pepper sprayed me. I've called the cops, but they have not gotten themselves involved yet.")

'Rank: 4, 99.91%'

In [None]:
# Method name fixed: `prdict` does not exist on FrankenBert.
model.predict("People gathered to protest. Cops show up and are using batons to disperse the gathering.")