In [1]:
import pandas as pd
import numpy as np
import datasets
from tqdm.auto import tqdm
import os
from torch.utils.data import WeightedRandomSampler
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, DataCollatorWithPadding
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch.utils.data import DataLoader 

In [10]:
def load_data_and_rename_columns(path:str, rename_dict={}, drop_columns_dict={}, drop_congress_number=[])->pd.DataFrame:
    #load data
    df = pd.read_pickle(path)
    df = df.loc[~df['cong'].isin(drop_congress_number)].copy()
    df = df.rename(columns=rename_dict)
    df = df.drop(columns=drop_columns_dict)
    return df
def clear():
    os.system( 'cls' )
    
def train_validation_split(df, train_frac,eval_frac,random_seed):
    df_train = df.sample(frac=train_frac, random_state=random_seed)
    df_eval = df.sample(frac=eval_frac, random_state=random_seed)
    return df_train, df_eval

def create_dataset_object_from_pandas_dataframe(df,columns_to_be_removed):
    dataset = datasets.Dataset.from_pandas(df).remove_columns(columns_to_be_removed)
    return dataset

def tokenizer_function(bill):
    return tokenizer(bill["sentences"], truncation=True, padding="max_length")

def create_weighted_sampler(dataset):
    class_sample_count = np.array([len(np.where(dataset["labels"] == t)[0]) for t in np.unique(dataset["labels"])])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in dataset["labels"]])

    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler


def train(model, train_dataloader, eval_dataloader, loss_function, optimizer, num_epochs, lr_scheduler_function, device):
    

    model.train().to(device)
    num_training_steps = num_epochs * len(train_dataloader)
    progress_bar = tqdm(range(num_training_steps))
    lr_scheduler = get_scheduler(
        lr_scheduler_function,
        optimizer=optimizer,
        num_warmup_steps=num_training_steps/10,
        num_training_steps=num_training_steps
    )
    model.train()
    for _ in range(num_epochs):
        train_targs, train_preds = [], []
        val_targs, val_preds = [], []
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            loss = loss_function(outputs.logits, batch["labels"])
            loss.backward()
            optimizer.step()
            progress_bar.set_postfix(loss=loss.item())
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            
            predictions = torch.argmax(logits, dim=-1)            

            #Getting metrics
            train_targs += list(batch["labels"].cpu().numpy())
            train_preds += list(predictions.cpu().numpy())

            progress_bar.update(1)
        
        
        print('-----------Training Metrics-----------')
        print('Accuracy: {}'.format(accuracy_score(train_targs, train_preds)))
        print('F1: {}'.format(f1_score(train_targs, train_preds)))
        print('Precision: {}'.format(precision_score(train_targs, train_preds)))
        print('Recall: {}'.format(recall_score(train_targs, train_preds)))
        print('Confusion Matrix:')
        print(confusion_matrix(train_targs, train_preds))
        model.eval()
        
        with torch.no_grad():   
            for batch in eval_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)            

                #Getting metrics
                val_targs += list(batch["labels"].cpu().numpy())
                val_preds += list(predictions.cpu().numpy())

        print('-----------Validation Metrics-----------')
        print('Accuracy: {}'.format(accuracy_score(val_targs, val_preds)))
        print('F1: {}'.format(f1_score(val_targs, val_preds)))
        print('Precision: {}'.format(precision_score(val_targs, val_preds)))
        print('Recall: {}'.format(recall_score(val_targs, val_preds)))
        print('Confusion Matrix:')
        print(confusion_matrix(val_targs, val_preds))
        print('-' * 66)


In [43]:
print("Loading data...")
df = load_data_and_rename_columns('data/processed/bert_data.pickle',
            rename_dict={"status":"labels"},
            drop_columns_dict={'bill_id','cong'},
            drop_congress_number=[])

print("Combining list of sentences with [SEP] tokens...")
df['sentences'] = df['sentences'].apply(lambda x: '[SEP] '.join(x))

print("Training validation split...")
df_train, df_eval = train_validation_split(df, 0.8, 0.2, random_seed=3060)
# class_weights = list(float(x) for x in compute_class_weight('balanced', classes=df_train["labels"].unique(), y=df_train["labels"]))

print("Converting pandas df to dataset-object...")
dataset_train = create_dataset_object_from_pandas_dataframe(df_train, "__index_level_0__")
dataset_eval = create_dataset_object_from_pandas_dataframe(df_eval, "__index_level_0__")
dataset = datasets.DatasetDict({"train":dataset_train,"eval":dataset_eval})

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("Applying tokenizer")
dataset_tokenized = dataset.map(tokenizer_function, batched=True)
dataset_tokenized["train"] = dataset_tokenized["train"].remove_columns("sentences")
dataset_tokenized["eval"] = dataset_tokenized["eval"].remove_columns("sentences")
dataset_tokenized.set_format("torch") #converting lists to tensors

print("Creating dataloader with batches using a weighted sampler")
sampler = create_weighted_sampler(dataset_tokenized["train"])
train_dataloader = DataLoader(dataset_tokenized["train"], batch_size=6, drop_last=True, sampler=sampler)
eval_dataloader = DataLoader(dataset_tokenized["eval"], batch_size=6, drop_last = True)

print("Loading model")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

print("Training model")
#Arguments:
optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
num_epochs = 4
lr_scheduler_function = "linear"
# class_weights = torch.tensor(device=device)
loss_function = torch.nn.CrossEntropyLoss()

#Training function
train(model, train_dataloader, eval_dataloader,loss_function, optimizer, num_epochs, lr_scheduler_function, device)

print("Saving model")
torch.save(model, "results/BERT_finetuned_congress_103_115_4_epochs_80pct_train_20_val.pt")

print("Done")

Loading data...
Combining list of sentences with [SEP] tokens...
Training validation split...
Converting pandas df to dataset-object...
Loading tokenizer...
Applying tokenizer


100%|██████████| 18/18 [00:14<00:00,  1.25ba/s]
100%|██████████| 5/5 [00:02<00:00,  1.83ba/s]


Creating dataloader with batches using a weighted sampler
Loading model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training model


 25%|██▌       | 2948/11792 [1:08:44<1:18:12,  1.88it/s, loss=0.00281]

-----------Training Metrics-----------
Accuracy: 0.8368950701040253
F1: 0.8408978106215188
Precision: 0.8324052844196965
Recall: 0.8495654111878761
Confusion Matrix:
[[7179 1535]
 [1350 7624]]
-----------Validation Metrics-----------
Accuracy: 0.9647218453188603
F1: 0.6623376623376623
Precision: 0.504950495049505
Recall: 0.9622641509433962
Confusion Matrix:
[[4113  150]
 [   6  153]]
------------------------------------------------------------------


 50%|█████     | 5896/11792 [2:33:39<50:06,  1.96it/s, loss=0.00334]     

-----------Training Metrics-----------
Accuracy: 0.9803821800090456
F1: 0.9802019740971073
Precision: 0.9768023652490334
Recall: 0.9836253292110386
Confusion Matrix:
[[8751  204]
 [ 143 8590]]
-----------Validation Metrics-----------
Accuracy: 0.9771596562641339
F1: 0.7577937649880095
Precision: 0.6124031007751938
Recall: 0.9937106918238994
Confusion Matrix:
[[4163  100]
 [   1  158]]
------------------------------------------------------------------


 75%|███████▌  | 8844/11792 [3:58:36<1:31:30,  1.86s/it, loss=0.00107]   

-----------Training Metrics-----------
Accuracy: 0.99400723654455
F1: 0.9939380075488963
Precision: 0.992235670244348
Recall: 0.9956461961503208
Confusion Matrix:
[[8892   68]
 [  38 8690]]
-----------Validation Metrics-----------
Accuracy: 0.9972862957937585
F1: 0.9634146341463414
Precision: 0.9349112426035503
Recall: 0.9937106918238994
Confusion Matrix:
[[4252   11]
 [   1  158]]
------------------------------------------------------------------


100%|██████████| 11792/11792 [5:23:57<00:00,  1.45s/it, loss=0.000188]   

-----------Training Metrics-----------
Accuracy: 0.9983604703753958
F1: 0.998376714245732
Precision: 0.9978740069374511
Recall: 0.9988799283154122
Confusion Matrix:
[[8741   19]
 [  10 8918]]


100%|██████████| 11792/11792 [5:30:44<00:00,  1.68s/it, loss=0.000188]

-----------Validation Metrics-----------
Accuracy: 0.998869289914066
F1: 0.9845201238390092
Precision: 0.9695121951219512
Recall: 1.0
Confusion Matrix:
[[4258    5]
 [   0  159]]
------------------------------------------------------------------
Saving model





Done


In [47]:
df = load_data_and_rename_columns('data/processed/bert_data.pickle',
            rename_dict={"status":"labels"},
            drop_columns_dict={'bill_id','cong'},
            drop_congress_number=[])
df

Unnamed: 0,sentences,labels
0,"[This Act may be cited as the ""Public Housing ...",0
1,"[This Act may be cited as the ""Targeted Econom...",0
2,[The Administrator shall conduct a study of St...,0
3,[Medicaid Benefits Continued for 36 Months for...,0
4,"[This Act may be cited as the ""Expedited Consi...",0
...,...,...
22110,"[In this section the term ""Administrative Proc...",0
22111,[Section 222 of the Communications Act of 1934...,0
22112,"[This Act may be cited as the ""Safe Drinking W...",0
22113,"[This Act may be cited as the ""Preserving Data...",0


In [49]:
print("Loading data...")
df = load_data_and_rename_columns('data/processed/bert_data.pickle',
            rename_dict={"status":"labels"},
            drop_columns_dict={'bill_id','cong'})

print("Combining list of sentences with [SEP] tokens...")
df['sentences'] = df['sentences'].apply(lambda x: '[SEP] '.join(x))

print("Converting pandas df to dataset-object...")
dataset_test = create_dataset_object_from_pandas_dataframe(df, "__index_level_0__")
dataset_test = datasets.DatasetDict({"test":dataset_test})

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("Applying tokenizer")
dataset_test_tokenized = dataset_test.map(tokenizer_function, batched=True)
dataset_test_tokenized["test"] = dataset_test_tokenized["test"].remove_columns("sentences")
dataset_test_tokenized.set_format("torch") #converting lists to tensors

print("Creating dataloader")
test_dataloader = DataLoader(dataset_test_tokenized["test"], batch_size=6, drop_last=True)

test_targs, test_preds = [], []


print("Calculating predictions of the voting behaviour of the 115th congress")
with torch.no_grad():   
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)            

        #Getting metrics
        test_targs += list(batch["labels"].cpu().numpy())
        test_preds += list(predictions.cpu().numpy())

    print('-----------Test Metrics-----------')
    print('Accuracy: {}'.format(accuracy_score(test_targs, test_preds)))
    print('F1: {}'.format(f1_score(test_targs, test_preds)))
    print('Precision: {}'.format(precision_score(test_targs, test_preds)))
    print('Recall: {}'.format(recall_score(test_targs, test_preds)))
    print('Confusion Matrix:')
    print(confusion_matrix(test_targs, test_preds))
    print('-' * 66)


Loading data...
Combining list of sentences with [SEP] tokens...
Converting pandas df to dataset-object...
Loading tokenizer...
Applying tokenizer


100%|██████████| 23/23 [00:22<00:00,  1.03ba/s]


Creating dataloader
Calculating predictions of the voting behaviour of the 115th congress
-----------Test Metrics-----------
Accuracy: 0.9918136589778381
F1: 0.8747404844290658
Precision: 0.9362962962962963
Recall: 0.8207792207792208
Confusion Matrix:
[[21297    43]
 [  138   632]]
------------------------------------------------------------------


In [37]:
df = load_data_and_rename_columns('data/processed/bert_data.pickle',
            rename_dict={"status":"labels"},
            drop_columns_dict={'bill_id'},
            drop_congress_number=list(range(100,115)))

In [38]:
df

Unnamed: 0,sentences,labels,cong
17649,"[This Act may be cited as the ""Federal Executi...",0,115
17650,"[This Act may be cited as the ""CFPB Constituti...",0,115
17651,"[This Act may be cited as the ""Commute Less Ac...",0,115
17652,"[This Act may be cited as the ""Zero Waste Deve...",0,115
17653,"[This Act may be cited as the ""Synthetics Traf...",0,115
...,...,...,...
22110,"[In this section the term ""Administrative Proc...",0,115
22111,[Section 222 of the Communications Act of 1934...,0,115
22112,"[This Act may be cited as the ""Safe Drinking W...",0,115
22113,"[This Act may be cited as the ""Preserving Data...",0,115


In [25]:
dataset_test_tokenized

DatasetDict({
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'sentences'],
        num_rows: 1414
    })
})

## Using `Transformer` library

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# training_args = TrainingArguments("test_trainer")

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["eval"],
    tokenizer = tokenizer,
    data_collator = data_collator
)

trainer.train()


NameError: name 'TrainingArguments' is not defined

In [None]:
model(torch.tensor(dataset_tokenized["train"]["input_ids"][0]))

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
torch.tensor(dataset_tokenized["train"]["input_ids"][0])

tensor([  101,  2023,  2552,  2089,  2022,  6563,  2004,  1996,  1000, 17581,
         3171,  5386,  2552,  1000,  1012,   102,  2000,  2393,  6469,  1998,
         2490,  1996, 25954,  2797,  4753,  1999,  2430,  1998,  2789,  2885,
         2044,  1996,  2991,  1997,  1996,  4068,  2813,  1010,  3519,  1010,
         2083, 26465,  1997,  1996,  2490,  2005,  2264,  2647,  7072,  1006,
         6534,  1007,  2552,  1997,  2960,  1998,  1996,  4071,  2490,  2552,
         1010,  9362,  3053,  1002,  1015,  1010,  3263,  1010,  2199,  1010,
         2199,  2005,  1996,  2142,  2163,  4034,  2005,  2248,  2458,   102,
         1996,  3800,  1997,  1996,  9562,  1998, 11453,  1011,  2137,  6960,
         4636,  2003,  2000,  5326,  2062,  4235,  4207, 14165,  2083,  2797,
         4753,  2458,  1998,  1996,  6043,  1998,  6078,  9530,  8566,  6895,
         3726,  2045,  3406,  1999,  9562,  1998, 11453,  1010,  2164,  2083,
        10940,  1010, 12702,  4135,  6962,  1010, 10067, 10518, 

In [None]:
len(dataset_tokenized['train']['input_ids'][0])

512

In [None]:
dataset_tokenized["train"]

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'sentences'],
    num_rows: 6634
})