In [1]:
import pandas as pd
import numpy as np
import datasets
from tqdm.auto import tqdm
import os
from torch.utils.data import WeightedRandomSampler
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, roc_auc_score, average_precision_score 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler, DataCollatorWithPadding
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch.utils.data import DataLoader 
os.chdir("C:/Users/espen/Documents/SDS/deeplearningproject")

## Helper functions

In [2]:
def load_data_and_rename_columns(path:str, rename_dict={}, drop_columns_dict={}, drop_congress_number=[])->pd.DataFrame:
    #load data
    df = pd.read_pickle(path)
    df = df.loc[~df['cong'].isin(drop_congress_number)].copy()
    df = df.rename(columns=rename_dict)
    df = df.drop(columns=drop_columns_dict)
    return df
    
def train_validation_split(df, train_frac,eval_frac,random_seed):
    df_train = df.sample(frac=train_frac, random_state=random_seed)
    df_eval = df.sample(frac=eval_frac, random_state=random_seed)
    return df_train, df_eval

def create_dataset_object_from_pandas_dataframe(df,columns_to_be_removed):
    dataset = datasets.Dataset.from_pandas(df).remove_columns(columns_to_be_removed)
    return dataset

def tokenizer_function(bill):
    return tokenizer(bill["sentences"], truncation=True, padding="max_length")

def create_weighted_sampler(dataset):
    class_sample_count = np.array([len(np.where(dataset["labels"] == t)[0]) for t in np.unique(dataset["labels"])])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in dataset["labels"]])

    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler


def train(model, train_dataloader, eval_dataloader, loss_function, optimizer, num_epochs, lr_scheduler_function, device):
    

    model.train().to(device)
    num_training_steps = num_epochs * len(train_dataloader)
    progress_bar = tqdm(range(num_training_steps))
    lr_scheduler = get_scheduler(
        lr_scheduler_function,
        optimizer=optimizer,
        num_warmup_steps=num_training_steps/10,
        num_training_steps=num_training_steps
    )
    model.train()
    for _ in range(num_epochs):
        train_targs, train_preds = [], []
        val_targs, val_preds = [], []
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            loss = loss_function(outputs.logits, batch["labels"])
            loss.backward()
            optimizer.step()
            progress_bar.set_postfix(loss=loss.item())
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            
            predictions = torch.argmax(logits, dim=-1)            

            #Getting metrics
            train_targs += list(batch["labels"].cpu().numpy())
            train_preds += list(predictions.cpu().numpy())

            progress_bar.update(1)
        
        
        print('-----------Training Metrics-----------')
        print('Accuracy: {}'.format(accuracy_score(train_targs, train_preds)))
        print('F1: {}'.format(f1_score(train_targs, train_preds)))
        print('Precision: {}'.format(precision_score(train_targs, train_preds)))
        print('Recall: {}'.format(recall_score(train_targs, train_preds)))
        print('Confusion Matrix:')
        print(confusion_matrix(train_targs, train_preds))
        model.eval()
        
        with torch.no_grad():   
            for batch in eval_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)            

                #Getting metrics
                val_targs += list(batch["labels"].cpu().numpy())
                val_preds += list(predictions.cpu().numpy())

        print('-----------Validation Metrics-----------')
        print('Accuracy: {}'.format(accuracy_score(val_targs, val_preds)))
        print('F1: {}'.format(f1_score(val_targs, val_preds)))
        print('Precision: {}'.format(precision_score(val_targs, val_preds)))
        print('Recall: {}'.format(recall_score(val_targs, val_preds)))
        print('Confusion Matrix:')
        print(confusion_matrix(val_targs, val_preds))
        print('-' * 66)


## Train model

In [3]:
print("Loading data...")
df = load_data_and_rename_columns('data/processed/bert_data.pickle',
            rename_dict={"status":"labels"},
            drop_columns_dict={'bill_id','cong'},
            drop_congress_number=[115])

print("Combining list of sentences with [SEP] tokens...")
df['sentences'] = df['sentences'].apply(lambda x: '[SEP] '.join(x))

print("Training validation split...")
df_train, df_eval = train_validation_split(df, 0.8, 0.2, random_seed=3060)
# class_weights = list(float(x) for x in compute_class_weight('balanced', classes=df_train["labels"].unique(), y=df_train["labels"]))

print("Converting pandas df to dataset-object...")
dataset_train = create_dataset_object_from_pandas_dataframe(df_train, "__index_level_0__")
dataset_eval = create_dataset_object_from_pandas_dataframe(df_eval, "__index_level_0__")
dataset = datasets.DatasetDict({"train":dataset_train,"eval":dataset_eval})

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("Applying tokenizer")
dataset_tokenized = dataset.map(tokenizer_function, batched=True)
dataset_tokenized["train"] = dataset_tokenized["train"].remove_columns("sentences")
dataset_tokenized["eval"] = dataset_tokenized["eval"].remove_columns("sentences")
dataset_tokenized.set_format("torch") #converting lists to tensors

print("Creating dataloader with batches using a weighted sampler")
sampler = create_weighted_sampler(dataset_tokenized["train"])
train_dataloader = DataLoader(dataset_tokenized["train"], batch_size=6, drop_last=True, sampler=sampler)
eval_dataloader = DataLoader(dataset_tokenized["eval"], batch_size=6, drop_last = True)

print("Loading model")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

print("Training model")
#Arguments:
optimizer = AdamW(model.parameters(), lr=2e-5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
num_epochs = 4
lr_scheduler_function = "linear"
# class_weights = torch.tensor(device=device)
loss_function = torch.nn.CrossEntropyLoss()

#Training function
train(model, train_dataloader, eval_dataloader,loss_function, optimizer, num_epochs, lr_scheduler_function, device)

print("Saving model")
torch.save(model, "results/BERT_finetuned_congress_103_114_4_epochs_80pct_train_20_val.pt")

print("Done")

Loading data...
Combining list of sentences with [SEP] tokens...
Training validation split...
Converting pandas df to dataset-object...
Loading tokenizer...
Applying tokenizer


100%|██████████| 17/17 [00:10<00:00,  1.57ba/s]
100%|██████████| 5/5 [00:02<00:00,  1.73ba/s]


Creating dataloader with batches using a weighted sampler
Loading model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training model


 25%|██▌       | 2760/11040 [24:26<1:13:20,  1.88it/s, loss=0.00742] 

-----------Training Metrics-----------
Accuracy: 0.8270531400966183
F1: 0.8297467601949827
Precision: 0.8125509372453138
Recall: 0.8476861411393174
Confusion Matrix:
[[6717 1610]
 [1254 6979]]
-----------Validation Metrics-----------
Accuracy: 0.9647342995169083
F1: 0.6490384615384615
Precision: 0.48736462093862815
Recall: 0.9712230215827338
Confusion Matrix:
[[3859  142]
 [   4  135]]
------------------------------------------------------------------


 50%|█████     | 5520/11040 [50:06<47:23,  1.94it/s, loss=0.579]      

-----------Training Metrics-----------
Accuracy: 0.9798913043478261
F1: 0.9799554565701558
Precision: 0.9759021700035967
Recall: 0.9840425531914894
Confusion Matrix:
[[8087  201]
 [ 132 8140]]
-----------Validation Metrics-----------
Accuracy: 0.9881642512077294
F1: 0.8492307692307692
Precision: 0.7419354838709677
Recall: 0.9928057553956835
Confusion Matrix:
[[3953   48]
 [   1  138]]
------------------------------------------------------------------


 75%|███████▌  | 8280/11040 [1:15:46<23:37,  1.95it/s, loss=0.000563]

-----------Training Metrics-----------
Accuracy: 0.9914251207729469
F1: 0.9913719771539676
Precision: 0.9912515188335358
Recall: 0.9914924647544968
Confusion Matrix:
[[8260   72]
 [  70 8158]]
-----------Validation Metrics-----------
Accuracy: 0.9958937198067633
F1: 0.9423728813559323
Precision: 0.8910256410256411
Recall: 1.0
Confusion Matrix:
[[3984   17]
 [   0  139]]
------------------------------------------------------------------


100%|██████████| 11040/11040 [1:41:25<00:00,  1.94it/s, loss=0.00025]   

-----------Training Metrics-----------
Accuracy: 0.9982487922705314
F1: 0.9982305204710477
Precision: 0.997317727383565
Recall: 0.9991449859533407
Confusion Matrix:
[[8351   22]
 [   7 8180]]


100%|██████████| 11040/11040 [1:43:23<00:00,  1.78it/s, loss=0.00025]

-----------Validation Metrics-----------
Accuracy: 0.9992753623188406
F1: 0.9893238434163701
Precision: 0.9788732394366197
Recall: 1.0
Confusion Matrix:
[[3998    3]
 [   0  139]]
------------------------------------------------------------------
Saving model





Done


## Test fine-tuned model on test data set

In [4]:
#print("Loading model")
#model = torch.load("results/BERT_finetuned_congress_103_114_4_epochs_80pct_train_20_val.pt")

print("Loading data...")
df = load_data_and_rename_columns('data/processed/bert_data.pickle',
            rename_dict={"status":"labels"},
            drop_columns_dict={'bill_id','cong'},
            drop_congress_number=list(range(103,115)))

print("Combining list of sentences with [SEP] tokens...")
df['sentences'] = df['sentences'].apply(lambda x: '[SEP] '.join(x))

print("Converting pandas df to dataset-object...")
dataset_test = create_dataset_object_from_pandas_dataframe(df, "__index_level_0__")
dataset_test = datasets.DatasetDict({"test":dataset_test})

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("Applying tokenizer")
dataset_test_tokenized = dataset_test.map(tokenizer_function, batched=True)
dataset_test_tokenized["test"] = dataset_test_tokenized["test"].remove_columns("sentences")
dataset_test_tokenized.set_format("torch") #converting lists to tensors

print("Creating dataloader")
test_dataloader = DataLoader(dataset_test_tokenized["test"], batch_size=6, drop_last=True)

test_targs, test_preds = [], []


print("Calculating predictions of the voting behaviour of the 115th congress")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 
with torch.no_grad():   
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)            

        #Getting metrics
        test_targs += list(batch["labels"].cpu().numpy())
        test_preds += list(predictions.cpu().numpy())

    print('-----------Test Metrics-----------')
    print('Accuracy: {}'.format(accuracy_score(test_targs, test_preds)))
    print('F1: {}'.format(f1_score(test_targs, test_preds)))
    print('Precision: {}'.format(precision_score(test_targs, test_preds)))
    print('Recall: {}'.format(recall_score(test_targs, test_preds)))
    print('AUC {}'.format(roc_auc_score(test_targs, test_preds)))
    print('Avg. precision {}'.format(average_precision_score(test_targs, test_preds)))
    print('Confusion Matrix:')
    print(confusion_matrix(test_targs, test_preds))
    print('-' * 66)


Loading data...
Combining list of sentences with [SEP] tokens...
Converting pandas df to dataset-object...
Loading tokenizer...
Applying tokenizer


100%|██████████| 2/2 [00:00<00:00,  2.87ba/s]


Creating dataloader
Calculating predictions of the voting behaviour of the 115th congress
-----------Test Metrics-----------
Accuracy: 0.9666666666666667
F1: 0.25396825396825395
Precision: 0.8888888888888888
Recall: 0.14814814814814814
AUC 0.5737053425106523
Avg. precision 0.1643113562735312
Confusion Matrix:
[[1355    1]
 [  46    8]]
------------------------------------------------------------------
