# Condemnation Classifier

This notebook holds the code for

- loading the condemnation dataset using our dataset_loading_script
- preprocessing the dataset which only includes encoding at this stage
- running a 10 fold cv on the dataset

**Note no hyperparams were tuned!**

### Imports and Config

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

In [2]:
checkpoint = "bert-base-uncased"
# checkpoint = "roberta-base"
# checkpoint = "bert-large-uncased"

### Loading the data

In [3]:
raw_dataset = load_dataset("my_dataset_loading_script", "condemnation")

Reusing dataset my_dataset_loading_script (/home/geev/.cache/huggingface/datasets/my_dataset_loading_script/condemnation/1.1.0/ef891eb2986445dbe69883df48d98d7039da591e0c35cb211978c71951a3c83e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1344
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 336
    })
})

In [5]:
raw_train_dataset = raw_dataset["train"]
raw_train_dataset[0]

{'text': 'As multiple sexual allegations come to light with people like <OTHER TARGET 1>, NPR’s -JOHN DOE-, and Michael Fallon, just to name a few...',
 'label': 1}

In [6]:
raw_train_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['no_condemnation', 'condemnation'], id=None)}

### Preprocessing & Evaluation

In [7]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /home/geev/.cache/huggingface/datasets/my_dataset_loading_script/condemnation/1.1.0/ef891eb2986445dbe69883df48d98d7039da591e0c35cb211978c71951a3c83e/cache-47313ceb021f21df.arrow
Loading cached processed dataset at /home/geev/.cache/huggingface/datasets/my_dataset_loading_script/condemnation/1.1.0/ef891eb2986445dbe69883df48d98d7039da591e0c35cb211978c71951a3c83e/cache-b1e535fe82b66242.arrow


In [8]:
def evaluate(trainer, tokenized_dataset):
    predictions = trainer.predict(tokenized_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    def get_metrics(y_pred, y_true):
        metrics ={}
        metrics["accuracy"] = accuracy_score(y_true, y_pred)
        metrics["precision"] = precision_score(y_true, y_pred)
        metrics["recall"] = recall_score(y_true, y_pred)
        metrics["f1"] = f1_score(y_true, y_pred)
        return metrics
    return get_metrics(preds, predictions.label_ids)


### Training with CV

In [9]:
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]
n_split = 10
sss = StratifiedShuffleSplit(n_splits=n_split, test_size=1/n_split, random_state=0)
sss.get_n_splits(train_dataset, train_dataset["label"])

10

In [10]:
torch.cuda.empty_cache()

In [11]:
performance = {"test":{}, "val":{}}
fold = 1
for train_index, val_index in sss.split(train_dataset, train_dataset["label"]):
    cv_train_dataset = Dataset.from_dict(train_dataset[train_index])
    
    training_args = TrainingArguments("test_trainer")
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    
    trainer = Trainer(
    model,
    training_args,
    train_dataset = cv_train_dataset,
    data_collator = data_collator,
    tokenizer = tokenizer,
    )
    
    trainer.train()
    cv_val_dataset = Dataset.from_dict(train_dataset[val_index])

    performance["val"]["fold "+str(fold)] = evaluate(trainer, cv_val_dataset)
    performance["test"]["fold "+str(fold)] = evaluate(trainer, test_dataset)

    print("saving a model! for fold {}".format(fold))
    trainer.save_model("./models/hf/hf_fold_{}_model.")
    torch.save(model, "./models/fold_{}_model.p".format(fold))
    fold+=1
    del model
    del trainer
    del cv_train_dataset
    del cv_val_dataset
    break
    torch.cuda.empty_cache()
    
    

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 135
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 336
  Batch size = 8
Saving model checkpoint to ./models/hf/hf_fold_{}_model.
Configuration saved in ./models/hf/hf_fold_{}_model./config.json


saving a model! for fold 1


Model weights saved in ./models/hf/hf_fold_{}_model./pytorch_model.bin
tokenizer config file saved in ./models/hf/hf_fold_{}_model./tokenizer_config.json
Special tokens file saved in ./models/hf/hf_fold_{}_model./special_tokens_map.json


In [12]:
performance

{'test': {'fold 1': {'accuracy': 0.7738095238095238,
   'precision': 0.7848101265822784,
   'recall': 0.8815165876777251,
   'f1': 0.8303571428571428}},
 'val': {'fold 1': {'accuracy': 0.8148148148148148,
   'precision': 0.8061224489795918,
   'recall': 0.9294117647058824,
   'f1': 0.8633879781420767}}}

In [17]:
import pickle
pickle_dump_file = "10_cv_"+checkpoint+"_performance.p"
pickle.dump(performance, open(pickle_dump_file, "wb"))

In [18]:
pickle.load(open(pickle_dump_file, "rb"))

{'test': {'fold 1': {'accuracy': 0.8184523809523809,
   'precision': 0.8537735849056604,
   'recall': 0.8578199052132701,
   'f1': 0.8557919621749408},
  'fold 2': {'accuracy': 0.7678571428571429,
   'precision': 0.7878787878787878,
   'recall': 0.8625592417061612,
   'f1': 0.823529411764706},
  'fold 3': {'accuracy': 0.8065476190476191,
   'precision': 0.811965811965812,
   'recall': 0.9004739336492891,
   'f1': 0.8539325842696629},
  'fold 4': {'accuracy': 0.7678571428571429,
   'precision': 0.821256038647343,
   'recall': 0.8056872037914692,
   'f1': 0.8133971291866028},
  'fold 5': {'accuracy': 0.7857142857142857,
   'precision': 0.8293838862559242,
   'recall': 0.8293838862559242,
   'f1': 0.8293838862559242},
  'fold 6': {'accuracy': 0.7797619047619048,
   'precision': 0.8215962441314554,
   'recall': 0.8293838862559242,
   'f1': 0.8254716981132075},
  'fold 7': {'accuracy': 0.7678571428571429,
   'precision': 0.7878787878787878,
   'recall': 0.8625592417061612,
   'f1': 0.823529

In [None]:
def agg_performance(performance):
    
    agg_performance = {}
    for key in performance:
        agg_performance[key] = {}
        for metric in performance[key]["fold 1"]:
            metric_val_list = [performance[key][fold][metric] for fold in performance[key]]
            agg_performance[key]["avg_"+metric] = "{:.2f}".format(np.mean(metric_val_list))
            agg_performance[key]["std_"+metric] = "{:.2f}".format(np.std(metric_val_list))
    return agg_performance
        
import pprint
pprint.pprint(agg_performance(performance)        )