# BERT

In [16]:
from transformers import logging, TextClassificationPipeline, AutoTokenizer, AutoConfig, TrainingArguments, Trainer, AutoModelForSequenceClassification, pipeline
import numpy as np
import pandas as pd
import sklearn.metrics
import datasets
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import huggingface_hub
import random
import spacy
from tqdm import tqdm
# Two-way mapping between integer class ids and the label names; passed to the
# model in finetune() and used to turn pipeline label strings back into ids.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
# Restrict transformers logging to error-level messages.
logging.set_verbosity_error()

In [2]:
# Load the metrics, tokenizer and spaCy pipeline shared by the helper functions
model_name = 'bert-base-cased'
metric = evaluate.load("f1")  # read by compute_metrics
# NOTE(review): the next two metrics are not referenced anywhere else in the visible notebook.
cf_metric = evaluate.load("BucketHeadP65/confusion_matrix")
classification_report_metric = evaluate.load("bstrai/classification_report")
tokenizer = AutoTokenizer.from_pretrained(model_name)  # module-level tokenizer read by the evaluation pipelines
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used by ner_mask to find entities

Using the latest cached version of the module from /home/mac9908/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/0ca73f6cf92ef5a268320c697f7b940d1030f8471714bffdb6856c641b818974 (last modified on Sun Apr 16 03:14:33 2023) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/mac9908/.cache/huggingface/modules/evaluate_modules/metrics/BucketHeadP65--confusion_matrix/254235a9ddfc4c72780abf742d0465b2f277b620321424b2c9e0cb665679e57d (last modified on Tue Apr 18 00:30:53 2023) since it couldn't be found locally at BucketHeadP65--confusion_matrix, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/mac9908/.cache/huggingface/modules/evaluate_modules/metrics/bstrai--classification_report/84690405a7a830f4f3f8c00ea1f9de9c2d4e4ccab3cb790c1a495a48055fb571 (last modified on Tue Apr 18 02:03:32 2023) since it couldn't be found locally a

In [17]:
# Results Table: one list per column; generate_evaluation_data appends one
# entry to every column for each evaluated experiment.
results_table = {'model': [], 'precision': [], 'recall': [], 'f1': [], 'accuracy': []}
# Util functions
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric` (weighted average).

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )
def tokenize_function(examples):
    """Tokenize `examples["comment"]` with the module-level tokenizer.

    Inputs are padded to and truncated at 512 tokens and returned as PyTorch tensors.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )
    return tokenizer(examples["comment"], **tokenizer_options)
def labelify(examples):
    """Replace the textual `label` of one example with its integer code.

    The lookup is case-insensitive: 'true' -> 1, 'false' -> 0. Any other value
    raises KeyError and leaves the example untouched.

    Args:
        examples: mapping with a string `label` entry; modified in place.

    Returns:
        The same mapping, with `label` now an int.
    """
    encoding = {'true': 1, 'false': 0}
    examples['label'] = encoding[examples['label'].lower()]
    return examples
def ner_mask(x):
    """Replace every LOC / GPE entity in `x['comment']` with '<LOCATION>'.

    Entities come from the module-level spaCy pipeline `nlp`. The example is
    modified in place and returned.
    """
    text = x['comment']
    location_labels = ('LOC', 'GPE')
    # Walk the entities back-to-front so earlier character offsets stay valid
    # after each substitution.
    for ent in reversed(nlp(text).ents):
        if ent.label_ not in location_labels:
            continue
        begin = ent.start_char
        finish = begin + len(ent.text)
        text = text[:begin] + '<LOCATION>' + text[finish:]
    x['comment'] = text
    return x

def load_data(filename, mask=False, seed=42, split=0.2):
    """Read a CSV of comments and return shuffled (train, test) datasets.

    Args:
        filename: path of the CSV; it must contain a `label` column (and a
            `comment` column when `mask` is set).
        mask: when true, run `ner_mask` over every row before splitting.
        seed: seed used for both the split and the subsequent shuffles.
        split: fraction of rows placed in the test set.

    Returns:
        A `(train, test)` tuple of `datasets.Dataset` objects. Each carries an
        integer `labels` column derived from `label`.
    """
    frame = pd.read_csv(filename)
    frame['labels'] = frame['label'].astype('int')
    dataset = datasets.Dataset.from_pandas(frame)
    if mask:
        dataset = dataset.map(ner_mask)
    print(dataset)
    parts = dataset.train_test_split(test_size=split, seed=seed)
    train_part = parts['train'].shuffle(seed=seed)
    test_part = parts['test'].shuffle(seed=seed)
    return train_part, test_part
# Train
class InfraTrainer(Trainer):
    """Trainer variant that uses a class-weighted cross-entropy loss.

    The weights put more emphasis on label id 1 than on label id 0.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (0.5567, 4.9114)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: when true, return `(loss, outputs)` instead of only the loss.
            num_items_in_batch: accepted so that Trainer versions which pass this
                keyword do not fail; it is not used by this loss.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Follow the device the logits live on instead of assuming 'cuda' / GPU 0,
        # so the loss also works on CPU or on a different GPU index.
        labels = labels.long().to(logits.device)
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
       
def finetune(train,
             test,
             model_name,
             tokenizer=None,
             freeze=True,
             batch_size=8,
             epochs=3,
             dropout=None,
             compute_loss=False,
             metric_for_best_model='f1',
             exp='BERT_mask'):
    """Fine-tune a two-class sequence classifier and save it to ``{exp}_infrastructure_ft``.

    Args:
        train: training dataset with `comment` and `labels` columns (plus the
            `id`, `Unnamed: 0` and `label` columns, which are dropped).
        test: evaluation dataset with the same columns.
        model_name: checkpoint name passed to `from_pretrained`.
        tokenizer: tokenizer matching `model_name`; loaded from `model_name`
            when omitted.
        freeze: when true, only parameters whose name contains 'classifier'
            are trained; everything else is frozen.
        batch_size: per-device batch size for training and evaluation.
        epochs: number of training epochs.
        dropout: optional value written to `config.hidden_dropout_prob` before
            the model is built.
        compute_loss: when true, train with `InfraTrainer` (weighted loss)
            instead of the stock `Trainer`.
        metric_for_best_model: metric used to pick the checkpoint reloaded at
            the end of training.
        exp: experiment name; the model is saved to ``{exp}_infrastructure_ft``.

    Raises:
        Any exception raised while tokenizing, building or training propagates
        unchanged after the cleanup has run.
    """
    import gc

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Bound before the try block so the cleanup below never trips over an
    # unbound name (which would hide the real error) when setup fails early.
    model = None
    trainer = None
    try:
        torch.cuda.empty_cache()

        def tokenize_function(examples):
            return tokenizer(examples["comment"], padding="max_length", truncation=True, return_tensors='pt', max_length=512)

        raw_columns = ['id', 'comment', 'Unnamed: 0', 'label']
        train = train.map(tokenize_function, batched=True, remove_columns=raw_columns)
        test = test.map(tokenize_function, batched=True, remove_columns=raw_columns)
        print(train)

        # Dropout has to be set on the config *before* the model is built;
        # changing the config of an already constructed model does not touch
        # its dropout layers. `is not None` lets an explicit 0.0 through.
        config = AutoConfig.from_pretrained(model_name, num_labels=2, label2id=label2id, id2label=id2label)
        if dropout is not None:
            config.hidden_dropout_prob = dropout
        model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

        if freeze:
            # Freezing means switching gradients OFF for everything except the
            # classification head (setting them to True leaves the whole model trainable).
            for name, param in model.named_parameters():
                if 'classifier' not in name:
                    param.requires_grad = False

        training_args = TrainingArguments(output_dir=f"{model_name}_checkpoints",
                                          evaluation_strategy="epoch",
                                          save_strategy="epoch",
                                          per_device_train_batch_size=batch_size,
                                          per_device_eval_batch_size=batch_size,
                                          load_best_model_at_end=True,
                                          metric_for_best_model=metric_for_best_model,
                                          num_train_epochs=epochs)

        trainer_cls = InfraTrainer if compute_loss else Trainer
        trainer = trainer_cls(
            model=model,
            args=training_args,
            train_dataset=train,
            eval_dataset=test,
            compute_metrics=compute_metrics
        )
        print(trainer.label_names)
        trainer.train()
        trainer.save_model(f'{exp}_infrastructure_ft')
    finally:
        print('Deleting and clearing GPU memory')
        del model
        del trainer
        gc.collect()
        # Release cached GPU blocks now that the model and trainer are gone.
        torch.cuda.empty_cache()
def _predict_labels(data, model_path):
    """Classify every comment in `data` with the model stored at `model_path`.

    Uses the module-level `tokenizer`, which therefore has to match the model.
    Runs on GPU 0 when CUDA is available and on CPU otherwise.

    Returns:
        A pandas DataFrame of `data` plus a `predict` column holding the
        predicted class id (mapped through `label2id`).
    """
    pipe = pipeline(task='text-classification',
                    model=model_path,
                    tokenizer=tokenizer,
                    padding="max_length",
                    truncation=True,
                    max_length=512,
                    device=0 if torch.cuda.is_available() else -1)

    def predict(x):
        x['predict'] = pipe(x['comment'])[0]['label']
        return x

    res = data.map(predict).to_pandas()
    res['predict'] = res['predict'].map(label2id)
    return res


def eval(data, model_path):
    """Print classification metrics and plot the confusion matrix for a saved model.

    NOTE: this name shadows Python's built-in `eval` inside the notebook.

    Args:
        data: dataset with `comment` and integer `labels` columns.
        model_path: directory or hub id of the fine-tuned model.

    Returns:
        The prediction DataFrame (input columns plus `predict`).
    """
    res = _predict_labels(data, model_path)
    # Every metric is computed against the integer `labels` column so that the
    # references always share the type of the predictions.
    y_true, y_pred = res['labels'], res['predict']
    print(f"Precision Score: {sklearn.metrics.precision_score(y_true, y_pred)}")
    print(f"Accuracy Score: {sklearn.metrics.accuracy_score(y_true, y_pred)}")
    print(f"Balanced Accuracy Score: {sklearn.metrics.balanced_accuracy_score(y_true, y_pred)}")
    print(f"F1: {sklearn.metrics.f1_score(y_true, y_pred)}")
    print(f"Classification Report:\n {sklearn.metrics.classification_report(y_true, y_pred)}")
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
    disp.plot()
    plt.show()
    return res


def eval_variance(data, model_path):
    """Return macro-averaged precision, recall, F1 and the accuracy for a saved model.

    Args:
        data: dataset with `comment` and integer `labels` columns.
        model_path: directory or hub id of the fine-tuned model.

    Returns:
        Tuple `(precision, recall, f1, accuracy)`; the first three are the
        'macro avg' entries of sklearn's classification report.
    """
    res = _predict_labels(data, model_path)
    rep = sklearn.metrics.classification_report(res['labels'], res['predict'], output_dict=True)
    macro = rep['macro avg']
    return macro['precision'], macro['recall'], macro['f1-score'], rep['accuracy']

def generate_evaluation_data(test, exp='BERT_mask', n=5):
    """Evaluate a saved model on `n` random subsets and log the summary row.

    For each seed 0..n-1 the test set is split 70/30 and the 70% part is scored
    with `eval_variance` against the model in ``{exp}_infrastructure_ft``. One
    row is then appended to the module-level `results_table`; each metric is
    stored as the string "mean +- variance" (`np.var`, not the standard deviation).

    Args:
        test: dataset to subsample from.
        exp: experiment name, used for both the model directory and the row label.
        n: number of subsamples.
    """
    model_dir = f'{exp}_infrastructure_ft'
    scores = {'precision': [], 'recall': [], 'f1': [], 'accuracy': []}
    for seed in range(n):
        subset = test.train_test_split(test_size=0.3, seed=seed)['train']
        precision, recall, f1, accuracy = eval_variance(subset, model_dir)
        scores['precision'].append(precision)
        scores['recall'].append(recall)
        scores['f1'].append(f1)
        scores['accuracy'].append(accuracy)
    results_table['model'].append(exp)
    for column, values in scores.items():
        results_table[column].append(f'{np.mean(values)} +- {np.var(values)}')

In [23]:
# Exp1 BERT_mask
train, test = load_data('infrastructure.csv', mask=True, seed=42, split=0.3)
# freeze=False: every parameter is trained (full fine-tuning).
finetune(train,
         test,
         'bert-base-cased',
         AutoTokenizer.from_pretrained('bert-base-cased'),
         compute_loss=True, freeze=False,
         epochs=5, batch_size=32,
         exp='BERT_mask')

Map:   0%|          | 0/2662 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/1863 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1863
})
['labels']
{'eval_loss': 0.4622590243816376, 'eval_f1': 0.8488055787991025, 'eval_runtime': 4.15, 'eval_samples_per_second': 192.53, 'eval_steps_per_second': 3.133, 'epoch': 1.0}
{'eval_loss': 0.3863871395587921, 'eval_f1': 0.8681950771055896, 'eval_runtime': 4.1658, 'eval_samples_per_second': 191.8, 'eval_steps_per_second': 3.121, 'epoch': 2.0}
{'eval_loss': 0.4827236235141754, 'eval_f1': 0.913681962399193, 'eval_runtime': 4.1752, 'eval_samples_per_second': 191.366, 'eval_steps_per_second': 3.114, 'epoch': 3.0}
{'eval_loss': 0.3978336751461029, 'eval_f1': 0.9155061830671034, 'eval_runtime': 4.1892, 'eval_samples_per_second': 190.73, 'eval_steps_per_second': 3.103, 'epoch': 4.0}
{'eval_loss': 0.49780669808387756, 'eval_f1': 0.9239084900632921, 'eval_runtime': 4.197, 'eval_samples_per_second': 190.374, 'eval_steps_per_second': 3.097, 'epoch': 5.0}
{'train_runtime': 151.5062, 'train

In [24]:
# Exp2 BERT_nomask
train, test = load_data('infrastructure.csv', mask=False, seed=42, split=0.3)
# freeze=False: every parameter is trained (full fine-tuning).
finetune(train,
         test,
         'bert-base-cased',
         AutoTokenizer.from_pretrained('bert-base-cased'),
         compute_loss=True, freeze=False,
         epochs=5, batch_size=32,
         exp='BERT_nomask')

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/1863 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1863
})
['labels']
{'eval_loss': 0.46606793999671936, 'eval_f1': 0.8834685690095627, 'eval_runtime': 4.1933, 'eval_samples_per_second': 190.544, 'eval_steps_per_second': 3.1, 'epoch': 1.0}
{'eval_loss': 0.34395670890808105, 'eval_f1': 0.8958106772120613, 'eval_runtime': 4.2061, 'eval_samples_per_second': 189.96, 'eval_steps_per_second': 3.091, 'epoch': 2.0}
{'eval_loss': 0.4619446098804474, 'eval_f1': 0.9156214772011113, 'eval_runtime': 4.2093, 'eval_samples_per_second': 189.819, 'eval_steps_per_second': 3.088, 'epoch': 3.0}
{'eval_loss': 0.4261991083621979, 'eval_f1': 0.9133312665296454, 'eval_runtime': 4.2119, 'eval_samples_per_second': 189.701, 'eval_steps_per_second': 3.086, 'epoch': 4.0}
{'eval_loss': 0.5973544120788574, 'eval_f1': 0.9228613614154083, 'eval_runtime': 4.2044, 'eval_samples_per_second': 190.038, 'eval_steps_per_second': 3.092, 'epoch': 5.0}
{'train_runtime': 152.8054, 

In [25]:
model_name = 'roberta-large'
# Exp3 ROBERTA_mask
train, test = load_data('infrastructure.csv', mask=True, seed=42, split=0.3)
# freeze=False: every parameter is trained (full fine-tuning).
finetune(train,
         test,
         'roberta-large',
         AutoTokenizer.from_pretrained('roberta-large'),
         compute_loss=True, freeze=False,
         epochs=5, batch_size=16,
         exp='ROBERTA_mask')

Map:   0%|          | 0/2662 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/1863 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1863
})
['labels']
{'eval_loss': 0.4948447048664093, 'eval_f1': 0.8169628087627483, 'eval_runtime': 13.365, 'eval_samples_per_second': 59.783, 'eval_steps_per_second': 1.871, 'epoch': 1.0}
{'eval_loss': 1.2415260076522827, 'eval_f1': 0.8409850499756065, 'eval_runtime': 13.5512, 'eval_samples_per_second': 58.962, 'eval_steps_per_second': 1.845, 'epoch': 2.0}
{'eval_loss': 0.5171976089477539, 'eval_f1': 0.8032947883418865, 'eval_runtime': 13.4062, 'eval_samples_per_second': 59.599, 'eval_steps_per_second': 1.865, 'epoch': 3.0}
{'eval_loss': 0.6125231385231018, 'eval_f1': 0.9110265875944923, 'eval_runtime': 13.5691, 'eval_samples_per_second': 58.884, 'eval_steps_per_second': 1.842, 'epoch': 4.0}
{'eval_loss': 0.6460419297218323, 'eval_f1': 0.9284772281633286, 'eval_runtime': 13.4117, 'eval_samples_per_second': 59.575, 'eval_steps_per_second': 1.864, 'epoch': 5.0}
{'train_runtime': 519.3902, 'train_samples_per

In [34]:
# Exp4 ROBERTA_nomask
train, test = load_data('infrastructure.csv', mask=False, seed=42, split=0.3)
# freeze=False: every parameter is trained (full fine-tuning).
finetune(train,
         test,
         'roberta-large',
         AutoTokenizer.from_pretrained('roberta-large'),
         compute_loss=True, freeze=False,
         epochs=5, batch_size=16,
         exp='ROBERTA_nomask')

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/1863 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1863
})
['labels']
{'eval_loss': 0.6356980204582214, 'eval_f1': 0.8356711311474047, 'eval_runtime': 13.6784, 'eval_samples_per_second': 58.413, 'eval_steps_per_second': 1.828, 'epoch': 1.0}
{'eval_loss': 0.40916019678115845, 'eval_f1': 0.8435921086693382, 'eval_runtime': 13.3727, 'eval_samples_per_second': 59.749, 'eval_steps_per_second': 1.869, 'epoch': 2.0}
{'eval_loss': 0.47223132848739624, 'eval_f1': 0.8754515399442182, 'eval_runtime': 13.4114, 'eval_samples_per_second': 59.576, 'eval_steps_per_second': 1.864, 'epoch': 3.0}
{'eval_loss': 0.4746869206428528, 'eval_f1': 0.8753101896990432, 'eval_runtime': 13.445, 'eval_samples_per_second': 59.427, 'eval_steps_per_second': 1.859, 'epoch': 4.0}
{'eval_loss': 0.7546632885932922, 'eval_f1': 0.9190626224924059, 'eval_runtime': 13.4188, 'eval_samples_per_second': 59.543, 'eval_steps_per_second': 1.863, 'epoch': 5.0}
{'train_runtime': 520.2512, 'train_samples_p

In [35]:
# Results Table
results_table = {'model': [], 'precision': [], 'recall': [], 'f1': [], 'accuracy': []}
# (checkpoint name, [(experiment name, mask locations?), ...]) in evaluation order
evaluation_plan = [
    ('bert-base-cased', [('BERT_mask', True), ('BERT_nomask', False)]),
    ('roberta-large', [('ROBERTA_mask', True), ('ROBERTA_nomask', False)]),
]
for model_name, experiments in evaluation_plan:
    # The evaluation pipeline reads the module-level tokenizer, so it has to
    # be swapped whenever the checkpoint family changes.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    for exp_name, use_mask in experiments:
        train, test = load_data('infrastructure.csv', mask=use_mask, seed=42, split=0.3)
        generate_evaluation_data(test, exp=exp_name)


Map:   0%|          | 0/2662 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/2662 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Unnamed: 0,model,precision,recall,f1,accuracy
0,BERT_mask,0.804009962289174 +- 0.0002091038893005009,0.8278386437627937 +- 0.00021321314028287236,0.8148501939347932 +- 0.00012478094556835366,0.9262969588550984 +- 1.8433120733740488e-05
1,BERT_nomask,0.793770596227169 +- 8.043359276791313e-05,0.8218811306448716 +- 0.00018427068156368125,0.8065604898528809 +- 5.373482574783937e-05,0.9223613595706619 +- 1.1008669327094961e-05
2,ROBERTA_mask,0.8222065523723666 +- 1.2891191958082072e-05,0.8270583652913046 +- 0.00038773415532411575,0.8240957544142992 +- 8.97521857147603e-05,0.9320214669051878 +- 3.45621013757637e-05
3,ROBERTA_nomask,0.7783101907180043 +- 8.005407560543732e-05,0.8268708083869303 +- 0.0001960106999305339,0.7992985386575718 +- 3.1725154171486e-05,0.9162790697674419 +- 1.8433120733740857e-05


In [37]:
# Render the collected per-experiment metrics as a table.
pd.DataFrame.from_dict(results_table)

Unnamed: 0,model,precision,recall,f1,accuracy
0,BERT_mask,0.804009962289174 +- 0.0002091038893005009,0.8278386437627937 +- 0.00021321314028287236,0.8148501939347932 +- 0.00012478094556835366,0.9262969588550984 +- 1.8433120733740488e-05
1,BERT_nomask,0.793770596227169 +- 8.043359276791313e-05,0.8218811306448716 +- 0.00018427068156368125,0.8065604898528809 +- 5.373482574783937e-05,0.9223613595706619 +- 1.1008669327094961e-05
2,ROBERTA_mask,0.8222065523723666 +- 1.2891191958082072e-05,0.8270583652913046 +- 0.00038773415532411575,0.8240957544142992 +- 8.97521857147603e-05,0.9320214669051878 +- 3.45621013757637e-05
3,ROBERTA_nomask,0.7783101907180043 +- 8.005407560543732e-05,0.8268708083869303 +- 0.0001960106999305339,0.7992985386575718 +- 3.1725154171486e-05,0.9162790697674419 +- 1.8433120733740857e-05


In [36]:
# Results Table
results_table = {'model': [], 'precision': [], 'recall': [], 'f1': [], 'accuracy': []}
# (checkpoint name, [(experiment name, mask locations?), ...]) in evaluation order
evaluation_plan = [
    ('bert-base-cased', [('BERT_mask', True), ('BERT_nomask', False)]),
    ('roberta-large', [('ROBERTA_mask', True), ('ROBERTA_nomask', False)]),
]
for model_name, experiments in evaluation_plan:
    # The evaluation pipeline reads the module-level tokenizer, so it has to
    # be swapped whenever the checkpoint family changes.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    for exp_name, use_mask in experiments:
        train, test = load_data('infrastructure.csv', mask=use_mask, seed=42, split=0.3)
        generate_evaluation_data(test, exp=exp_name)
pd.DataFrame.from_dict(results_table)

Map:   0%|          | 0/2662 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/2662 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Dataset({
    features: ['Unnamed: 0', 'id', 'comment', 'label', 'labels'],
    num_rows: 2662
})


Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Map:   0%|          | 0/559 [00:00<?, ? examples/s]



Unnamed: 0,model,precision,recall,f1,accuracy
0,BERT_mask,0.804009962289174 +- 0.0002091038893005009,0.8278386437627937 +- 0.00021321314028287236,0.8148501939347932 +- 0.00012478094556835366,0.9262969588550984 +- 1.8433120733740488e-05
1,BERT_nomask,0.793770596227169 +- 8.043359276791313e-05,0.8218811306448716 +- 0.00018427068156368125,0.8065604898528809 +- 5.373482574783937e-05,0.9223613595706619 +- 1.1008669327094961e-05
2,ROBERTA_mask,0.8222065523723666 +- 1.2891191958082072e-05,0.8270583652913046 +- 0.00038773415532411575,0.8240957544142992 +- 8.97521857147603e-05,0.9320214669051878 +- 3.45621013757637e-05
3,ROBERTA_nomask,0.7783101907180043 +- 8.005407560543732e-05,0.8268708083869303 +- 0.0001960106999305339,0.7992985386575718 +- 3.1725154171486e-05,0.9162790697674419 +- 1.8433120733740857e-05


## Experiment 1 Masked Locations

In [None]:
# The results cells above leave model_name / tokenizer pointing at RoBERTa;
# reset them for the BERT experiments.
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Experiment 1 works on location-masked comments, hence mask=True.
train, test = load_data('infrastructure.csv', mask=True, seed=42, split=0.3)
print(f"Distribution of train set: {train.to_pandas().groupby('labels')['labels'].count()}")
print(f"Distribution of test set: {test.to_pandas().groupby('labels')['labels'].count()}")
print(train['comment'][6])

In [None]:
# Pass the tokenizer that matches model_name and name the experiment so the
# model is saved to 'BERT_mask_infrastructure_ft'.
finetune(train, test, model_name, AutoTokenizer.from_pretrained(model_name),
         freeze=True, epochs=5, batch_size=32, exp='BERT_mask')

In [None]:
# Uses the defaults exp='BERT_mask' and n=5; appends one row to results_table.
generate_evaluation_data(test)

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the BERT_mask model.
train_pred = eval(train, 'BERT_mask_infrastructure_ft')

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the BERT_mask model.
test_pred = eval(test, 'BERT_mask_infrastructure_ft')

In [None]:
# True positives: rows predicted as class 1 whose reference label is also 1.
is_true_positive = (test_pred['predict'] == 1) & (test_pred['labels'] == 1)
test_pred[is_true_positive]

## Experiment 2 No Masked Locations

In [None]:
# Unmasked data for Experiment 2; show the class balance of each split.
train, test = load_data('infrastructure.csv', mask=False, seed=42, split=0.3)
for split_name, split_data in (('train', train), ('test', test)):
    label_counts = split_data.to_pandas().groupby('labels')['labels'].count()
    print(f"Distribution of {split_name} set: {label_counts}")
print(train['comment'][6])

In [None]:
# Pass the tokenizer that matches model_name and name the experiment so the
# model is saved to 'BERT_nomask_infrastructure_ft'.
finetune(train, test, model_name, AutoTokenizer.from_pretrained(model_name),
         freeze=False, epochs=5, batch_size=32, exp='BERT_nomask')

In [None]:
# Append the mean +- variance metrics of the BERT_nomask model to results_table.
generate_evaluation_data(test, exp='BERT_nomask')

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the BERT_nomask model.
test_pred = eval(test, 'BERT_nomask_infrastructure_ft')

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the BERT_nomask model.
train_pred = eval(train, 'BERT_nomask_infrastructure_ft')

# ROBERTA

In [None]:
# Switch the module-level checkpoint name and tokenizer to RoBERTa for the cells below.
model_name = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Experiment 1 Masked Locations

In [None]:
# Location-masked data for Experiment 1; show the class balance of each split.
train, test = load_data('infrastructure.csv', mask=True, seed=42, split=0.3)
for split_name, split_data in (('train', train), ('test', test)):
    label_counts = split_data.to_pandas().groupby('labels')['labels'].count()
    print(f"Distribution of {split_name} set: {label_counts}")
print(train['comment'][6])

In [None]:
# Pass the tokenizer that matches model_name and name the experiment so the
# model is saved to 'ROBERTA_mask_infrastructure_ft'.
finetune(train, test, model_name, AutoTokenizer.from_pretrained(model_name),
         compute_loss=True, freeze=True, epochs=5, batch_size=16, exp='ROBERTA_mask')

In [None]:
# Append the mean +- variance metrics of the ROBERTA_mask model to results_table.
generate_evaluation_data(test, exp='ROBERTA_mask')

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the ROBERTA_mask model.
test_pred = eval(test, 'ROBERTA_mask_infrastructure_ft')

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the ROBERTA_mask model.
train_pred = eval(train, 'ROBERTA_mask_infrastructure_ft')

## Experiment 2 No Masked Locations

In [None]:
# RoBERTa checkpoint with unmasked data; show the class balance of each split.
model_name = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
train, test = load_data('infrastructure.csv', mask=False, seed=42, split=0.3)
for split_name, split_data in (('train', train), ('test', test)):
    label_counts = split_data.to_pandas().groupby('labels')['labels'].count()
    print(f"Distribution of {split_name} set: {label_counts}")
print(train['comment'][6])

In [None]:
# Pass the tokenizer that matches model_name and name the experiment so the
# model is saved to 'ROBERTA_nomask_infrastructure_ft'.
finetune(train, test, model_name, AutoTokenizer.from_pretrained(model_name),
         compute_loss=True, freeze=False, epochs=5, batch_size=16, exp='ROBERTA_nomask')

In [None]:
# Append the mean +- variance metrics of the ROBERTA_nomask model to results_table.
generate_evaluation_data(test, exp='ROBERTA_nomask')

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the ROBERTA_nomask model.
test_pred = eval(test, 'ROBERTA_nomask_infrastructure_ft')

In [None]:
# Score the ROBERTA_nomask model on five seeded 70% subsamples of the test set.
# finetune saves to '{exp}_infrastructure_ft', so that is the path to load.
mp, mr, mf, ma = [], [], [], []
for i in range(5):
    c = test.train_test_split(test_size=0.3, seed=i)
    precision, recall, f1, accuracy = eval_variance(c['train'], 'ROBERTA_nomask_infrastructure_ft')
    mp.append(precision)
    mr.append(recall)
    mf.append(f1)
    ma.append(accuracy)
print(mp)

In [None]:
# Variance of precision, recall, F1 and accuracy across the subsamples, in that order.
for metric_values in (mp, mr, mf, ma):
    print(np.var(metric_values))

In [None]:
# finetune saves to '{exp}_infrastructure_ft', so load the ROBERTA_nomask model.
train_pred = eval(train, 'ROBERTA_nomask_infrastructure_ft')