# Setup

In [None]:
# Install the Hugging Face libraries imported in the next cell (no versions are pinned).
%pip install transformers datasets evaluate

In [None]:
import inspect

import numpy as np
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [None]:
from huggingface_hub import notebook_login
# notebook_login()

# Generate dataset
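Usually not needed: this section builds the labeled splits and pushes them to the Hub as `Yusuf5/OpenCaselistLI`. To train on the already-published dataset, skip ahead to "Load Dataset".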

In [None]:
# Raw corpus: the 'train' split of the Yusuf5/OpenCaselistTMP dataset.
openCaselist = load_dataset("Yusuf5/OpenCaselistTMP", split='train')

In [None]:
def headerChecker(search: str, headers: list[str]):
  """Build a row predicate that looks for `search` in the given header columns.

  Args:
    search: Substring to look for. Matching is case-insensitive: both the
      search term and the column text are lower-cased before comparing.
    headers: Names of the row fields to inspect.

  Returns:
    A function `check(row) -> bool` that is True when any non-empty header
    field of `row` contains the search term.
  """
  # Normalise once; the column text is lower-cased below, so an unnormalised
  # term containing uppercase characters could never match.
  needle = search.lower()

  def check(row):
    # Falsy fields (None or '') are skipped before calling .lower().
    return any(needle in row[header].lower() for header in headers if row[header])
  return check

In [None]:
# Header columns whose text is scanned for the keywords.
searchHeaders = ['pocket', 'hat', 'block']

# One filter pass over the corpus per keyword: first 'link', then 'impact'.
linkRows, impactRows = (
    openCaselist.filter(headerChecker(keyword, searchHeaders))
    for keyword in ('link', 'impact')
)

Filter:   0%|          | 0/4064137 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4064137 [00:00<?, ? examples/s]

In [None]:
from collections import Counter

# Number of keyword-matching rows per bucket, for each keyword.
bucketLinkCounts = Counter(linkRows['bucketId'])
bucketImpactCounts = Counter(impactRows['bucketId'])

# Counter subtraction keeps only strictly positive differences, so the keys
# that survive are the buckets where one keyword outnumbers the other.
linkBuckets = set(bucketLinkCounts - bucketImpactCounts)
impactBuckets = set(bucketImpactCounts - bucketLinkCounts)
len(linkBuckets), len(impactBuckets)

(69211, 58844)

In [None]:
# Inside link-majority buckets: 'link' matches as a fraction of all
# 'link' + 'impact' matches.
sameSum = sum(map(bucketLinkCounts.__getitem__, linkBuckets))
diffSum = sum(map(bucketImpactCounts.__getitem__, linkBuckets))
sameSum / (sameSum + diffSum)

0.9693250970637638

In [None]:
# Inside impact-majority buckets: 'impact' matches as a fraction of all
# 'impact' + 'link' matches.
sameSum = sum(map(bucketImpactCounts.__getitem__, impactBuckets))
diffSum = sum(map(bucketLinkCounts.__getitem__, impactBuckets))
sameSum / (sameSum + diffSum)

0.9679680738532551

In [None]:
def toLabeled(row):
  """Assign the link/impact class of a row from the bucket it belongs to.

  Returns {'label': 0} when the row's bucketId is in linkBuckets,
  {'label': 1} when it is in impactBuckets, and {'label': None} otherwise.
  The column is named 'label' because the filter applied right after the map
  reads row['label']; writing it under any other key leaves that lookup
  without a column to read.
  """
  bucket = row['bucketId']
  if bucket in linkBuckets:
    label = 0
  elif bucket in impactBuckets:
    label = 1
  else:
    label = None
  return {'label': label}

In [None]:
# Label every row, then keep only rows that have both a label and a tag.
# The filter reads row['label'], so the mapped rows must expose a 'label' column.
labeled = openCaselist.map(toLabeled).filter(
    lambda row: not (row['label'] is None or row['tag'] is None)
)

In [None]:
# Group row indices by their (tag, bucketId) pair so that all rows sharing a
# pair can later be assigned to the same split.
bucketTags = {}
for i, key in enumerate(zip(labeled['tag'], labeled['bucketId'])):
  bucketTags.setdefault(key, []).append(i)

In [None]:
# Split at the (tag, bucketId) level: every row sharing a pair lands in the
# same split. The pairs are shuffled with a fixed seed. Iterating a set of
# string-containing tuples depends on per-process hash randomisation, so the
# previous list(set(...)) ordering gave different splits on every kernel restart.
SPLIT_SEED = 42
bucketTagSet = list(bucketTags)  # dict keys: deterministic first-occurrence order
shuffledOrder = np.random.default_rng(SPLIT_SEED).permutation(len(bucketTagSet))
bucketTagSet = [bucketTagSet[j] for j in shuffledOrder]

# 80% / 10% / 10% of the pairs go to train / validate / test.
trainEnd = int(len(bucketTagSet) * 0.8)
validateEnd = int(len(bucketTagSet) * 0.9)

trainIds = [i for key in bucketTagSet[:trainEnd] for i in bucketTags[key]]
validateIds = [i for key in bucketTagSet[trainEnd:validateEnd] for i in bucketTags[key]]
testIds = [i for key in bucketTagSet[validateEnd:] for i in bucketTags[key]]
len(trainIds), len(validateIds), len(testIds)

(1048283, 131331, 130552)

In [None]:
# Turn the three index lists into datasets and expose the tag column as 'text'.
splits = DatasetDict({
    splitName: labeled.select(rowIds)
    for splitName, rowIds in (
        ('train', trainIds),
        ('validate', validateIds),
        ('test', testIds),
    )
}).rename_columns({'tag': 'text'})

In [None]:
# Publish the train/validate/test splits to the Hub repo loaded in "Load Dataset".
# NOTE(review): presumably requires an authenticated session (notebook_login is
# commented out in Setup) — confirm before running.
splits.push_to_hub('Yusuf5/OpenCaselistLI')

# Load Dataset

In [None]:
# Load the published link/impact dataset; the splits used below are 'train' and 'test'.
dataset = load_dataset('Yusuf5/OpenCaselistLI')

In [None]:
# Tokenizer matching the distilbert-base-uncased checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def preprocess(examples):
  """Tokenize the 'text' field of `examples` with truncation enabled."""
  texts = examples['text']
  return tokenizer(texts, truncation=True)

In [None]:
# Apply preprocess to every split; batched=True hands it batches of examples at a time.
tokenized_dataset = dataset.map(preprocess, batched=True)

In [None]:
# Accuracy metric loaded through `evaluate`; used by compute_metrics below.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  """Score an evaluation pass.

  `eval_pred` unpacks into (per-class scores, reference labels). The index of
  the highest score along axis 1 is taken as the predicted class, and the
  result of accuracy.compute on predictions vs. references is returned.
  """
  scores, references = eval_pred
  predicted_ids = np.argmax(scores, axis=1)
  return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Readable names for the two class ids, plus the inverse lookup.
id2label = dict(enumerate(("LINK", "IMPACT")))
label2id = {name: class_id for class_id, name in id2label.items()}

# Train

## Model

In [None]:
# Load distilbert-base-uncased for sequence classification with two labels,
# passing the id/name mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

In [None]:
# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old spelling. Use whichever keyword the installed
# TrainingArguments accepts so the cell runs on both old and new versions.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# One epoch of fine-tuning; evaluate and checkpoint every 10000 steps and
# reload the best checkpoint once training finishes.
training_args = TrainingArguments(
    output_dir="OpenCaselistLIClassifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    load_best_model_at_end=True,
    # push_to_hub=True,
    **{eval_strategy_arg: "steps"},
)

In [None]:
# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only know `tokenizer`. Use whichever the installed Trainer accepts.
tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # Evaluate on the validation split, not the test split: training_args sets
    # load_best_model_at_end=True, so these scores pick the final checkpoint.
    # Using "test" here would leak the test set into model selection.
    eval_dataset=tokenized_dataset["validate"],
    compute_metrics=compute_metrics,
    **{tokenizer_arg: tokenizer},
)

## Train

In [None]:
# Run fine-tuning with the settings in training_args (evaluation and
# checkpointing every 10000 steps).
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
10000,0.3434,0.396612,0.824529
20000,0.2834,0.389014,0.837976
30000,0.2534,0.377751,0.853742
40000,0.2384,0.383431,0.856108
50000,0.2339,0.354298,0.86477
60000,0.2267,0.361462,0.866716


TrainOutput(global_step=65492, training_loss=0.2737070251485347, metrics={'train_runtime': 8204.9603, 'train_samples_per_second': 127.712, 'train_steps_per_second': 7.982, 'total_flos': 1.7913149254925424e+16, 'train_loss': 0.2737070251485347, 'epoch': 1.0})