In [1]:
!git clone https://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@github.com/Amsterdam-Internships/super-weak
!pip install -r super-weak/requirements.txt
%cd super-weak

fatal: destination path 'super-weak' already exists and is not an empty directory.
/content/super-weak


In [1]:
import torch
import transformers
from transformers import pipeline

In [7]:
def init_device():
  if torch.cuda.is_available():
    device = torch.device("cuda")
    print("There are %d GPU(s) available." % torch.cuda.device_count())
    print("Using GPU: ", torch.cuda.get_device_name(0))

  else:
    print("No GPU available, using CPU instead.")
    device = torch.device("cpu")
  
init_device()

No GPU available, using CPU instead.


In [None]:
def read_conll(path):
  text = []
  labels = []
  with open(path, mode = "r") as file: 
    raw_text = file.read()
    docs = raw_text.split("-DOCSTART- -DOCSTART- O")
    token_docs = []
    tag_docs = []
    for doc in docs[1:]:
      tokens = []
      tags = []
      for line in doc.split("\n"):
        if line == "":
          continue
        else:
          token, pos, ent = line.split(" ")
          tokens.append(token)
          tags.append(ent)
      token_docs.append(tokens)
      tag_docs.append(tags)
  
  return token_docs, tag_docs

train_texts, train_tags = read_conll("data/ned_train.txt")
val_texts, val_tags = read_conll("data/ned_testa.txt")

In [None]:
!wget http://noisy-text.github.io/2017/files/wnut17train.conll

--2021-05-20 11:40:16--  http://noisy-text.github.io/2017/files/wnut17train.conll
Resolving noisy-text.github.io (noisy-text.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to noisy-text.github.io (noisy-text.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493781 (482K) [application/octet-stream]
Saving to: ‘wnut17train.conll’


2021-05-20 11:40:16 (7.10 MB/s) - ‘wnut17train.conll’ saved [493781/493781]



In [None]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

texts, tags = read_wnut('wnut17train.conll')

from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [None]:
print(train_texts[0][10:17], train_tags[0][10:17], sep='\n')

['maar', 'het', 'bericht', 'werd', 'alvast', 'bekendgemaakt', 'door']
['O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
unique_tags = set(tag for doc in train_tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [None]:
from transformers import XLMRobertaTokenizerFast

tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
train_encodings = tokenizer(train_texts, is_split_into_words = True, return_offsets_mapping = True, truncation = True)
val_encodings = tokenizer(val_texts, is_split_into_words = True, return_offsets_mapping = True, truncation = True) 

In [None]:
import numpy as np

def encode_tags(tags, encodings):
  labels = [[tag2id[tag] for tag in doc] for doc in tags]
  encoded_labels = []
  for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
    print(len(doc_labels), len(doc_offset))
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
    encoded_labels.append(doc_enc_labels.tolist())

  return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

749 512


ValueError: ignored

In [None]:
label_all_tokens = True

def tokenize_and_align_labels(examples, labels):
    tokenized_inputs = tokenizer(examples, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = tokenize_and_align_labels(train_texts, train_tags)
tokenized_val = tokenize_and_align_labels(val_texts, val_tags)

In [None]:
class CoNLL2002(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_set = CoNLL2002(tokenized_train.input_ids, tokenized_train.labels)
val_set = CoNLL2002(tokenized_val.input_ids, tokenized_val.labels)

In [None]:
tokenized_train.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
from datasets import load_dataset
raw_datasets = load_dataset("conll2002", "nl")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2632.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2012.0, style=ProgressStyle(description…


Downloading and preparing dataset conll2002/nl (download: 3.47 MiB, generated: 7.74 MiB, post-processed: Unknown size, total: 11.21 MiB) to /root/.cache/huggingface/datasets/conll2002/nl/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=571224.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=110052.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=193696.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset conll2002 downloaded and prepared to /root/.cache/huggingface/datasets/conll2002/nl/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5. Subsequent calls will reuse this data.


In [None]:
raw_datasets["train"].features

{'id': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(num_classes=12, names=['Adj', 'Adv', 'Art', 'Conj', 'Int', 'Misc', 'N', 'Num', 'Prep', 'Pron', 'Punc', 'V'], names_file=None, id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




In [None]:
def tokenize_function(examples):
  return tokenizer(examples["tokens"], padding = "max_length", truncation = True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched = True)

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




TypeError: ignored

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels = 2)

In [None]:
from transformers import Trainer
import numpy as np
from datasets import load_metric
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", evaluation_strategy = "epoch")

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis = -1)
  return metric.compute(predictions = predictions, references = labels)

trainer = Trainer(model = model,
                  args = training_args,
                  train_dataset = small_train_dataset,
                  eval_dataset = small_eval_dataset,
                  compute_metrics = compute_metrics
                  )

trainer.train()
trainer.evaluate()

In [None]:
nlp = pipeline("ner", "xlm-roberta-base")

sequence = """Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window."""

In [None]:
print(nlp(sequence))

In [3]:
################## HIER BEGINT CODE #################################
task = "ner"
model_checkpoint = "xlm-roberta-large"
batch_size = 8

In [6]:
from datasets import Dataset, ClassLabel, Sequence
import spacy

nlp = spacy.load("nl_core_news_md")

sample = ["Als ik geweten had dat Sara religieus was, dan had ik niet stoned naar Amsterdam gekomen.", 
          "Tyler Childers is een geweldig muzikant uit Kentucky, USA.",
          "Tyler Childers is net als Sturgill Simpson een country muzikant."]

processed_sample = list(nlp.pipe(sample))

def relabel(ent_label: str) -> str:
  """
  returns ConLL-2002 label of Spacy labelled entity
  """
  mappings = {"PERSON":"PER", "COMPANY":"ORG", "GPE":"LOC", 'EVENT':"MISC", 'FAC':"MISC", 'LANGUAGE':"MISC", 'LAW':"MISC", 'NORP':"MISC", 'PRODUCT':"MISC",'WORK_OF_ART':"MISC", "MISC":"MISC", "PER":"PER", "ORG":"ORG", "LOC":"LOC"}    
  exclude = {"CARDINAL", "ORDINAL", "DATE", "PERCENT", "QUANTITY", "TIME", "MONEY"}

  return mappings[ent_label] if ent_label != "" and ent_label not in exclude else None

def convert_ent(token) -> str:
  """
  returns ConLL-2002 IOB style entity label of Spacy token
  """
  return token.ent_iob_ + "-" + relabel(token.ent_type_) if relabel(token.ent_type_) else token.ent_iob_

def process_spacy(docs: list):
  store = []
  tokens = []
  ids = []

  c = 0
  classlabels = ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'])
  for doc in processed_sample:
    ents = [classlabels.str2int(convert_ent(tok)) for tok in doc]
    toks = [token.text for token in doc]
    store.append(ents)
    tokens.append(toks)
    ids.append(str(c))
    c += 1 
    
  d = {"ids" : ids,
       "ner_tags" : store,
       "tokens" : tokens}

  class_sequence = Sequence(feature =  classlabels, id = None)
  ds = Dataset.from_dict(d)
  ds.features["ner_tags"] = class_sequence
  return ds

ds = process_spacy(processed_sample)
ds

Dataset({
    features: ['ids', 'ner_tags', 'tokens'],
    num_rows: 3
})

In [7]:
from datasets import load_dataset, load_metric, Dataset

In [8]:
datasets = load_dataset("conll2002", "nl")

Downloading:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading and preparing dataset conll2002/nl (download: 3.47 MiB, generated: 7.74 MiB, post-processed: Unknown size, total: 11.21 MiB) to /root/.cache/huggingface/datasets/conll2002/nl/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5...


Downloading:   0%|          | 0.00/571k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/194k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conll2002 downloaded and prepared to /root/.cache/huggingface/datasets/conll2002/nl/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5. Subsequent calls will reuse this data.


In [85]:
label_list = ds.features[f"{task}_tags"].feature.names
label_list 

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
from transformers import XLMRobertaTokenizerFast

tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
tokenizer("Hello, this is one sentence!")

{'input_ids': [0, 35378, 4, 903, 83, 1632, 149357, 38, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
example = datasets["train"][4]
print(example["tokens"])

['In', 'eerste', 'aanleg', 'werd', 'Vandenbussche', 'begin', 'de', 'jaren', "'90", 'veroordeeld', 'wegens', 'belangenvermenging', 'maar', 'later', 'vrijgesproken', 'door', 'het', 'hof', 'van', 'beroep', 'in', 'Gent', '.']


In [None]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words = True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['<s>', '▁In', '▁eerste', '▁aan', 'leg', '▁werd', '▁Vand', 'en', 'bus', 'sche', '▁begin', '▁de', '▁jaren', "▁'", '90', '▁ver', 'oordeel', 'd', '▁wegen', 's', '▁belang', 'en', 'ver', 'men', 'ging', '▁maar', '▁later', '▁vrij', 'gesproken', '▁door', '▁het', '▁hof', '▁van', '▁beroep', '▁in', '▁Gent', '▁', '.', '</s>']


In [None]:
len(example[f"{task}_tags"]), len(tokenized_input["input_ids"])

(23, 39)

In [None]:
print(tokenized_input.word_ids())

[None, 0, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 9, 10, 10, 11, 11, 11, 11, 11, 12, 13, 14, 14, 15, 16, 17, 18, 19, 20, 21, 22, 22, None]


In [None]:
label_all_tokens = True

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[0, 262, 8484, 131, 225, 34784, 83, 1850, 1348, 213257, 62178, 1476, 225, 70071, 11281, 144, 36661, 22427, 98098, 1911, 293, 136707, 94257, 607, 107950, 56219, 23, 80254, 112, 6, 5, 2], [0, 360, 242, 13556, 20324, 79635, 18, 8, 97383, 192, 3986, 310, 143, 302, 184, 43229, 8, 184410, 435, 293, 335, 12236, 607, 225, 36179, 233, 8518, 35586, 3188, 19161, 233, 17929, 6, 5, 2], [0, 46100, 478, 2], [0, 136685, 83, 107950, 56219, 6543, 435, 747, 6, 209325, 33, 23, 45811, 6, 4, 1476, 225, 335, 12236, 76405, 68, 64714, 9329, 107, 1409, 2610, 6, 4, 509, 171, 7, 131010, 14246, 80, 344, 46870, 1515, 14949, 13, 58066, 33, 4223, 7560, 4319, 6, 7560, 20143, 6492, 18, 131, 153, 2], [0, 360, 8155, 664, 2828, 11281, 58066, 33, 4223, 7560, 9842, 8, 30392, 242, 5039, 493, 73218, 71, 37528, 7, 47042, 33, 814, 1055, 9966, 1476, 14432, 34234, 132121, 1911, 225, 85253, 131, 93484, 23, 44704, 6, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
from transformers import XLMRobertaForTokenClassification, TrainingArguments, Trainer

model = XLMRobertaForTokenClassification.from_pretrained(model_checkpoint, num_labels = len(label_list))

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

In [None]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=3, 
    load_best_model_at_end=True
)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [None]:
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'LOC': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'PER': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [None]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
!pip show transformers

Name: transformers
Version: 4.6.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Suraj Patil, Stas Bekman, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /usr/local/lib/python3.7/dist-packages
Requires: regex, huggingface-hub, sacremoses, packaging, filelock, tqdm, importlib-metadata, numpy, requests, tokenizers
Required-by: 


In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


RuntimeError: ignored

In [None]:
trainer.evaluate()

{'epoch': 3.0,
 'eval_accuracy': 0.9858934732604946,
 'eval_f1': 0.8956113418818855,
 'eval_loss': 0.07510825991630554,
 'eval_mem_cpu_alloc_delta': 704512,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 45092864,
 'eval_precision': 0.8962588107717332,
 'eval_recall': 0.8949648077964266,
 'eval_runtime': 26.9308,
 'eval_samples_per_second': 107.535}

In [None]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay          69G   69G     0 100% /
tmpfs            64M     0   64M   0% /dev
tmpfs           6.4G     0  6.4G   0% /sys/fs/cgroup
shm             5.8G  4.0K  5.8G   1% /dev/shm
tmpfs           6.4G   28K  6.4G   1% /var/colab
/dev/sda1        75G   70G  4.9G  94% /opt/bin
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware
