# Compute and store all reps

## Set-up

In [1]:
import os
import pickle

import transformers
from transformers import AutoModel, AutoTokenizer

import break_utils

In [2]:
# Raise the transformers logging threshold to ERROR for the rest of the
# notebook, so the model-loading cells below are not flooded with messages.
transformers.logging.set_verbosity(transformers.logging.ERROR)

In [3]:
# File name that `get_reps` hands to `break_utils.load_annotated_dataset` as
# its first argument. Presumably a CSV of annotated data resolved relative to
# the working directory -- confirm against `break_utils`.
ann_filename = "annotated_break_data.csv"

In [4]:
def get_reps(weights_name, layers=(1, 6, 12)):
    """Build the annotated dataset for one pretrained model and pickle it.

    Loads the tokenizer and model identified by `weights_name`, passes them
    together with the module-level `ann_filename` and `layers` to
    `break_utils.load_annotated_dataset`, and writes the returned object to
    ``reps/<weights_name with '/' replaced by '_'>_df.pickle``.

    Parameters
    ----------
    weights_name : str
        Identifier accepted by `AutoTokenizer.from_pretrained` and
        `AutoModel.from_pretrained`, e.g. ``'roberta-base'``.
    layers : tuple of int, optional
        Forwarded unchanged as the `layers` argument of
        `break_utils.load_annotated_dataset`; presumably the indices of the
        model layers to extract -- confirm against `break_utils`.

    Returns
    -------
    None
        The result is only written to disk; an existing pickle for the same
        `weights_name` is overwritten.
    """
    out_path = f"reps/{weights_name.replace('/', '_')}_df.pickle"
    # Make sure the output directory exists *before* the slow model pass, so a
    # missing `reps/` folder cannot throw away minutes of computation when the
    # file is finally opened for writing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    df = break_utils.load_annotated_dataset(
        ann_filename,
        AutoTokenizer.from_pretrained(weights_name),
        model=AutoModel.from_pretrained(weights_name),
        layers=layers)

    with open(out_path, "wb") as f:
        pickle.dump(df, f)

## RoBERTa-base

In [5]:
%%time
# No `layers` argument: the default selection (1, 6, 12) of `get_reps` is used.
get_reps(weights_name="roberta-base")

CPU times: user 3min 40s, sys: 2.79 s, total: 3min 43s
Wall time: 3min 41s


## RoBERTa-large

In [6]:
%%time
# Five layers are requested here instead of the default three.
get_reps(weights_name="roberta-large", layers=(1, 6, 12, 18, 24))
# Alternative kept for reference (layer 24 only). Note that running it would
# overwrite the same pickle file, since the file name depends only on the
# weights name:
# get_reps('roberta-large', layers=(24,))

CPU times: user 12min 50s, sys: 8.14 s, total: 12min 58s
Wall time: 12min 48s


## BERT-base

In [7]:
%%time
# No `layers` argument: the default selection (1, 6, 12) of `get_reps` is used.
get_reps(weights_name="bert-base-cased")

CPU times: user 3min 50s, sys: 2.41 s, total: 3min 53s
Wall time: 3min 53s


## BERT-large

In [8]:
# Five layers are requested here instead of the default three.
get_reps(weights_name="bert-large-cased", layers=(1, 6, 12, 18, 24))

## DeBERTa-v3-base

In [9]:
# The '/' in the weights name is replaced by '_' in the output file name.
# Default layer selection (1, 6, 12).
get_reps(weights_name="microsoft/deberta-v3-base")

## DeBERTa-v3-large

In [10]:
# Five layers are requested here instead of the default three.
get_reps(weights_name="microsoft/deberta-v3-large", layers=(1, 6, 12, 18, 24))

## DeBERTa-base

In [11]:
# No `layers` argument: the default selection (1, 6, 12) of `get_reps` is used.
get_reps(weights_name="microsoft/deberta-base")

## DeBERTa-large

In [12]:
%%time
# Five layers are requested here instead of the default three.
get_reps(weights_name="microsoft/deberta-large", layers=(1, 6, 12, 18, 24))

CPU times: user 16min 34s, sys: 8.25 s, total: 16min 43s
Wall time: 16min 30s
