In [40]:
import os

model_name = 'gpt2'

output_dir = 'runs'

low_rank = 8
alpha = 8

In [11]:
from adapters import init, LoRAConfig
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name)
init(model)

adapter_config = LoRAConfig(r=low_rank, alpha=alpha)
model.add_adapter("style_adapter", config=adapter_config, set_active=True)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

tweets = load_dataset("tweet_eval", "sentiment")



Downloading builder script: 100%|██████████| 9.72k/9.72k [00:00<00:00, 16.4MB/s]
Downloading metadata: 100%|██████████| 30.4k/30.4k [00:00<00:00, 25.9MB/s]
Downloading readme: 100%|██████████| 21.9k/21.9k [00:00<00:00, 39.4MB/s]
Downloading data: 4.97MB [00:00, 36.9MB/s]/6 [00:00<?, ?it/s]
Downloading data: 91.2kB [00:00, 55.2MB/s]                   .24s/it]
Downloading data: 1.16MB [00:00, 16.1MB/s]                  1.30it/s]
Downloading data: 24.6kB [00:00, 9.92MB/s]                   .35it/s]
Downloading data: 219kB [00:00, 11.4MB/s]                    .63it/s]
Downloading data: 4.00kB [00:00, 4.10MB/s]                  1.67it/s]
Downloading data files: 100%|██████████| 6/6 [00:03<00:00,  1.60it/s]
Extracting data files: 100%|██████████| 6/6 [00:00<00:00, 1325.28it/s]
Generating train split: 100%|██████████| 45615/45615 [00:01<00:00, 29980.60 examples/s]
Generating test split: 100%|██████████| 12284/12284 [00:00<00:00, 30061.54 examples/s]
Generating validation split: 100%|█████████

In [16]:
from transformers import AutoTokenizer

dataset_raw = tweet_text

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [32]:
def tokenize_dataset(data: list[str]) -> list[list[int]]:
    return tokenizer(data, truncation=False, padding=False, add_special_tokens=True)['input_ids']

dataset_tokenized = {k: tokenize_dataset(v) for k, v in dataset_raw.items()}

In [34]:
from itertools import chain

def chunk(input_ids: list[list[int]], chunk_size: int = None):
    '''
    Util function for grouping and chunking text
    Adapted from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
    '''
    
    if chunk_size is None:
        chunk_size = tokenizer.model_max_length

    input_ids_concat = list(chain(*input_ids))
    total_length = len(input_ids_concat)

    total_length = (total_length // chunk_size) * chunk_size
    chunks = [input_ids_concat[i : i + chunk_size] for i in range(0, total_length, chunk_size)]

    return chunks

dataset_tokenized = {k: chunk(v) for k, v in dataset_tokenized.items()}

In [39]:
def make_dataset(input_ids: list[list[int]]) -> list[dict]:
    return {
        'input_ids': input_ids,
        'labels': input_ids.copy(),
        'attention_mask': [[1] * len(x) for x in input_ids]
    }

dataset_tokenized = {k: make_dataset(v) for k, v in dataset_tokenized.items()}

In [43]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# adapted from: https://huggingface.co/docs/transformers/tasks/language_modeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

lr = 1e-4
num_epochs = 10
exp_name = f'{model_name}_lr_{lr}_r_{low_rank}_a_{alpha}'

args = TrainingArguments(
    output_dir=os.path.join(output_dir, exp_name),
    evaluation_strategy='epoch',
    learning_rate=lr,
    num_train_epochs=num_epochs,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_tokenized['train'],
    eval_dataset=dataset_tokenized['validation'],
    data_collator=data_collator,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
from model import StyleModel, ModelArgs

model_args = ModelArgs(
    model_name='gpt2',
    is_mlm=False,
    from_pretrained='runs/gpt2_TweetData_16_8_10_8_0.0001/checkpoint-1510/style_adapter'
)

model = StyleModel(
    model_args
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = model.tokenizer
model = model.model

In [4]:
inputs = tokenizer('Is the sky green? ', return_tensors='pt').to('cuda')

In [10]:
outputs = tokenizer('Yes', return_tensors='pt').to('cuda')

In [35]:
from data import RedditData

data = RedditData()

In [12]:
def parse_path(path):
    run_id, checkpoint_id = path.split('/')[1:3]
    checkpoint_num = int(checkpoint_id.split('-')[1])
    model_name, data, r, alpha, epochs, batch_size, lr = run_id.split('_')

    return {
        'model_name': model_name,
        'data': data,
        'r': int(r),
        'alpha': int(alpha),
        'epochs': int(epochs),
        'batch_size': int(batch_size),
        'lr': float(lr),
        'num_steps': int(checkpoint_num)
    }

In [30]:
import glob
import pandas as pd

files = glob.glob('data/*.csv')
combined_df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)