In [50]:
import pandas as pd 
import os 
# All three splits live in the same directory; only the file name differs.
PATH_TRAIN, PATH_TEST, PATH_VAL = (
    os.path.join("..", "data", "train_data_ext", csv_name)
    for csv_name in ("train.csv", "test.csv", "val.csv")
)

In [51]:
# Read the three splits in the same order as before: train, test, val.
train_data, test_data, val_data = (
    pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_TEST, PATH_VAL)
)

In [52]:
train_data.head()

Unnamed: 0,text,label
0,"I didn't feel humiliated, which was a surprise...",sadness
1,"I didn't feel humiliated, which was a surprise...",sadness
2,I can go from feeling so hopeless to so damned...,sadness
3,I'm grabbing a minute to post because I feel i...,anger
4,I am ever feeling nostalgic about the fireplac...,love


In [53]:
# Keep the first occurrence of each fully identical row. The surviving rows
# keep their original index labels (the index is not renumbered here).
train_data = train_data.drop_duplicates(keep="first")

In [54]:
train_data.head()

Unnamed: 0,text,label
0,"I didn't feel humiliated, which was a surprise...",sadness
2,I can go from feeling so hopeless to so damned...,sadness
3,I'm grabbing a minute to post because I feel i...,anger
4,I am ever feeling nostalgic about the fireplac...,love
5,"I am feeling extremely grouchy today, and to b...",anger


In [55]:
# Class id -> emotion name; the position in the tuple is the class id.
idx_2_class = dict(
    enumerate(('sadness', 'joy', 'love', 'anger', 'fear', 'surprise'))
)
# Inverse lookup: emotion name -> class id.
class_2_idx = {name: idx for idx, name in idx_2_class.items()}


In [56]:
# train_data["label"] = train_data["label"].replace(class_2_idx)

In [57]:
train_data

Unnamed: 0,text,label
0,"I didn't feel humiliated, which was a surprise...",sadness
2,I can go from feeling so hopeless to so damned...,sadness
3,I'm grabbing a minute to post because I feel i...,anger
4,I am ever feeling nostalgic about the fireplac...,love
5,"I am feeling extremely grouchy today, and to b...",anger
...,...,...
395,I feel incredibly thankful for the lessons I'm...,joy
396,I feel such a profound and unshakeable longing...,love
397,I feel distinctly called in Clermont to focus ...,love
398,I hope you can feel glad that she gave you so ...,joy


In [58]:
from datasets import Dataset
# Wrap each pandas frame in a Hugging Face Dataset (train, test, val order).
ds_train, ds_test, ds_val = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data, val_data)
)

In [59]:
# The helper column '__index_level_0__' is only present when the pandas index
# was carried over into the Dataset (presumably because drop_duplicates left a
# non-contiguous index -- TODO confirm). Drop it only if it exists, so this
# cell does not raise when re-run or when no duplicate rows were removed.
if '__index_level_0__' in ds_train.column_names:
    ds_train = ds_train.remove_columns(['__index_level_0__'])

In [60]:
ds_val

Dataset({
    features: ['text', 'label'],
    num_rows: 200
})

In [61]:
from transformers import AutoTokenizer
# Tokenizer from the same checkpoint name that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="allenai/led-base-16384",
)

In [62]:
# Running maximum, updated as a side effect of find_max_len().
max_len = 0


def find_max_len(paragraph):
    """Raise the global ``max_len`` to ``len(paragraph)`` if that is larger.

    Args:
        paragraph: Any object supporting ``len()``.

    Returns:
        None. The result is stored in the module-level ``max_len``.
    """
    global max_len
    current_len = len(paragraph)
    if current_len > max_len:
        max_len = current_len

In [None]:
# Padding/truncation length for tokenized input text.
# (27814 is the value that the max_len cell displays.)
max_input_length = 4082  # 27814
# Output-side length limit.
max_output_length = 8
# Number of rows handed to each Dataset.map call.
batch_size = 2

In [66]:
max_len

27814

In [69]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of texts and attach integer class labels.

    The model loaded in this notebook is ``LEDForSequenceClassification``,
    which needs ONE class id per example as ``labels``. Tokenizing the label
    string into a padded token sequence (the seq2seq recipe) gives targets of
    the wrong shape for a classification head, so labels are mapped through
    ``class_2_idx`` instead.

    Relies on the notebook globals ``tokenizer``, ``max_input_length`` and
    ``class_2_idx``.

    Args:
        batch: Mapping with a ``"text"`` list of strings and a ``"label"``
            list of emotion names (or already-encoded integer class ids).

    Returns:
        The same ``batch`` with ``input_ids``, ``attention_mask``,
        ``global_attention_mask`` and ``labels`` added.

    Raises:
        KeyError: If a string label is not a key of ``class_2_idx``.
    """
    # Tokenize the inputs, padded/truncated to a fixed length.
    inputs = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample, local attention
    # everywhere else. Each sample gets its own list (no shared references).
    batch["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in batch["input_ids"]
    ]

    # One integer class id per example; labels that are already numeric
    # are passed through unchanged.
    batch["labels"] = [
        class_2_idx[label] if isinstance(label, str) else int(label)
        for label in batch["label"]
    ]

    return batch

In [71]:
# Tokenize the train and validation splits, batch_size rows per call.
train_dataset = ds_train.map(
    process_data_to_model_inputs, batched=True, batch_size=batch_size
)
val_dataset = ds_val.map(
    process_data_to_model_inputs, batched=True, batch_size=batch_size
)

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [72]:
# Same preprocessing as the train/val splits, applied to the test split.
test_dataset = ds_test.map(
    process_data_to_model_inputs, batched=True, batch_size=batch_size
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [73]:
# Expose only the model-input columns, as torch tensors, on every split.
# set_format is called for its in-place effect, so a plain loop is enough.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
    )


In [78]:
# Number of distinct classes in idx_2_class; passed as num_labels when the
# classification model is loaded.
num_labels = len(idx_2_class)
# Bare expression so the notebook displays the value.
num_labels

6

In [None]:
from transformers import LEDForSequenceClassification
# Load the pretrained checkpoint with a classification head sized by
# num_labels. The checkpoint files are cached under ../led_model.
led = LEDForSequenceClassification.from_pretrained(
    "allenai/led-base-16384",
    num_labels=num_labels,
    use_cache=False,
    gradient_checkpointing=True,
    cache_dir="../led_model",
)

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

Some weights of LEDForSequenceClassification were not initialized from the model checkpoint at allenai/led-base-16384 and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

In [77]:
led

LEDForSequenceClassification(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
      (layers): ModuleList(
        (0-5): 6 x LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
    