In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW, SGD
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [17]:
# Load the three-class dataset from the notebook's data folder.
df = pd.read_csv(filepath_or_buffer="./data/data_3class.csv")

In [21]:
def get_core(s, marker="Containment Procedures:", max_chars=512, fallback_half_width=106):
    """Return the most informative excerpt of a report.

    If ``marker`` occurs in ``s``, the excerpt starts at the first occurrence
    and is at most ``max_chars`` characters long. Otherwise the excerpt is a
    window of up to ``2 * fallback_half_width`` characters centred on the
    middle of the text.

    Args:
        s: Report text. Non-string values (for example the NaN floats pandas
            uses for missing cells) are returned unchanged so they can be
            dropped later instead of raising here.
        marker: Heading that introduces the section to keep.
        max_chars: Maximum excerpt length when the marker is found.
        fallback_half_width: Characters kept on each side of the midpoint when
            the marker is absent.

    Returns:
        The excerpt as a string, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s

    start = s.find(marker)
    if start != -1:
        return s[start:start + max_chars]

    centre = len(s) // 2
    # Clamp at 0: a negative start index would wrap around and return only the
    # tail of a short text instead of the whole thing.
    window_start = max(0, centre - fallback_half_width)
    return s[window_start:centre + fallback_half_width]

In [22]:
# `mid` is a second name for the same DataFrame object as `df` (no copy is
# made), so replacing the "report" column here is also visible through `df`.
mid = df
core_reports = mid["report"].apply(get_core)
mid["report"] = core_reports
mid

Unnamed: 0.1,Unnamed: 0,item,class,report
0,0,SCP-6501,safe,Containment Procedures: The cemetery in which ...
1,1,SCP-6502,safe,Containment Procedures: To maximize available ...
2,2,SCP-6503,keter,ch as an epic poem or a warrior's creed have b...
3,3,SCP-6504,euclid,Containment Procedures: Global psychiatric res...
4,5,SCP-6506,safe,Containment Procedures: SCP-6506 is currently ...
...,...,...,...,...
7674,8214,SCP-7995,euclid,Containment Procedures: All information regard...
7675,8215,SCP-7996,keter,Containment Procedures: Due to the unpredictab...
7676,8216,SCP-7997,keter,Containment Procedures: The Department of Proc...
7677,8217,SCP-7998,keter,Containment Procedures: Due to a lack of knowl...


In [24]:
# Save without the row index. The frame already carries 'Unnamed: 0.1' and
# 'Unnamed: 0' columns (presumably left behind by earlier index-writing saves);
# writing the index again would add yet another such column on the next load.
mid.to_csv("data3_core.csv", index=False)

In [None]:



# Drop rows with a missing report or label, then encode the labels as integers.
# Note: `df['report'] = df['report'].dropna()` does not remove anything, because
# the assignment realigns on the index and reinserts NaN for the dropped rows.
# A missing label would also be encoded as -1 by `.cat.codes`, which is not a
# valid class index, so those rows are dropped as well.
df = (
    df.dropna(subset=['report', 'class'])
      .assign(**{'class': lambda frame: frame['class'].astype('category').cat.codes})
)

# Split the dataset into train and validation sets (80/20, fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['report'].tolist(),
    df['class'].tolist(),
    test_size=0.2,
    random_state=42
)

In [2]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification

In [4]:
# Tokenize both splits with identical settings: truncate to at most 1024 tokens
# and pad each split to a common length.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
tokenize_options = dict(truncation=True, padding=True, max_length=1024)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
class SCPDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by sample
            position (it must provide ``.items()``).
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split in a dataset, then batch it. Only the training loader shuffles.
train_dataset, val_dataset = (
    SCPDataset(train_encodings, train_labels),
    SCPDataset(val_encodings, val_labels),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [None]:
# Fine-tune a sequence classifier and report validation loss/accuracy per epoch.
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=df['class'].nunique())
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=df['class'].nunique())

optimizer = AdamW(model.parameters(), lr=5e-5)
# optimizer = SGD(model.parameters(), lr=5e-5)

# Single source of truth for the epoch count: the scheduler's total step count
# is derived from it, so the linear decay always spans exactly the training run.
epochs = 3
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Not used by the loops below: each batch carries a 'labels' entry and the loss
# is read from `outputs.loss`. Kept so the name stays defined for other cells.
loss_fn = CrossEntropyLoss()

for e in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The scheduler advances once per optimizer step (per batch, not per epoch).
        lr_scheduler.step()

    avg_train_loss = train_loss / len(train_loader)

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            val_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch['labels']).sum().item()
            total += batch['labels'].size(0)

    # Averages are per batch; accuracy is per sample.
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct / total
    print(f"Epoch {e + 1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 1/192 [05:06<16:16:28, 306.75s/it]

In [None]:
# Persist the fine-tuned weights and config to the target directory.
# `from_pt` is an option for loading checkpoints via `from_pretrained`, not for
# saving, so it is not passed here.
model.save_pretrained("/content/drive/MyDrive/SCP-Classifications-Longformer-epoch3")