<a href="https://colab.research.google.com/github/CSAKAS/GPT2ft/blob/main/5212.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd

# Location of the arXiv dataset on the mounted Google Drive.
csv_path = '/content/drive/MyDrive/comp5212/arxiv100.csv'

# Read the whole file into memory and preview the first rows.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,title,abstract,label
0,The Pre-He White Dwarfs in Eclipsing Binaries....,We report the first $BV$ light curves and hi...,astro-ph
1,A Possible Origin of kHZ QPOs in Low-Mass X-ra...,A possible origin of kHz QPOs in low-mass X-...,astro-ph
2,The effects of driving time scales on heating ...,Context. The relative importance of AC and D...,astro-ph
3,A new hard X-ray selected sample of extreme hi...,Extreme high-energy peaked BL Lac objects (E...,astro-ph
4,The baryon cycle of Seven Dwarfs with superbub...,"We present results from a high-resolution, c...",astro-ph


In [90]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Keep the original string categories in their own column so this cell can be
# re-run safely: without it, a second run would fit the encoder on the already
# encoded integers and `le.classes_` would no longer hold the category names.
if 'label_name' not in df.columns:
    df['label_name'] = df['label']

# Map each category name to an integer id; `df['label']` holds the ids afterwards.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label_name'])

labels = df.label
# NOTE(review): title and abstract are concatenated without a separator --
# confirm the abstracts start with whitespace, otherwise the last title word
# and the first abstract word are fused.
texts = df.title + df.abstract

# 90% / 10% train/test split with a fixed seed. The token encodings cached on
# Drive are derived from `train_texts` / `test_texts`, so changing `test_size`
# or `random_state` requires re-tokenizing and re-saving them.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.1, random_state=1
)

In [None]:
!pip install transformers

In [None]:
# tokenizer
from transformers import GPT2Tokenizer

# Inputs longer than this many tokens are truncated; the GPT-2 position
# embedding table used by the model has 1024 entries.
MAX_LENGTH = 1024

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


def encode_texts(texts, max_length=MAX_LENGTH):
    """Tokenize every text into a list of GPT-2 token ids.

    Args:
        texts: iterable of strings to encode.
        max_length: maximum number of tokens kept per text; longer texts are
            truncated.

    Returns:
        A list with one list of token ids per input text, in input order.
    """
    return [
        tokenizer.encode(text, truncation=True, max_length=max_length)
        for text in texts
    ]


# Same encoding routine for both splits instead of two copy-pasted loops.
train_encoding = encode_texts(train_texts)
test_encoding = encode_texts(test_texts)

In [91]:
# load
import numpy as np

# Restore the token id lists previously written with np.save and turn the
# arrays back into plain Python lists.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load .npy files
# that this notebook wrote itself.
train_encoding = np.load(
    '/content/drive/MyDrive/comp5212/train_encoding.npy', allow_pickle=True
).tolist()

test_encoding = np.load(
    '/content/drive/MyDrive/comp5212/test_encoding.npy', allow_pickle=True
).tolist()

In [5]:
# save encoding
import numpy as np

# The token sequences have different lengths, so the array has to be created
# with dtype=object: recent NumPy versions raise ValueError when asked to build
# an array from ragged nested lists without it.
train_encoding_saved = np.array(train_encoding, dtype=object)
np.save('/content/drive/MyDrive/comp5212/train_encoding.npy', train_encoding_saved)

test_encoding_saved = np.array(test_encoding, dtype=object)
np.save('/content/drive/MyDrive/comp5212/test_encoding.npy', test_encoding_saved)

In [92]:
# data_loader
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

class TensorDataset(Dataset):
    """Dataset that pairs each input row with the target row at the same index.

    Defining this class rebinds the name ``TensorDataset`` that the import at
    the top of this cell brought in from ``torch.utils.data``; from here on the
    name refers to this class.

    Args:
        data_tensor: indexable collection of inputs (one entry per example).
        target_tensor: indexable collection of targets, same length as
            ``data_tensor``.

    Raises:
        ValueError: if the two collections do not have the same length.
    """

    def __init__(self, data_tensor, target_tensor):
        super().__init__()
        # Fail early instead of raising IndexError (or silently dropping
        # targets) somewhere in the middle of an epoch.
        if len(data_tensor) != len(target_tensor):
            raise ValueError(
                "data and target must have the same length, got %d and %d"
                % (len(data_tensor), len(target_tensor))
            )
        self.data = data_tensor
        self.target = target_tensor

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.data[index], self.target[index]

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

# GPT-2 defines no padding token and id 0 is an ordinary vocabulary entry, so
# pad with the end-of-text id of the "gpt2" checkpoint instead. The model's
# `pad_token_id` has to be set to the same value for it to locate the last
# real token of each padded row.
PAD_TOKEN_ID = 50256
NUM_CLASSES = 10  # must match `num_labels` of the classification head
BATCH_SIZE = 16


def _pad_token_sequences(sequences):
    """Right-pad variable-length token id sequences into one 2-D long tensor."""
    # torch.as_tensor accepts both lists and tensors, so re-running this cell
    # on already padded data does not emit a copy-construct warning.
    tensors = [torch.as_tensor(seq, dtype=torch.long) for seq in sequences]
    return pad_sequence(tensors, batch_first=True, padding_value=PAD_TOKEN_ID)


def _one_hot_labels(labels):
    """Turn integer class ids into float one-hot rows of width NUM_CLASSES."""
    class_ids = torch.as_tensor([int(label) for label in labels], dtype=torch.long)
    return torch.nn.functional.one_hot(class_ids, num_classes=NUM_CLASSES).float()


train_encoding = _pad_token_sequences(train_encoding)
test_encoding = _pad_token_sequences(test_encoding)

# NOTE(review): float one-hot targets make Hugging Face classification heads
# infer a multi-label problem (BCEWithLogitsLoss); pass integer class ids as
# `labels` if single-label cross-entropy is intended -- confirm.
train_labels_tensor = _one_hot_labels(train_labels)
test_labels_tensor = _one_hot_labels(test_labels)

train_dataset = TensorDataset(train_encoding, train_labels_tensor)
test_dataset = TensorDataset(test_encoding, test_labels_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps per-example
# outputs comparable between runs.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(train_loader)
print('Created `train_dataloader` with %d batches!'%len(train_loader))

<torch.utils.data.dataloader.DataLoader object at 0x7fc96b3fb3d0>
Created `train_dataloader` with 5625 batches!


In [94]:
# cuda
import torch

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [95]:
# model
from transformers import GPT2ForSequenceClassification,GPT2Config

# Start from the pretrained "gpt2" configuration and attach a 10-way
# classification head.
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path="gpt2", num_labels=10)

# GPT-2 ships without a padding token, and the sequence-classification head
# refuses batches larger than one unless `pad_token_id` is defined (it uses it
# to find the last non-padding token of every row). Reuse the end-of-text token.
model_config.pad_token_id = model_config.eos_token_id

model = GPT2ForSequenceClassification.from_pretrained("gpt2", config = model_config)
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=10, bias=False)
)

In [96]:
# hyper para + optimizer/scheduler
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 20

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch.
total_steps = epochs * len(train_loader)

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# Learning rate decays linearly to zero over `total_steps`, with no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Smoke test: push a single batch through the model and print the raw output.
# Accumulators are reset here; the single-batch check below does not use them.
predictions_labels = []
true_labels = []
total_loss = 0
for batch in train_loader:
  # Each batch is an (input_ids, labels) pair. Move both to the model's device,
  # and use the name `inputs` so the builtin `input` is not shadowed.
  inputs = {'input_ids': batch[0].to(device), 'labels': batch[1].to(device)}
  # No gradients are needed just to inspect the output.
  with torch.no_grad():
    outputs = model(**inputs)
  print(outputs)
  break

In [None]:
# train
from tqdm import tqdm


def train_model():
  """Train the model for one pass over `train_loader`.

  Each batch from the loader is an ``(input_ids, labels)`` pair, not a dict.
  Labels may be one-hot rows; they are reduced to integer class ids so that the
  recorded true labels line up one-to-one with the predicted class ids and the
  model receives long-typed targets.

  The model configuration must define `pad_token_id` for batches larger than
  one example.

  Returns:
    tuple: ``(predictions_labels, true_labels, avg_epoch_loss)`` -- predicted
    class ids, true class ids (both flat Python lists in batch order) and the
    mean loss per batch.
  """
  predictions_labels = []
  true_labels = []
  total_loss = 0.0

  model.train()
  for input_ids, labels in tqdm(train_loader, total=len(train_loader)):
    # Collapse one-hot targets to class ids; leave already-flat ids untouched.
    class_ids = labels.argmax(dim=-1) if labels.dim() > 1 else labels
    class_ids = class_ids.type(torch.long)
    true_labels += class_ids.flatten().tolist()

    input_ids = input_ids.type(torch.long).to(device)
    class_ids = class_ids.to(device)

    optimizer.zero_grad()
    outputs = model(input_ids=input_ids, labels=class_ids)
    loss, logits = outputs[:2]
    total_loss += loss.item()
    loss.backward()
    # Clip the gradient norm to 1.0 before the optimizer step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
    predictions_labels += logits.argmax(axis=-1).flatten().tolist()

  # Guard against an empty loader so the average never divides by zero.
  avg_epoch_loss = total_loss / max(len(train_loader), 1)
  print(avg_epoch_loss)
  return predictions_labels, true_labels, avg_epoch_loss

In [None]:
#train_model()

  0%|          | 0/5625 [00:00<?, ?it/s]


AttributeError: ignored