In [1]:
import torch 
import pandas as pd 
import numpy as np
import pandas as pd
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os


In [2]:
from sklearn.preprocessing import LabelEncoder

# Other corpora this notebook has been pointed at; swap one of these reads in to change dataset.
# df= pd.read_csv("./dataset/20ng/20ng_over512.csv", names = ["label", "text"])
# df= pd.read_csv("./dataset/hyper/hy.csv")
# df= pd.read_csv("./dataset/IMDB/IMDB_over512.csv", names = ["label", "text"])

# Active dataset; the two column names are supplied explicitly through `names`.
df = pd.read_csv('./dataset/yelp/yelp_over512.csv', names=['label', 'text'])

# Replace the raw label values with integer class ids, keeping the fitted encoder in LE.
LE = LabelEncoder()
LE.fit(df['label'])
df['label'] = LE.transform(df['label'])

In [3]:
# Preview the 'text' column of the loaded frame.
df['text']

0        After a morning of Thrift Store hunting, a fri...
1        I will start by saying we have a nice new deck...
2        Two meals, on the recommendation of a friend w...
3        I have to say that I write this review with mu...
4        THE WORST DENTAL EXPERIENCE OF MY LIFE.  THEY ...
                               ...                        
12014    Let me start by telling you what they got righ...
12015    The Marvel Experience has a lot of potential. ...
12016    I know what The Marvel Experience feels like w...
12017    Went for dinner with friends tonight. The plac...
12018    Really disappointing. Either the place was hav...
Name: text, Length: 12019, dtype: object

In [4]:
# Build a default-constructed LongformerConfig and display it for inspection.
# NOTE(review): `config` is not passed to the model loaded further down in this
# notebook — confirm it is only meant as a reference printout.
config = LongformerConfig()
config

LongformerConfig {
  "attention_probs_dropout_prob": 0.1,
  "attention_window": 512,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "sep_token_id": 2,
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [5]:
# NOTE: this `Dataset` (Hugging Face `datasets`) rebinds the name that the first
# cell imported from torch.utils.data.
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled split is identical after a kernel restart; without
# it every run trains and evaluates on different rows.
SPLIT_SEED = 42

# NOTE(review): test_size=0.8 leaves only 20% of the rows for training — confirm
# this is intentional and not a swapped train/test fraction.
train_data, test_data = train_test_split(
    df,
    test_size=0.8,
    shuffle=True,
    random_state=SPLIT_SEED,
)
print(test_data)

# Convert both pandas frames into Hugging Face datasets for tokenization/training.
train_data = Dataset.from_pandas(train_data)
test_data = Dataset.from_pandas(test_data)

       label                                               text
3705       1  Rich. Decadent. Extravagant. Those are some pr...
6423       0  If you check my history, you'll see I don't of...
5557       0  I'm only giving this place one star because I ...
9571       0  I don't know why I torture myself.  It's been ...
11483      0  Do not spend your money here! Trust me, just g...
...      ...                                                ...
11197      0  I moved to AZ/Skysong from Chicago in April an...
8109       1  When it's that time to shop for new clothes, s...
10946      1  Stayed at Hard Rock for the first time recentl...
10329      0  It pains me to give the services department a ...
232        0  I chose this motel because it is 3 blocks from...

[9616 rows x 2 columns]


In [6]:
# import torch
# torch.cuda.empty_cache()

In [7]:
# Load the pretrained Longformer checkpoint with a 2-label classification head,
# together with the fast tokenizer published under the same checkpoint name.
checkpoint = 'allenai/longformer-base-4096'
model = LongformerForSequenceClassification.from_pretrained(
    checkpoint,
    gradient_checkpointing=False,
    num_labels=2,
    attention_window=512,
)
# tokenizer = LongformerTokenizer.from_pretrained(model_name)

tokenizer = LongformerTokenizerFast.from_pretrained(checkpoint, num_labels=2, max_length=1024)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 

In [8]:
# define accuracy metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        pred: object exposing `label_ids` (the true class ids) and
            `predictions` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Micro averaging pools the counts over all classes before scoring.
    # The scores are computed once; the previous version repeated this call
    # verbatim and threw the first result away.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



In [9]:
# define a function that tokenizes a batch of texts and returns the relevant inputs for the model
def tokenization(batched_text, max_length=1024):
    """Tokenize the 'text' entry of a batch with the notebook-level `tokenizer`.

    Args:
        batched_text: mapping whose 'text' entry holds the raw strings of the batch.
        max_length: length every sequence is padded or truncated to. Defaults
            to 1024, the value that used to be hard-coded here.

    Returns:
        Whatever `tokenizer(...)` returns for the batch.
    """
    return tokenizer(
        batched_text['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Tokenize each split in one pass: batch_size equals the split length, so
# `tokenization` receives the whole split as a single batch.
train_data, test_data = [
    split.map(tokenization, batched=True, batch_size=len(split))
    for split in (train_data, test_data)
]
# summary = tokenizer.decode(predicted_abstract[0], skip_special_tokens=True)


Map:   0%|          | 0/2403 [00:00<?, ? examples/s]

Map:   0%|          | 0/9616 [00:00<?, ? examples/s]

In [10]:

# Format both splits as torch, restricted to the model inputs and the label.
model_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_data, test_data):
    split.set_format('torch', columns=model_columns)


In [11]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = './dataset/IMD/longformer',
    num_train_epochs = 20,
    per_device_train_batch_size = 8,
    # 8 samples per step x 8 accumulated steps per optimizer update (per device).
    gradient_accumulation_steps = 8,
    per_device_eval_batch_size= 16,
    # No evaluation runs during training, so `compute_metrics` is only used
    # when evaluation is triggered explicitly.
    evaluation_strategy = "no",
    disable_tqdm = False,
    # Was True, but with evaluation and checkpoint saving both disabled there
    # is never a "best" checkpoint to reload, so the flag silently did nothing.
    # False states what actually happens. To really select a best model, set
    # evaluation_strategy and save_strategy to the same interval instead.
    load_best_model_at_end=False,
    warmup_steps=200,
    weight_decay=0.01,
    logging_steps = 4,
    fp16 = True,
    learning_rate=3e-5,
    save_strategy = 'no',
    # NOTE(review): logging_dir says 'IMDB' while output_dir says 'IMD' —
    # confirm which directory name is intended.
    logging_dir='./dataset/IMDB/longformer',
    dataloader_num_workers = 0,
    run_name = 'longformer-classification-updated-rtx3090_paper_replication_2_warm'
)

In [13]:
# Record which device type is available, then wire the model, arguments,
# metrics and both splits into a Trainer and start fine-tuning.
# Note: `device` is only assigned here; it is not passed to the Trainer.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)
trainer.train()

RuntimeError: CUDA out of memory. Tried to allocate 144.00 MiB (GPU 0; 47.54 GiB total capacity; 9.45 GiB already allocated; 30.38 MiB free; 9.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [14]:
# Save the trainer's current model to the output directory.
# NOTE(review): the training arguments set evaluation_strategy and save_strategy
# to "no", so no checkpoints are compared during training; this is presumably
# the model as it stands after training rather than a selected "best" one — confirm.
trainer.save_model('./dataset/IMD/longformer')

In [11]:
# trainer.evaluate()

In [12]:
# Release the unused memory held by PyTorch's CUDA caching allocator.
import torch
torch.cuda.empty_cache()

In [13]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [14]:

# Imported at the top of the cell, before `torch` is first used below.
import torch

# Rebuild the classifier from the base checkpoint, then overwrite its weights
# with the fine-tuned ones saved under ./dataset/IMD/longformer.
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    gradient_checkpointing=False,
    num_labels=2,
    attention_window=512,
)
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', num_labels=2, max_length=1024)
pretrained_weights_path = './dataset/IMD/longformer/pytorch_model.bin'  # Specify the path to your pretrained weights file

# map_location='cpu' deserializes every tensor onto the CPU regardless of the
# device it was saved from, so loading cannot fail for lack of free GPU memory.
# load_state_dict then copies the values into the model's own parameters.
state_dict = torch.load(pretrained_weights_path, map_location='cpu')
model.load_state_dict(state_dict)

torch.cuda.empty_cache()

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 

In [15]:
# Run the classifier over every training text and collect the logits.
model.eval()
da = train_data['text']
inputs = tokenizer(da, padding='max_length', truncation=True, max_length=1024, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Forward the texts in small chunks rather than as one (len(da), 1024) batch:
# a single pass over the whole split needs memory proportional to the number
# of texts, while chunking keeps the peak bounded by INFERENCE_BATCH_SIZE.
INFERENCE_BATCH_SIZE = 16
# Send each chunk to whichever device the model currently lives on.
model_device = next(model.parameters()).device

logit_chunks = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], INFERENCE_BATCH_SIZE):
        stop = start + INFERENCE_BATCH_SIZE
        batch_out = model(
            input_ids[start:stop].to(model_device),
            attention_mask=attention_mask[start:stop].to(model_device),
        )
        # Move results back to the CPU so the accumulated logits never pile up
        # on an accelerator.
        logit_chunks.append(batch_out['logits'].cpu())

# Keep the `out['logits']` access pattern that the next cell relies on.
if logit_chunks:
    out = {'logits': torch.cat(logit_chunks, dim=0)}
else:
    out = {'logits': torch.empty((0, model.config.num_labels))}

In [16]:
# inputs = tokenizer.batch_encode_plus(da, add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')

# Convert the logits into nested Python lists.
out1=out['logits'].tolist()

In [17]:
# Sanity-check the shape of the collected logits.
np.shape(out1)

(2403, 2)

In [18]:
import wandb

# `train_data` was switched to the 'torch' format earlier, so its label column
# is not made of plain Python ints; convert explicitly so the DataFrame (and
# the logged table) holds ordinary integers.
y = [int(label) for label in train_data['label']]

# One row per example: the true label and the model output for that example.
# NOTE: `out1` holds the classifier logits, even though the logged key is "Embedding".
df_data = {
            'target':y,
            'feature': out1
        }
# Build the frame before opening the run, so a construction error does not
# leave a dangling W&B run behind.
df_1 = pd.DataFrame(df_data)

wandb.init(project="embedding_3")
try:
    wandb.log({"Embedding": df_1})
finally:
    # Close the run even when logging raises.
    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mimhilaryy1999[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# inputs = tokenizer.batch_encode_plus(da, add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')
# input_ids = inputs['input_ids']
# attention_mask = inputs['attention_mask']
# with torch.no_grad():
#     out = model(input_ids, attention_mask=attention_mask)
    


In [None]:

# np.shape(out)

In [None]:
# np.shape(y)

In [None]:
# import wandb 
# y = df['label'].tolist()
# wandb.init(project="embedding_2")
# out = embeddings.tolist()
# df_data = {
#             'target':y,
#             'feature': out
#         }
# df = pd.DataFrame(df_data)
# wandb.log({"Embedding": df})
# wandb.finish()