Transformer(Bert)

In [1]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install evaluate

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.14.6

In [2]:
import random
import pyarrow as pa
import datasets
import tqdm
import string

In [3]:
from datasets import load_dataset
dts=load_dataset('SeyedAli/Persian-Text-Sentiment')

Downloading readme:   0%|          | 0.00/524 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/55852 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/13964 [00:00<?, ? examples/s]

In [4]:
dts

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 55852
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 13964
    })
})

In [5]:
!pip install transformers[torch]



In [6]:
import torch
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification,TrainingArguments,Trainer

In [7]:
def gpu_check():
  torch.__version__
  #If ther 's a GPU available...
  if torch.cuda.is_available():
    #Tell Pytorch to use GPU.
    device = torch.device("cuda")
    print("There are %d GPU(s) avalilable." % torch.cuda.device_count())
    print('We will use the GPU:',torch.cuda.get_device_name(0))
    #If not..
  else:
    print('No GPU available,using the CPU instead.')
    device = torch.device("cpu")
    return device

In [8]:
device=gpu_check()
print(device)

There are 1 GPU(s) avalilable.
We will use the GPU: Tesla T4
None


In [9]:
label2id={'negetive':0,'positive':1}
id2label={0:'negetive',1:'positive'}

In [18]:
modelname="HooshvareLab/bert-base-parsbert-uncased"
tokenizer=AutoTokenizer.from_pretrained(modelname,model_max_length=100, add_special_tokens = True, add_prefix_space=True)
model=AutoModelForSequenceClassification.from_pretrained(
    modelname,num_labels=2,id2label=id2label,label2id=label2id
).to(device)
data_collector = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import precision_recall_fscore_support,accuracy_score

In [12]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
def preprocess_function(examples):
  return tokenizer(examples["text"],truncation=True)
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return metric.compute(predictions=preds, references=labels)

In [14]:
tokenized_data=dts.map(preprocess_function,batched=True)

Map:   0%|          | 0/55852 [00:00<?, ? examples/s]

Map:   0%|          | 0/13964 [00:00<?, ? examples/s]

Injecting LoRA to the BERT model

In [15]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=4, lora_alpha=1, lora_dropout=0.1
)

In [16]:
from peft import get_peft_model
model = get_peft_model(model, lora_config)

In [17]:
training_args = TrainingArguments(
    output_dir="Persian-Text-Sentiment-Bert-LORA/",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy ="epoch",
    save_strategy ="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

In [19]:
trainer =Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=data_collector,
    compute_metrics=compute_metrics
)

In [20]:
print("شروع آموزش...")
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


شروع آموزش...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3085,0.331716,0.871813
2,0.2696,0.387864,0.873245
3,0.2138,0.408799,0.869951
4,0.1895,0.557192,0.865726
5,0.1379,0.633209,0.859711
6,0.0983,0.855796,0.857777
7,0.0697,1.037483,0.859209
8,0.0546,1.053442,0.862718
9,0.0287,1.19987,0.856918
10,0.0168,1.248142,0.860069


TrainOutput(global_step=34910, training_loss=0.14014070415797325, metrics={'train_runtime': 8498.5564, 'train_samples_per_second': 65.719, 'train_steps_per_second': 4.108, 'total_flos': 1.87492424160756e+16, 'train_loss': 0.14014070415797325, 'epoch': 10.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.3317162096500397,
 'eval_accuracy': 0.8718132340303638,
 'eval_runtime': 52.8098,
 'eval_samples_per_second': 264.421,
 'eval_steps_per_second': 16.531,
 'epoch': 10.0}

In [22]:
trainer.save_model('sentiment')

In [23]:
!zip -r emotion_bert_model.zip 'sentiment'

  adding: sentiment/ (stored 0%)
  adding: sentiment/model.safetensors (deflated 8%)
  adding: sentiment/config.json (deflated 50%)
  adding: sentiment/training_args.bin (deflated 51%)


In [24]:
!pip install huggingface_hub



In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [None]:
trainer.push_to_hub('SeyedAli/Persian-Text-Emotion-Bert-LORA')

adapter_model.safetensors:   0%|          | 0.00/160k [00:00<?, ?B/s]

events.out.tfevents.1699697696.36d742872af9.1859.0:   0%|          | 0.00/5.79k [00:00<?, ?B/s]

events.out.tfevents.1699698648.36d742872af9.1859.1:   0%|          | 0.00/5.79k [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

events.out.tfevents.1699699938.36d742872af9.1859.3:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

events.out.tfevents.1699699377.36d742872af9.1859.2:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

events.out.tfevents.1699699972.36d742872af9.1859.4:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

events.out.tfevents.1699699984.36d742872af9.1859.5:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

events.out.tfevents.1699705059.36d742872af9.1859.6:   0%|          | 0.00/417 [00:00<?, ?B/s]

'https://huggingface.co/SeyedAli/Persian-Text-Sentiment-Bert-LORA/tree/main/'

In [None]:
modelname="SeyedAli/Persian-Text-Sentiment-Bert-LORA"
tokenizer=AutoTokenizer.from_pretrained(modelname,model_max_length=100, add_special_tokens = True)
model=AutoModelForSequenceClassification.from_pretrained(
    modelname,num_labels=2,id2label=id2label,label2id=label2id
).to(device)
data_collector = DataCollatorWithPadding(tokenizer=tokenizer)

OSError: ignored