In [1]:
from transformers import pipeline

# Load the GPT-2 model
generator = pipeline('text-generation', model='gpt2')


2024-07-25 12:16:19.957482: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-25 12:16:19.973156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 12:16:19.990235: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 12:16:19.995428: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-25 12:16:20.009272: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Generate text based on a prompt
prompt = "Once upon a time"
generated_text = generator(prompt, max_length=50, num_return_sequences=1)
print(generated_text[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, as much as anyone, it occurred to me that such things are quite strange, and I should also like to confess that the idea of my being able to see the whole of the whole, and to know the whole without having


In [3]:
from datasets import load_dataset
from transformers import BertTokenizer

# Load a dataset (e.g., IMDB reviews for sentiment analysis)
dataset = load_dataset('imdb')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [4]:
from transformers import DataCollatorWithPadding

# Use data collator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create train and test datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(1000))
test_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(1000))


In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.324677
2,No log,0.425153
3,No log,0.410448


TrainOutput(global_step=375, training_loss=0.2670516357421875, metrics={'train_runtime': 4646.2696, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.081, 'total_flos': 789333166080000.0, 'train_loss': 0.2670516357421875, 'epoch': 3.0})

In [8]:
# Evaluate the model
results = trainer.evaluate()
print(results)


{'eval_loss': 0.4104481339454651, 'eval_runtime': 353.2942, 'eval_samples_per_second': 2.831, 'eval_steps_per_second': 0.354, 'epoch': 3.0}


In [9]:
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')


('./fine-tuned-bert/tokenizer_config.json',
 './fine-tuned-bert/special_tokens_map.json',
 './fine-tuned-bert/vocab.txt',
 './fine-tuned-bert/added_tokens.json')

In [10]:
from transformers import pipeline

# Load the fine-tuned model
sentiment_model = pipeline('sentiment-analysis', model='./fine-tuned-bert')

# Test the model
result = sentiment_model("I love this movie!")
print(result)


[{'label': 'LABEL_1', 'score': 0.9937484264373779}]
