#Text Generation in LLms Using HuggingFace

In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# load GPT-2 model and tokenizer

In [7]:
tokenizer= AutoTokenizer.from_pretrained('gpt2')
model= AutoModelForCausalLM.from_pretrained('gpt2')     #میره از سایت هاگینگ فیس میگرده و اون مدل رو طبق اسمی که دادیم پیدا و دان مینکه


#generate Text using GPT-2
def generate_text(prompt, max_length=150, num_return_sequences=1):
  inputs= tokenizer(prompt,return_tensors='pt')
  outputs=model.generate(inputs['input_ids'],
                         max_length=max_length,
                         num_return_sequences=num_return_sequences,
                         no_repeat_ngram_size=2,
                         #top_k=50,
                         #top_p=0.95,
                         temperature=0.1,      #یک عدد بین صفر یک که مثلا میگه چقد خلاقانه جواب بده
                         do_sample=True)
  return [tokenizer.decode(output,skip_special_tokens=True) for output in outputs]


#Example prompt for text generation
prompt="In a future where AI has taken over the world,"                         #تو اینده ای که هوش مصنوعی دنیارو گرفته ... ادامش رو باید تولید کنه
generated_texts= generate_text(prompt=prompt)

for index,text in enumerate(generated_texts):
  print(f"Generated Text {index+1}:\n{text}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text 1:
In a future where AI has taken over the world, it's not clear how much of a threat it is. But it could be that it will be a lot more than just a few years away.

"It's a very big problem," says Dr. David H. Hirsch, a professor of computer science at the University of California, Berkeley. "It could become a major threat to the entire world."
. . .
 (The original version of this article was published in the May 2015 issue of the journal Nature.)



In [9]:
#Example prompt for text generation
prompt="In a future where AI has taken over the world,"
generated_texts= generate_text(prompt=prompt)

print(generated_texts)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['In a future where AI has taken over the world, it will be possible to create a world where humans are not only able to do things, but also to make decisions.\n\n"We are going to have to see how we can make that happen," said Dr. Mark B. Bowers, a professor of computer science at the University of California, Berkeley. "We\'re going back to the days of the computer. We\'re not going away. It\'s going on."\n.']


In [10]:
inputs= tokenizer(prompt,return_tensors='pt')
inputs

{'input_ids': tensor([[ 818,  257, 2003,  810, 9552,  468, 2077,  625,  262,  995,   11]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

#Text Generation using T5

In [13]:
from transformers import T5Tokenizer,T5ForConditionalGeneration

#load pre-trained T5 model and tokenizer
t5_tokenizer= T5Tokenizer.from_pretrained('t5-base')
t5_model=T5ForConditionalGeneration.from_pretrained('t5-base')


#generate Text using T5
def generate_t5_text(prompt, max_length=50, num_return_sequences=1):
  inputs= t5_tokenizer(prompt,return_tensors='pt')
  outputs=t5_model.generate(inputs['input_ids'],
                         max_length=max_length,
                         num_return_sequences=num_return_sequences,
                         no_repeat_ngram_size=2,
                         #top_k=50,
                         #top_p=0.95,
                         temperature=0.1,      #یک عدد بین صفر یک که مثلا میگه چقد خلاقانه جواب بده
                         do_sample=True)
  return [t5_tokenizer.decode(output,skip_special_tokens=True) for output in outputs]

In [16]:
prompt_t5="""translate: Water is an inorganic compound with the chemical formular H20.It is a transparent,tasteless,odorless,"""

generate_t5_text= generate_t5_text(prompt_t5)

for index,text in enumerate(generate_t5_text):
  print(f"Generated Text {index+1}:\n{text}\n")

Generated Text 1:
Wasser ist eine anorganische Verbindung mit dem chemischen Formular H20.It is a transparent,tasteless,odorless.



به المانی ترجمش کرد

#Fine-tuning GPT-2 on a custom Dataset

In [None]:
!pip install transformers datasets

from transformers import GPT2Tokenizer,GPT2LMHeadModel,Trainer, TrainingArguments
from datasets import load_dataset


dataset= load_dataset("wikitext","wikitext-2-raw-v1")

tokenizer= GPT2Tokenizer.from_pretrained('gp2')

tokenizer.pad_token= tokenizer.SPECIAL_TOKENS_ATTRIBUTES

model= GPT2LMHeadModel.from_pretrained('gpt2')

def tokenize_function(examples):
  tokens= tokenizer(examples['text'],truncation=True,padding="max_length",max_length=128)
  tokens['labels']= tokens['input_ids'].copy()
  return tokens


tokenized_datasets=dataset.map(tokenize_function,batched=True)


training_args=TrainingArguments(output_dir="./gpt2-finetuned",
                                per_device_train_batch_size=2,
                                num_train_epochs=1,
                                logging_dir="./logs",
                                prediction_loss_only=True)

trainer= Trainer(model=model,
                 args=training_args
                 train_dataset=tokenized_datasets['train'],
                 eval_dataset=tokenized_datasets["validation"])


trainer.train()

model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

#مدل های گیتید

In [17]:
import os
os.environ["HUGGING_FACE_HUB_TOKEN"]= ""        #لینک اکسس توکن در سایت در سایت هاگینگ فیس

In [18]:
import torch
from transformers import pipeline

pipe=pipeline("text-generation",
              model="google/gemma-2-2b",    #2b= 2میلیارد پارامتر
              device="cuda")


text="In a future where AI has taken over the world,"
outputs=pipe(text,max_new_tokens=256)
response=outputs[0]["generated-text"]
print(response)

In [None]:
text="Capital of India is "
outputs=pipe(text,max_new_tokens=32)
response=outputs[0]["generated-text"]
print(response)