In [None]:
!pip install sentence-transformers transformers beautifulsoup4 nltk datasets accelerate

In [None]:
import requests, re, time, pandas as pd
from bs4 import BeautifulSoup
import torch, torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, r2_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import stopwords
import nltk

# Download only what's needed (avoid broken punkt_tab)
nltk.download("stopwords")

# Arabic stopwords
arabic_stopwords = set(stopwords.words("arabic"))

# Semantic sentence model
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
reference = "Ÿáÿ∞ÿß ÿßŸÑŸÜÿµ Ÿäÿ™ÿ≠ÿØÿ´ ÿπŸÜ ÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÅŸä ÿßŸÑŸàÿ∑ŸÜ ÿßŸÑÿπÿ±ÿ®Ÿä"
ref_embedding = sbert_model.encode(reference, convert_to_tensor=True)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
def get_article_links(base_url, pages=2):
    links = []
    for page in range(1, pages + 1):
        url = f"{base_url}?page={page}"
        res = requests.get(url)
        soup = BeautifulSoup(res.content, "html.parser")
        for a in soup.find_all("a", href=True):
            if "/news/" in a["href"] and not a["href"].startswith("https"):
                links.append("https://www.aljazeera.net" + a["href"])
    return list(set(links))

def extract_text(url):
    try:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        return " ".join(p.get_text() for p in soup.find_all("p")).strip()
    except:
        return ""


In [4]:
def semantic_score(text):
    emb = sbert_model.encode(text, convert_to_tensor=True)
    sim = util.cos_sim(emb, ref_embedding).item()
    return round(max(0.0, min(sim * 10, 10.0)), 2)

def preprocess(text):
    text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  # Keep Arabic
    tokens = text.split()  # Simple whitespace tokenizer
    tokens = [t for t in tokens if t not in arabic_stopwords and len(t) > 2]
    return tokens


In [5]:
def scrape_articles():
    links = get_article_links("https://www.aljazeera.net/news/")
    data = []
    for link in links[:20]:
        text = extract_text(link)
        if len(text.split()) > 50:
            score = semantic_score(text)
            data.append({"Text": text, "Score": score})
    df = pd.DataFrame(data)
    df["tokens"] = df["Text"].apply(preprocess)
    df.to_csv("arabic_dataset.csv", index=False, encoding="utf-8-sig")
    return df




In [None]:
data = scrape_articles()
data["tokens"][0]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
data["input_ids"] = data["tokens"].apply(lambda x: tokenizer.encode(" ".join(x), padding="max_length", max_length=100, truncation=True))

X_train, X_test, y_train, y_test = train_test_split(data["input_ids"].tolist(), data["Score"].tolist(), test_size=0.2)

class ArabicDataset(Dataset):
    def __init__(self, encodings, scores):
        self.encodings = encodings
        self.scores = scores
    def __len__(self): return len(self.encodings)
    def __getitem__(self, idx): return torch.tensor(self.encodings[idx]), torch.tensor(self.scores[idx], dtype=torch.float)

train_loader = DataLoader(ArabicDataset(X_train, y_train), batch_size=2)
test_loader = DataLoader(ArabicDataset(X_test, y_test), batch_size=2)


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
class RNNModel(nn.Module):
    def __init__(self, rnn_type, vocab_size, embed_dim=128, hidden_dim=64, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        rnn_cls = {"rnn": nn.RNN, "gru": nn.GRU, "lstm": nn.LSTM}[rnn_type]
        self.rnn = rnn_cls(embed_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)
        self.bidirectional = bidirectional
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), 1)

    def forward(self, x):
        x = self.embedding(x)
        out, h = self.rnn(x)

        # Handle LSTM hidden state tuple (h, c)
        if isinstance(h, tuple):
            h = h[0]

        # Bidirectional: concatenate forward and backward hidden states
        if self.bidirectional:
            h_out = torch.cat((h[0], h[1]), dim=1)  # shape [batch, hidden*2]
        else:
            h_out = h[-1]  # shape [batch, hidden]

        return self.fc(h_out).squeeze()


In [9]:
def train_and_evaluate(rnn_type, bidirectional=False):
    print(f"\n==> Training {rnn_type.upper()} {'Bi' if bidirectional else ''}RNN")
    model = RNNModel(rnn_type, tokenizer.vocab_size, bidirectional=bidirectional).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    for epoch in range(3):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = loss_fn(preds.view(-1), y_batch.view(-1))  # Fix shape mismatch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_pred = model(X_batch)
            preds.extend(y_pred.cpu().view(-1).numpy())
            actuals.extend(y_batch.numpy())

    print("MSE:", round(mean_squared_error(actuals, preds), 4))
    print("R¬≤:", round(r2_score(actuals, preds), 4))

    # BLEU evaluation with rounding
    bleu_scores = [
        sentence_bleu([[str(round(act, 1))]], str(round(pred, 1)),
                      smoothing_function=SmoothingFunction().method4)
        for pred, act in zip(preds, actuals)
    ]
    print("BLEU Score:", round(sum(bleu_scores) / len(bleu_scores), 4))


In [10]:
train_and_evaluate("rnn")
train_and_evaluate("gru")
train_and_evaluate("lstm")
train_and_evaluate("lstm", bidirectional=True)  # BiLSTM



==> Training RNN RNN
MSE: 0.6279
R¬≤: -1.6907
BLEU Score: 0.0

==> Training GRU RNN
MSE: 1.5393
R¬≤: -5.5962
BLEU Score: 0.0

==> Training LSTM RNN
MSE: 5.7165
R¬≤: -23.4969
BLEU Score: 0.0

==> Training LSTM BiRNN
MSE: 2.0246
R¬≤: -7.6761
BLEU Score: 0.0


In [11]:
print("\n--- Arabic Text Generation with GPT-2 ---")

gpt2_tokenizer = GPT2Tokenizer.from_pretrained("aubmindlab/aragpt2-base")
gpt2_model = GPT2LMHeadModel.from_pretrained("aubmindlab/aragpt2-base").to(device)
gpt2_model.eval()

prompt = "ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ÿ≥Ÿäÿ≥ÿßŸáŸÖ ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑÿ™ÿπŸÑŸäŸÖ"
inputs = gpt2_tokenizer(prompt, return_tensors="pt").to(device)

generated = gpt2_model.generate(
    inputs["input_ids"],
    max_length=80,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    pad_token_id=gpt2_tokenizer.eos_token_id
)

print("\nGenerated Paragraph:\n")
print(gpt2_tokenizer.decode(generated[0], skip_special_tokens=True))



--- Arabic Text Generation with GPT-2 ---


vocab.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Paragraph:

ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ÿ≥Ÿäÿ≥ÿßŸáŸÖ ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÅŸä ÿßŸÑÿ£ÿ±ÿØŸÜ " .Ÿàÿ£ÿ∂ÿßŸÅ ÿ£ŸÜ " ŸáŸÜÿßŸÉ ÿ™ÿ≠ÿØŸäÿßÿ™ ÿ£ÿÆÿ±Ÿâ ÿ™Ÿàÿßÿ¨Ÿá ÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÅŸä ÿßŸÑÿ£ÿ±ÿØŸÜ ÿå ŸÖŸÜŸáÿß ŸÜŸÇÿµ ÿßŸÑŸÉŸàÿßÿØÿ± ÿßŸÑÿ®ÿ¥ÿ±Ÿäÿ© ÿßŸÑŸÖÿ§ŸáŸÑÿ© ÿå ŸàÿπÿØŸÖ Ÿàÿ¨ŸàÿØ ÿÆÿ∑ÿ∑ Ÿàÿßÿ∂ÿ≠ÿ© ŸÅŸä ŸÖÿ¨ÿßŸÑ ÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿå ŸàÿπÿØŸÖ Ÿàÿ¨ŸàÿØ ÿÆÿ∑ÿ© Ÿàÿßÿ∂ÿ≠ÿ© ŸÅŸä ŸÖÿ¨ÿßŸÑ ÿßŸÑÿ™ÿπŸÑŸäŸÖ " .Ÿàÿ£Ÿàÿ∂ÿ≠ ÿ£ŸÜ " Ÿàÿ¨ŸàÿØ ÿ™ÿπŸÑŸäŸÖ ŸÖÿ±ÿ™ÿ®ÿ∑ ÿ®ÿßŸÑÿ∑ŸÑÿ®ÿ© ÿå ÿ≥Ÿäÿ§ÿØŸä ÿ•ŸÑŸâ ÿ≤ŸäÿßÿØÿ© ŸÖÿπÿØŸÑÿßÿ™ ÿßŸÑÿ™ÿ≥ÿ±ÿ® ÿßŸÑŸÖÿØÿ±ÿ≥Ÿä ÿå Ÿàÿ®ÿßŸÑÿ™ÿßŸÑŸä ÿ•ŸÑŸâ ÿ™ŸÇŸÑŸäŸÑ ŸÜÿ≥ÿ®ÿ© ÿßŸÑÿ™ÿ≥ÿ±ÿ® ÿßŸÑŸÖÿØÿ±ÿ≥Ÿä ÿå ŸàŸÉÿ∞ŸÑŸÉ ÿ≤ŸäÿßÿØÿ© ÿπÿØÿØ ÿßŸÑÿ∑ŸÑÿ®ÿ© ÿå Ÿàÿ®ÿßŸÑÿ™ÿßŸÑŸä ÿ≤ŸäÿßÿØÿ© ŸÜÿ≥ÿ®ÿ© ÿßŸÑÿßŸÑÿ™ÿ≠ÿßŸÇ ÿ®ÿßŸÑŸÖÿØÿßÿ±ÿ≥


In [12]:
generated = gpt2_model.generate(
    inputs["input_ids"],
    max_length=80,
    num_return_sequences=3,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    pad_token_id=gpt2_tokenizer.eos_token_id
)
for i, sample in enumerate(generated):
    print(f"\nSample {i+1}:\n{gpt2_tokenizer.decode(sample, skip_special_tokens=True)}")



Sample 1:
ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ÿ≥Ÿäÿ≥ÿßŸáŸÖ ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿπŸÜ ÿ®ÿπÿØ ÿå Ÿàÿ≥Ÿäÿ™ŸÖŸÉŸÜ Ÿáÿ∞ÿß ÿßŸÑŸÜÿ∏ÿßŸÖ ŸÖŸÜ ÿ•ŸÜÿ¥ÿßÿ° ÿØŸàÿ±ÿßÿ™ ÿ™ÿπŸÑŸäŸÖŸäÿ© ÿ¨ÿØŸäÿØÿ© ÿπÿ®ÿ± ÿßŸÑÿ•ŸÜÿ™ÿ±ŸÜÿ™ ÿå ŸÅŸä ÿßŸÑŸàŸÇÿ™ ŸÜŸÅÿ≥Ÿá ÿ≥Ÿäÿ≥ÿßŸáŸÖ ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑŸÖÿØÿßÿ±ÿ≥ .ŸàŸÉÿßŸÜ ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ŸÇÿØ ÿ®ÿØÿ£ ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ŸÜÿ∏ÿßŸÖ ÿ∞ŸÉÿßÿ° ÿßÿµÿ∑ŸÜÿßÿπŸä ŸÖÿÆÿµÿµ ŸÑÿ•ÿØÿßÿ±ÿ© ÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿπŸÜ ÿ®ÿπÿØ ÿå ŸàŸÇÿØ ÿ®ÿØÿ£ ÿ®ÿ™ÿ∑ŸàŸäÿ±Ÿá ŸÅŸä ÿπÿßŸÖ 2013 .

Sample 2:
ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ÿ≥Ÿäÿ≥ÿßŸáŸÖ ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿßŸÑÿπÿßŸÑŸä ŸÅŸä ÿßŸÑÿ£ÿ±ÿØŸÜ ÿ®ÿ¥ŸÉŸÑ ÿπÿßŸÖ ŸàÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿßŸÑÿπÿßŸÑŸä ÿ®Ÿàÿ¨Ÿá ÿÆÿßÿµ .

Sample 3:
ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ÿ≥Ÿäÿ≥ÿßŸáŸÖ ŸÅŸä ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑÿ™ÿπŸÑŸäŸÖ Ÿàÿ±ŸÅÿπ ŸÉŸÅÿßÿ°ÿ™Ÿá ÿå ÿÆÿµŸàÿµÿß ŸÅŸä ÿ∏ŸÑ ŸÖÿß ÿ™ÿ¥ŸáÿØŸá ÿßŸÑÿπÿØŸäÿØ ŸÖŸÜ ÿßŸÑÿ®ŸÑÿØÿßŸÜ ŸÖŸÜ ÿßŸÜÿ™ÿ¥ÿßÿ± ÿßŸÑÿ™ŸÇŸÜŸäÿßÿ™ ÿßŸÑÿ≠ÿØŸäÿ´ÿ© ÿå ÿßŸÑÿ£ŸÖÿ± ÿßŸÑÿ∞Ÿä Ÿäÿ≥ÿ™ÿØÿπŸä Ÿàÿ∂ÿπ ÿßÿ≥ÿ™ÿ±ÿßÿ™Ÿäÿ¨Ÿäÿ© Ÿ

# Fine Tuning

In [13]:
df = pd.read_csv("arabic_dataset.csv")
with open("gpt2_arabic_train.txt", "w", encoding="utf-8") as f:
    for line in df["Text"]:
        f.write(line.strip() + "\n")


In [14]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling

model_name = "aubmindlab/aragpt2-base"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 has no pad token

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="gpt2_arabic_train.txt",
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Language modeling, not masked
)



In [15]:
model = GPT2LMHeadModel.from_pretrained(model_name)

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_arabic_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlegendsit1234[0m ([33mlegendsit1234-university-abdelmalek-essaadi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=75, training_loss=6.208026529947917, metrics={'train_runtime': 97.6374, 'train_samples_per_second': 1.506, 'train_steps_per_second': 0.768, 'total_flos': 9602482176000.0, 'train_loss': 6.208026529947917, 'epoch': 3.0})

In [17]:
model.save_pretrained("./gpt2_arabic_finetuned")
tokenizer.save_pretrained("./gpt2_arabic_finetuned")

('./gpt2_arabic_finetuned/tokenizer_config.json',
 './gpt2_arabic_finetuned/special_tokens_map.json',
 './gpt2_arabic_finetuned/vocab.json',
 './gpt2_arabic_finetuned/merges.txt',
 './gpt2_arabic_finetuned/added_tokens.json')

In [18]:
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2_arabic_finetuned", tokenizer=tokenizer)

prompt = "ÿ£ŸáŸÖŸäÿ© ÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÅŸä ÿßŸÑÿπÿµÿ± ÿßŸÑÿ≠ÿØŸäÿ´"
outputs = generator(prompt, max_length=100, num_return_sequences=1)

print(outputs[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ÿ£ŸáŸÖŸäÿ© ÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÅŸä ÿßŸÑÿπÿµÿ± ÿßŸÑÿ≠ÿØŸäÿ´ ÿå ŸàŸÅŸä ÿ∏ŸÑ ŸÖÿß Ÿäÿ¥ŸáÿØŸá ÿßŸÑÿπÿßŸÑŸÖ ŸÖŸÜ ÿ™ÿ∫Ÿäÿ±ÿßÿ™ ÿπŸÖŸäŸÇÿ© ÿπŸÑŸâ ŸÖÿÆÿ™ŸÑŸÅ ÿßŸÑÿµÿπÿØ ÿßŸÑÿ≥Ÿäÿßÿ≥Ÿäÿ© ŸàÿßŸÑÿßŸÇÿ™ÿµÿßÿØŸäÿ© ŸàÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© ŸàÿßŸÑÿ´ŸÇÿßŸÅŸäÿ©ÿå ŸÅÿ∂ŸÑÿß ÿπŸÜ ÿßŸÑÿ™ÿ≠ÿØŸäÿßÿ™ ÿßŸÑÿ™Ÿä ÿ™Ÿàÿßÿ¨Ÿá ÿßŸÑŸÖÿ¨ÿ™ŸÖÿπ ÿßŸÑÿØŸàŸÑŸä ÿ®ÿ£ÿ≥ÿ±Ÿáÿå ÿÆÿßÿµÿ© ŸÅŸäŸÖÿß Ÿäÿ™ÿπŸÑŸÇ ÿ®ŸÇÿ∂ÿßŸäÿß ÿßŸÑŸÑÿßÿ¨ÿ¶ŸäŸÜ ÿßŸÑŸÅŸÑÿ≥ÿ∑ŸäŸÜŸäŸäŸÜÿå ÿßŸÑÿ∞ŸäŸÜ ŸäÿπŸäÿ¥ŸàŸÜ ŸÅŸä ÿßŸÑÿ£ÿ±ÿßÿ∂Ÿä ÿßŸÑŸÖÿ≠ÿ™ŸÑÿ© ŸÖŸÜÿ∞ ÿπÿßŸÖ 1948ÿå ÿ•ÿ∞ ŸÑÿß ÿ™ÿ≤ÿßŸÑ ÿ£ÿπÿØÿßÿØŸáŸÖ ÿ™ÿ™ÿ≤ÿßŸäÿØÿå ÿ•ŸÑÿß ÿ£ŸÜ ŸáŸÜÿßŸÉ ÿ™ÿ≠ÿØŸäÿßÿ™ ŸÉÿ®Ÿäÿ±ÿ© ÿ™Ÿàÿßÿ¨ŸáŸáŸÖÿå ÿ™ÿ™ŸÖÿ´ŸÑ ŸÅŸä ÿßÿ≥ÿ™ŸÖÿ±ÿßÿ± ÿßŸÑÿßÿ≠ÿ™ŸÑÿßŸÑ ÿßŸÑÿ•ÿ≥ÿ±ÿßÿ¶ŸäŸÑŸä ŸÅŸä ÿßŸÜÿ™ŸáÿßŸÉ ÿßŸÑŸÇÿßŸÜŸàŸÜ ÿßŸÑÿØŸàŸÑŸä ÿßŸÑÿ•ŸÜÿ≥ÿßŸÜŸäÿå ŸàÿßŸÜÿ™ŸáÿßŸÉ ÿ≠ŸÇŸàŸÇ ÿßŸÑÿ•ŸÜÿ≥ÿßŸÜÿå ÿ•ÿ∂ÿßŸÅÿ© ÿ•ŸÑŸâ ÿßŸÑÿßŸÜÿ™ŸáÿßŸÉÿßÿ™ ÿßŸÑÿ¨ÿ≥ŸäŸÖÿ© ŸÑÿ≠ŸÇŸàŸÇ ÿßŸÑÿ•ŸÜÿ≥ÿßŸÜ ÿßŸÑÿ£ÿÆÿ±Ÿâÿå ÿ®ŸÖÿß ŸÅŸäŸáÿß ÿ¨ÿ±ÿßÿ¶ŸÖ ÿßŸÑÿ≠ÿ±ÿ® ŸàÿßŸÑÿ¨ÿ±ÿßÿ¶ŸÖ ÿ∂ÿØ


In [19]:
outputs = generator(prompt, max_length=120, num_return_sequences=3, temperature=0.9, top_p=0.95)
for i, out in enumerate(outputs):
    print(f"\nSample {i+1}:\n{out['generated_text']}")



Sample 1:
ÿ£ŸáŸÖŸäÿ© ÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÅŸä ÿßŸÑÿπÿµÿ± ÿßŸÑÿ≠ÿØŸäÿ´ÿå Ÿàÿ∞ŸÑŸÉ ŸÖŸÜ ÿ£ÿ¨ŸÑ ÿ£ŸÜ ÿ™ŸÉŸàŸÜ ÿßŸÑŸÖŸÜÿßŸáÿ¨ ÿßŸÑÿ™ÿπŸÑŸäŸÖŸäÿ© ÿπŸÑŸâ ŸÖÿ≥ÿ™ŸàŸâ ÿπÿßŸÑ ŸÖŸÜ ÿßŸÑÿ¨ŸàÿØÿ© ÿå Ÿàÿ£ŸÜ ŸäŸÉŸàŸÜ ŸÑŸáÿß ÿØŸàÿ± ŸÅÿπÿßŸÑ ŸÅŸä ÿ™ÿ≠ŸÇŸäŸÇ ÿ£ŸáÿØÿßŸÅ ÿßŸÑÿ™ŸÜŸÖŸäÿ© ÿßŸÑÿßŸÇÿ™ÿµÿßÿØŸäÿ© ŸàÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ©ÿå ÿ®ÿßŸÑÿ•ÿ∂ÿßŸÅÿ© ÿ•ŸÑŸâ ÿ£ŸáŸÖŸäÿ© ÿßŸÑÿØŸàÿ± ÿßŸÑÿ∞Ÿä ÿ™ŸÇŸàŸÖ ÿ®Ÿá Ÿàÿ≤ÿßÿ±ÿ© ÿßŸÑÿ™ÿ±ÿ®Ÿäÿ© ŸàÿßŸÑÿ™ÿπŸÑŸäŸÖ ŸÖŸÖÿ´ŸÑÿ© ŸÅŸä ÿßŸÑŸÖÿØŸäÿ±Ÿäÿ© ÿßŸÑÿπÿßŸÖÿ© ŸÑŸÑÿ™ÿ±ÿ®Ÿäÿ© ŸàÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿ®ŸÖÿ≠ÿßŸÅÿ∏ÿ© ÿ¨ŸÜŸàÿ® ÿ≥ŸäŸÜÿßÿ°ÿå ÿ≠Ÿäÿ´ ÿ™ÿπŸÖŸÑ ÿßŸÑŸàÿ≤ÿßÿ±ÿ© ÿ®ÿßŸÑÿ™ŸÜÿ≥ŸäŸÇ ŸÖÿπ ŸÖÿØŸäÿ±Ÿäÿ© ÿßŸÑÿ™ÿ±ÿ®Ÿäÿ© ŸàÿßŸÑÿ™ÿπŸÑŸäŸÖ ÿ®ÿßŸÑŸÖÿ≠ÿßŸÅÿ∏ÿ© ÿπŸÑŸâ ÿ•ÿπÿØÿßÿØ ŸÖŸÜÿßŸáÿ¨ ÿ™ÿπŸÑŸäŸÖŸäÿ© ÿ™ÿ™ŸÜÿßÿ≥ÿ® ŸÖÿπ ÿßÿ≠ÿ™Ÿäÿßÿ¨ÿßÿ™ ÿßŸÑÿ∑ŸÑÿßÿ® Ÿàÿ£ŸàŸÑŸäÿßÿ° ÿßŸÑÿ£ŸÖŸàÿ± Ÿàÿßÿ≠ÿ™Ÿäÿßÿ¨ÿßÿ™ŸáŸÖ ÿßŸÑŸÖÿ≥ÿ™ŸÇÿ®ŸÑŸäÿ©ÿå ŸÉŸÖÿß ÿ™ÿ≥ÿπŸâ ÿßŸÑŸàÿ≤ÿßÿ±ÿ© ÿ•ŸÑŸâ ÿ™ÿ∑ŸàŸäÿ± ÿßŸÑŸÖŸÜÿßŸáÿ¨ ÿßŸÑÿ™ÿπŸÑŸäŸÖŸäÿ© ÿ®ŸÖÿß Ÿäÿ™ŸÜÿßÿ≥ÿ® ŸÖÿπ ŸÖÿ™ÿ∑ŸÑÿ®ÿßÿ™ ÿßŸÑŸÖÿ±ÿ≠ŸÑÿ© ÿßŸÑÿ±ÿßŸáŸÜÿ© ŸàÿßŸÑŸÖÿ≥ÿ™ŸÇÿ®ŸÑŸäÿ©ÿå Ÿà