In [None]:
!pip install gradio transformers


Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

# **Dataset Processing and Model Implementation**

In [None]:
import os
import pandas as pd
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from sklearn.model_selection import train_test_split
import math

# 1. Load data
df = pd.read_csv("merged_dataset.csv")

# Collect every non-null value of every text (object-dtype) column as
# individual lines. Values containing embedded newlines are split into several
# lines, and blank lines are dropped so that no empty samples end up in the
# train/eval split.
text_lines = []
for col in df.select_dtypes(include="object").columns:
    for value in df[col].dropna().astype(str):
        text_lines.extend(line for line in value.split("\n") if line.strip())

# Kept for any later cell that expects the combined corpus as one string.
all_text = "\n".join(text_lines)

# 2. Split data into training and validation sets (90% / 10%, fixed seed)
train_text, eval_text = train_test_split(text_lines, test_size=0.1, random_state=42)

# Save the two splits as plain-text files, one sample per line
with open("train.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(train_text))
with open("eval.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(eval_text))

# 3. Load the pretrained "gpt2" checkpoint: weights first, then its tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# 4. Load and organize data into training format
def load_dataset(tokenizer, file_path, block_size=128, overwrite_cache=True):
    """Build a ``TextDataset`` of fixed-size token blocks from a text file.

    Args:
        tokenizer: Tokenizer used to encode the file contents.
        file_path: Path of the plain-text file to load.
        block_size: Number of tokens per training example.
        overwrite_cache: Forwarded to ``TextDataset``. Defaults to ``True``
            because this notebook rewrites ``train.txt``/``eval.txt`` on every
            run; reusing a previously cached feature file could silently feed
            stale tokens to the trainer. Pass ``False`` to reuse the cache.

    Returns:
        The ``TextDataset`` instance for ``file_path``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

# Tokenize both splits with the shared helper (training file first, then eval)
train_dataset, eval_dataset = (
    load_dataset(tokenizer, split_file) for split_file in ("train.txt", "eval.txt")
)
# mlm=False selects causal (next-token) language modeling rather than masked LM
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# 5. Training configuration
training_args = TrainingArguments(
    # --- optimisation schedule ---
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # --- checkpointing: write every 50 steps, keep only the newest one ---
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    save_steps=50,
    save_total_limit=1,
    # --- logging: step 1, then every 10 steps; no external tracker ---
    logging_first_step=True,
    logging_steps=10,
    report_to="none",
)

# 6. Define perplexity calculation function
def compute_perplexity(eval_loss):
    """Convert a mean cross-entropy loss into perplexity.

    Args:
        eval_loss: Loss value (natural-log based) as a real number.

    Returns:
        ``exp(eval_loss)`` as a float, or ``inf`` when the loss is so large
        that the exponential does not fit in a float (e.g. a diverged run),
        instead of raising ``OverflowError``.
    """
    try:
        return math.exp(eval_loss)
    except OverflowError:
        return float("inf")

# 7. Create Trainer
class CustomTrainer(Trainer):
    """``Trainer`` that reports perplexity next to every evaluation loss.

    Perplexity is derived from the loss that ``Trainer.evaluate`` already
    computes. A ``compute_metrics`` *method* is deliberately not used:
    ``Trainer.__init__`` stores its ``compute_metrics`` constructor argument
    (``None`` unless supplied) as an instance attribute, which shadows a method
    of the same name, so such a method is never invoked. Averaging raw logits
    is also not a loss, so the value it produced would not be meaningful.
    """

    def evaluate(self, *args, **kwargs):
        """Run the standard evaluation and add ``<prefix>_perplexity`` metrics.

        Accepts and forwards the same arguments as ``Trainer.evaluate``.

        Returns:
            The metrics dict from ``Trainer.evaluate`` with one additional
            ``..._perplexity`` entry for each ``..._loss`` entry it contains.
        """
        metrics = super().evaluate(*args, **kwargs)
        # Snapshot the keys first: the dict is modified inside the loop.
        for loss_key in [key for key in metrics if key.endswith("_loss")]:
            ppl_key = loss_key[: -len("loss")] + "perplexity"
            try:
                metrics[ppl_key] = compute_perplexity(metrics[loss_key])
            except OverflowError:
                # exp() of a very large (diverged) loss does not fit in a float.
                metrics[ppl_key] = float("inf")
        return metrics

# Wire the model, datasets and collator into the trainer
trainer = CustomTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# 8. Run fine-tuning
trainer.train()

# 9. Persist the fine-tuned weights and the tokenizer side by side so the
#    directory can be loaded later as a self-contained checkpoint
model_path1 = "./gpt2-finetuned"
trainer.save_model(model_path1)
tokenizer.save_pretrained(model_path1)

# 10. Evaluate on the held-out split and report loss and perplexity
eval_results = trainer.evaluate()
print(
    "\n📊 Evaluation Results:",
    f"Loss: {eval_results['eval_loss']:.4f}",
    f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}",
    sep="\n",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,4.8057
10,3.905
20,3.3328
30,2.8988
40,2.6456
50,2.2716
60,2.3081
70,2.0713
80,1.97
90,1.8723



📊 Evaluation Results:
Loss: 2.5486
Perplexity: 12.79


# **Example 1**

In [None]:
from transformers import pipeline

# Load the fine-tuned checkpoint written by the training cell
generator = pipeline("text-generation", model=model_path1, tokenizer=model_path1)

prompt1 = "My friend"

# `max_new_tokens` is used instead of `max_length`: the pipeline already
# carries a default `max_new_tokens` (256, per the warning this cell used to
# print), which takes precedence and caused `max_length=30` to be ignored.
outputs1 = generator(
    prompt1,
    max_new_tokens=30,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    truncation=True
)

# Flatten newlines so the sample prints as a single paragraph
print(outputs1[0]["generated_text"].replace("\n", " "))


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


My friend is a great person and I like to The environment provides opportunities for learning about natural systems, including life on Earth's surface. We are dedicated stewards of the oceans' health records The Hubble Space Telescope was designed by George Lucas in 1977 Dark energy has been causing global temperature rise since at least 2002 Asteroid Belt objects orbit around our sun Jovian system that orbits between Jupiter and Saturn with extreme storm activity Orion Nebula consists primarily outgrows from its parent star M1/Exoplanet Oort Cloud System contains more than 1 billion stars within it which make up over 4% (reduced) risk due mainly not only to infrared emissions but also because they contain methane gas That region receives frequent impacts as well known climate events such change Inception directed towards ensuring security worldwide education curriculum based on ancient teachings regarding human origins Alphas focus their attention upon promoting social skills excellen

# **Example 2**

In [None]:
from transformers import pipeline

# Load the fine-tuned checkpoint written by the training cell
generator = pipeline("text-generation", model=model_path1, tokenizer=model_path1)

prompt2 = "My book"

# `max_new_tokens` is used instead of `max_length`: the pipeline already
# carries a default `max_new_tokens` (256, per the warning this cell used to
# print), which takes precedence and caused `max_length=30` to be ignored.
outputs2 = generator(
    prompt2,
    max_new_tokens=30,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    truncation=True
)
# Flatten newlines so the sample prints as a single paragraph
print(outputs2[0]["generated_text"].replace("\n", " "))


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


My book is based on the teachings of The Great Gatsby was released in U.S., Canada and Australia from 1959 to 1989 with an epic storyline that spans over 4 million years ago


# **API Implementation**

In [None]:
from transformers import pipeline
import gradio as gr

# Load the model and tokenizer
# `model` is intentionally not rebound to the checkpoint path here: doing so
# replaced the trained model object from the training cell with a plain string
# and the value was never used.
generator = pipeline("text-generation", model=model_path1, tokenizer=model_path1)

# Define the text generation function
def generate_text(prompt):
    """Generate a short sampled continuation of ``prompt``.

    Uses the module-level ``generator`` pipeline.

    Args:
        prompt: Text entered in the Gradio textbox.

    Returns:
        The first generated sequence (prompt included) with newlines replaced
        by spaces so it displays as a single paragraph.
    """
    outputs = generator(
        prompt,
        # Limit the number of *generated* tokens. `max_length` is not used
        # because the example cells showed it being overridden by the
        # pipeline's default `max_new_tokens`.
        max_new_tokens=20,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        truncation=True
    )
    return outputs[0]["generated_text"].replace("\n", " ")

# Create Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Enter your prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Text Generation API",
    description="Enter a prompt to generate text using a local Hugging Face model.",
    allow_flagging="never",
    live=False
)

interface.launch()


Device set to use cuda:0


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7ae0a880d445d035fa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


