In [None]:
!pip install -q transformers datasets evaluate huggingface_hub bitsandbytes rouge_score accelerate gdown

import os
import gc
import torch
import pandas as pd
import gdown
import evaluate
from datasets import Dataset
from google.colab import auth, drive
from google.auth import default
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer,
    DataCollatorForSeq2Seq
)

# Configure the PyTorch CUDA allocator through its environment variable
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# Mount Google Drive (always remounting) and authenticate the Colab user;
# only the first element returned by default() is kept, in `creds`
drive.mount("/content/drive", force_remount=True)
auth.authenticate_user()
creds, _ = default()

# Download Dataset
file_id = '18cM4Z_GlgHdDruuTXPekmgKk6UU8IBxMjGnuUKZELh0'
output_file = "chatbot_training_data.xlsx"
gdown.download(id=file_id, output=output_file, quiet=False)

# Load Dataset: keep only the two text columns, drop incomplete rows and
# rename the columns to the names used by preprocess_function
df = pd.read_excel(output_file)
df = df[['user_input', 'chatbot_response']].dropna().rename(
    columns={'user_input': 'prompt', 'chatbot_response': 'response'}
)
print("Data Loaded:")
print(df.head())

# Convert to Hugging Face Dataset and shuffle reproducibly
dataset = Dataset.from_pandas(df).shuffle(seed=42)

# Cap the number of examples. The cap is clamped to the real dataset size:
# selecting range(50000) from a smaller dataset raises an IndexError.
MAX_SAMPLES = 50000
dataset = dataset.select(range(min(MAX_SAMPLES, len(dataset))))
del df
gc.collect()

# Split Dataset: 80% train / 20% evaluation, fixed seed
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# Fetch the pretrained checkpoint and the tokenizer that belongs to it
model_name = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Make the embedding matrix exactly as large as the tokenizer vocabulary
# (the original note attributes "Size Mismatch" errors to this)
model.resize_token_embeddings(len(tokenizer))

# Turn on gradient checkpointing and switch the model's cache off with it
model.gradient_checkpointing_enable()
model.config.use_cache = False

# ROUGE scorer consumed by compute_metrics
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with the loaded metric.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be an
            array of token ids, an array of per-token logits
            (batch, sequence, vocabulary), or a tuple whose first element is
            one of those. ``labels`` is an array of token ids in which
            ignored positions carry the value ``-100``.

    Returns:
        The result of ``metric.compute`` on the decoded strings.
    """
    import numpy as np  # local import: numpy is not imported at file level

    predictions, labels = eval_pred

    # A model may return extra outputs next to its logits; only the first
    # element is the prediction tensor.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Logits carry a trailing vocabulary axis; reduce them to token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 is the ignore index used for padded label positions (and for
    # padding when batches of different length are gathered). It is not a
    # vocabulary id, so swap it for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=decoded_preds, references=decoded_labels)

# Token budget applied to prompts and responses alike
MAX_LENGTH = 256


def preprocess_function(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        examples: batch mapping with the keys "prompt" and "response".

    Returns:
        The tokenizer output for the prompts, extended with a "labels" entry
        that holds the token ids of the responses. Both sides are truncated
        to MAX_LENGTH tokens.
    """
    shared_kwargs = {"max_length": MAX_LENGTH, "truncation": True}

    encoded = tokenizer(examples["prompt"], **shared_kwargs)
    target_encoding = tokenizer(examples["response"], **shared_kwargs)

    # Only the ids of the target side are needed as training labels
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Run the preprocessing over the train split, then the eval split, dropping
# every original column so only the tokenizer output remains
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
)
print("Tokenization completed!")

# Training Configuration optimized
# Pick the mixed-precision mode once. The bf16 probe is only made when CUDA is
# present, because it may query the current CUDA device; bf16 is preferred and
# fp16 is the fallback on GPUs without bf16 support.
# NOTE(review): T5-family checkpoints are reported to be numerically fragile
# under fp16 - confirm the loss stays finite on GPUs that take the fp16 path.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./results",
    # No evaluation and no intermediate checkpoints during training
    evaluation_strategy="no",
    save_strategy="no",
    num_train_epochs=4,
    gradient_checkpointing=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=use_cuda and not use_bf16,
    bf16=use_bf16,
    optim="adamw_bnb_8bit",  # memory optimizer
    report_to="wandb",
    run_name="MT5_fully_finetuned_chatbot",
)

# Build the batch collator and hand every component to the Trainer
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# Run the fine-tuning
print("Starting Training...")
trainer.train()
print("Training Completed!")

# Write the trained model and the tokenizer files into the same local folder
print("Saving Model Locally...")
trainer.save_model("./fine_tuned_LLM")
tokenizer.save_pretrained("./fine_tuned_LLM")
print("Model Saved!")

# Upload to Hugging Face
from getpass import getpass

from huggingface_hub import HfApi, create_repo

repo_name = "ArsenKe/MT5_large_finetuned_chatbot"

# Creating a repository and pushing files are authenticated requests; without
# credentials the Hub answers 401 Unauthorized. Read the token from the
# HF_TOKEN environment variable, otherwise ask for it with a hidden prompt
# that blocks until a value is entered. The token needs write access.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token (write access): ")

create_repo(repo_name, token=hf_token, exist_ok=True)

# Upload Model to Hugging Face
print("Uploading Model to Hugging Face Hub...")
model.push_to_hub(repo_name, token=hf_token)
tokenizer.push_to_hub(repo_name, token=hf_token)
print(f"Model Uploaded: https://huggingface.co/{repo_name}")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.0 MB/s[0m eta [36m0:

Downloading...
From (original): https://drive.google.com/uc?id=18cM4Z_GlgHdDruuTXPekmgKk6UU8IBxMjGnuUKZELh0
From (redirected): https://docs.google.com/spreadsheets/d/18cM4Z_GlgHdDruuTXPekmgKk6UU8IBxMjGnuUKZELh0/export?format=xlsx
To: /content/chatbot_training_data.xlsx
1.55MB [00:01, 1.39MB/s]


✅ Data Loaded:
                                              prompt  \
0  User: Could you reserve the Silver Sightseeing...   
1  User: Can I reschedule my E-Rickshaw Tour to a...   
2  User: I need to adjust the number of people fo...   
3  User: Can you recommend a budget-friendly tour...   
4  User: Can I downgrade my Gold Sightseeing Tour...   

                                            response  
0  Chatbot: Of course! The Silver Sightseeing Tou...  
1  Chatbot: No problem! What new date works for y...  
2  Chatbot: Got it! How many people now for the P...  
3  Chatbot: The Silver Sightseeing Tour is perfec...  
4  Chatbot: Yes, we can switch you to a more affo...  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

✅ Tokenization completed!




🚀 Starting Training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marsenke[0m ([33marsenke-fh-tech-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,10.3892
1000,0.6514
1500,0.3701
2000,0.2901
2500,0.2292
3000,0.2009
3500,0.2006
4000,0.1602
4500,0.1585
5000,0.138


Step,Training Loss
500,10.3892
1000,0.6514
1500,0.3701
2000,0.2901
2500,0.2292
3000,0.2009
3500,0.2006
4000,0.1602
4500,0.1585
5000,0.138


✅ Training Completed!
💾 Saving Model Locally...
✅ Model Saved!


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67e1a6e9-6cd3f33c7179acdd375837ca;29321463-cf5a-49ec-9bd3-fad5e7adf034)

Invalid username or password.

*Test*

In [None]:
import os
from getpass import getpass

from huggingface_hub import HfApi, create_repo
from huggingface_hub import notebook_login

# notebook_login() only displays a login widget and returns right away, so an
# upload placed in the same cell can run before any token has been submitted.
# Obtain the token in a blocking way instead: HF_TOKEN environment variable
# first, hidden prompt otherwise. The token needs write access.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token (write access): ")

# A Hub repo id has the form "<namespace>/<name>"; the previous ".../..."
# placeholder is not a valid id. This is the id used by the training cell.
repo_name = "ArsenKe/MT5_large_finetuned_chatbot"

create_repo(repo_name, token=hf_token, exist_ok=True)

# Upload Model and Tokenizer
print("Uploading Model to Hugging Face Hub...")
model.push_to_hub(repo_name, token=hf_token)
tokenizer.push_to_hub(repo_name, token=hf_token)
print(f" Model Uploaded: https://huggingface.co/{repo_name}")


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hub id of the fine-tuned checkpoint. A repo id has the form
# "<namespace>/<name>"; the previous ".../..." placeholder cannot be resolved.
model_name = "ArsenKe/MT5_large_finetuned_chatbot"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

input_text = "what tours can i book?"
inputs = tokenizer(input_text, return_tensors="pt")
# Without an explicit limit generate() stops at the short default length and
# cuts the answer off; use the same 512 cap as the other sample query.
outputs = model.generate(**inputs, max_length=512)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Response: {response}")


In [None]:
# Second sample query, generated with max_length=512
input_text = "what museums i can visit ?"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(max_length=512, **inputs)

# One prompt went in, so the first decoded sequence is the answer
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print(f"Response: {response}")

Response: Chatbot: The Platinum Sightseeing Tour showcases Vienna’s museums, including Albertina Museum like St. Stephen’s Cathedral. The cost is 105 euro. Call +43676849696200 for details.Royal E-Cars Tours | Contact: office@royal-ecars.com | Phone: +43676849696200 | Address: Geigergasse 5, 3. Stock, 1050 Wien
