<a href="https://colab.research.google.com/github/AnshuMishra1122003/chatbot/blob/master/chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# fine_tuned_model_new/db_utils.py
import json
import os

SCHEMA_FILE = "database_schema.json"

def load_database_schema():
    """Load database schema from a JSON file."""
    if not os.path.exists(SCHEMA_FILE):
        print(f"❌ Error: Schema file '{SCHEMA_FILE}' not found!")
        return {}

    try:
        with open(SCHEMA_FILE, "r") as file:
            schema = json.load(file)
            if not isinstance(schema, list):
                print("❌ Error: Invalid schema format! Missing 'tables' key.")
                return {}
            return schema
    except json.JSONDecodeError:
        print("❌ Error: JSON format is invalid!")
        return {}

# Fetch schema
database_schema = load_database_schema()


ModuleNotFoundError: No module named 'load_database_schema'

In [None]:
import json
import os
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import sys
import warnings
from torch.utils.data import DataLoader, TensorDataset

# Suppress 'past_key_values' warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*past_key_values.*")

SCHEMA_FILE = "database_schema.json"

def load_database_schema():
    """Load database schema from a JSON file."""
    if not os.path.exists(SCHEMA_FILE):
        print(f"❌ Error: Schema file '{SCHEMA_FILE}' not found!")
        return []

    try:
        with open(SCHEMA_FILE, "r") as file:
            schema = json.load(file)
            if not isinstance(schema, list):
                print("❌ Error: Invalid schema format! Expected a list.")
                return []
            return schema
    except json.JSONDecodeError:
        print("❌ Error: JSON format is invalid!")
        return []

# Load database schema
database_schema = load_database_schema()
if not database_schema:
    print("❌ Error: No valid database schema found. Exiting training.")
    sys.exit(1)

# Upgrade model from t5-small → t5-base
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

max_length = 256  # Reduced max_length for efficiency
batch_size = 4  # Adjust batch size for memory optimization
gradient_accumulation_steps = 4  # Accumulate gradients to simulate a larger batch

# Prepare training data
training_data = []
for query_data in database_schema:
    title = query_data.get("title", "Unknown Title")
    queries = query_data.get("queries", [])

    if not queries:
        print(f"⚠️ Warning: No queries found for '{title}', skipping...")
        continue

    sql_query = queries[0]  # Extract first query
    training_data.append({"input": title, "output": sql_query})

if not training_data:
    print("❌ Error: No training data generated from schema.")
    sys.exit(1)

print(f"✅ Training Data Loaded: {len(training_data)} queries ready for training.")

# Compute max token length **after** training_data is built
max_query_length = max(len(tokenizer.encode(d["output"], add_special_tokens=True)) for d in training_data)
print(f"📝 Max token length for queries: {max_query_length}")

# Convert training data to tokenized format
train_encodings = tokenizer(
    [d["input"] for d in training_data], padding=True, truncation=True, max_length=max_length, return_tensors="pt"
)
train_labels = tokenizer(
    [d["output"] for d in training_data], padding=True, truncation=True, max_length=max_length, return_tensors="pt"
)

# Create a DataLoader for batching
train_dataset = TensorDataset(train_encodings["input_ids"], train_labels["input_ids"])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Mixed Precision (AMP) for faster training
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()

# Training loop with gradient accumulation
for epoch in range(10):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(train_dataloader):
        input_ids, labels = [b.to(device) for b in batch]

        with autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps  # Scale loss

        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * gradient_accumulation_steps  # Scale back loss

    avg_loss = total_loss / len(train_dataloader)
    print(f"✅ Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

# Save the fine-tuned model
model_path = os.path.join(os.getcwd(), "fine_tuned_sql_model")
os.makedirs(model_path, exist_ok=True)

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print(f"✅ Model training completed and saved at: {model_path}")


✅ Training Data Loaded: 394 queries ready for training.
📝 Max token length for queries: 1225


  scaler = GradScaler()
  with autocast():


KeyboardInterrupt: 

In [1]:
# fine_tuned_model_new/db_utils.py
import json
import os

SCHEMA_FILE = "database_schema.json"

def load_database_schema():
    """Load database schema from a JSON file."""
    if not os.path.exists(SCHEMA_FILE):
        print(f"❌ Error: Schema file '{SCHEMA_FILE}' not found!")
        return {}

    try:
        with open(SCHEMA_FILE, "r") as file:
            schema = json.load(file)
            if not isinstance(schema, list):
                print("❌ Error: Invalid schema format! Missing 'tables' key.")
                return {}
            return schema
    except json.JSONDecodeError:
        print("❌ Error: JSON format is invalid!")
        return {}

# Fetch schema
database_schema = load_database_schema()


with open(SCHEMA_FILE, "r") as f:
    schema = json.load(f)

# Ensure the database_folder exists
os.makedirs("database_folder", exist_ok=True)

# Split schema into 30 parts
num_parts = 5
part_size = len(schema) // num_parts
schema_parts = [schema[i * part_size: (i + 1) * part_size] for i in range(num_parts - 1)]
schema_parts.append(schema[(num_parts - 1) * part_size:])  # Add remaining data to the last part

# Save each part separately
for i, part in enumerate(schema_parts):
    part_path = f"database_folder/database_schema_part_{i+1}.json"
    with open(part_path, "w") as f:
        json.dump(part, f, indent=4)
    print(f"✅ Part {i+1} saved at {part_path}")

print("✅ Schema successfully split into 30 parts!")

✅ Part 1 saved at database_folder/database_schema_part_1.json
✅ Part 2 saved at database_folder/database_schema_part_2.json
✅ Part 3 saved at database_folder/database_schema_part_3.json
✅ Part 4 saved at database_folder/database_schema_part_4.json
✅ Part 5 saved at database_folder/database_schema_part_5.json
✅ Schema successfully split into 30 parts!


In [3]:
import json
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os
import sys
import warnings
from torch.utils.data import DataLoader, TensorDataset

# Filter warnings related to past_key_values
warnings.filterwarnings("ignore", category=UserWarning, message=".*past_key_values.*")

# Add parent directory to sys.path (Fixed for Colab)
sys.path.append(os.path.abspath(".."))

# Directory containing schema parts
SCHEMA_DIR = "database_folder"
MODEL_BASE_PATH = "fine_tuned_models"

os.makedirs(MODEL_BASE_PATH, exist_ok=True)  # Ensure model save directory exists

# Upgrade model from t5-small → t5-base
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

batch_size = 2 # Adjust batch size for memory optimization
num_epochs = 100  # Define number of epochs
early_stop_loss_threshold = 0.5  # Stop training early if loss is below this

# Get all schema part files
schema_files = sorted([f for f in os.listdir(SCHEMA_DIR) if f.startswith("database_schema_part_")])

for idx, schema_file in enumerate(schema_files):
    schema_path = os.path.join(SCHEMA_DIR, schema_file)

    # Load schema part
    with open(schema_path, "r") as f:
        database_schema = json.load(f)

    if not database_schema:
        print(f"❌ Error: No valid data found in {schema_file}. Skipping...")
        continue

    # Prepare training data using SQL queries
    training_data = []
    max_query_length = 0  # Track max query length dynamically

    for query_data in database_schema:
        title = query_data.get("title", "Unknown Title")  # Default title if missing
        queries = query_data.get("queries", [])  # Get queries list (default: empty list)

        if not queries:
            print(f"⚠️ Warning: No queries found for '{title}', skipping...")
            continue

        sql_query = queries[0]  # Extract first query safely
        training_data.append({"input": title, "output": sql_query})

        # Update max_length dynamically
        query_length = len(tokenizer.encode(sql_query, add_special_tokens=True))
        max_query_length = max(max_query_length, query_length)

    if not training_data:
        print(f"❌ Error: No training data generated from {schema_file}. Skipping...")
        continue

    # Set dynamic max_length (cap at 1024 to prevent memory issues)
    max_length = min(max_query_length, 1225)

    print(f"✅ Training Data Loaded from {schema_file}: {len(training_data)} queries ready for training.")
    print(f"🔹 Using max_length={max_length} for this schema part.")

    # Convert training data to tokenized format
    train_encodings = tokenizer(
        [d["input"] for d in training_data], padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    )
    train_labels = tokenizer(
        [d["output"] for d in training_data], padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    )

    # Create DataLoader
    train_dataset = TensorDataset(train_encodings["input_ids"], train_labels["input_ids"])
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            input_ids, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"✅ Epoch {epoch+1} for {schema_file}, Average Loss: {avg_loss:.4f}")

        # Stop training if loss reaches threshold
        if avg_loss < early_stop_loss_threshold:
            print(f"🎯 Loss reached {avg_loss:.4f}, stopping early for {schema_file}!")
            break

    # Save the fine-tuned model for this schema part
    model_path = os.path.join(MODEL_BASE_PATH, f"fine_tuned_sql_model_part_{idx+1}")
    os.makedirs(model_path, exist_ok=True)

    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    print(f"✅ Model trained on {schema_file} and saved at: {model_path}")

print("🎉 All schema parts processed successfully!")


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Training Data Loaded from database_schema_part_1.json: 6 queries ready for training.
🔹 Using max_length=351 for this schema part.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


✅ Epoch 1 for database_schema_part_1.json, Average Loss: 11.0932
✅ Epoch 2 for database_schema_part_1.json, Average Loss: 8.8583
✅ Epoch 3 for database_schema_part_1.json, Average Loss: 6.6956
✅ Epoch 4 for database_schema_part_1.json, Average Loss: 5.4014
✅ Epoch 5 for database_schema_part_1.json, Average Loss: 4.7012
✅ Epoch 6 for database_schema_part_1.json, Average Loss: 4.3572
✅ Epoch 7 for database_schema_part_1.json, Average Loss: 4.1130
✅ Epoch 8 for database_schema_part_1.json, Average Loss: 4.0384
✅ Epoch 9 for database_schema_part_1.json, Average Loss: 3.8074
✅ Epoch 10 for database_schema_part_1.json, Average Loss: 3.6375
✅ Epoch 11 for database_schema_part_1.json, Average Loss: 3.5307
✅ Epoch 12 for database_schema_part_1.json, Average Loss: 3.4514
✅ Epoch 13 for database_schema_part_1.json, Average Loss: 3.3672
✅ Epoch 14 for database_schema_part_1.json, Average Loss: 3.6987
✅ Epoch 15 for database_schema_part_1.json, Average Loss: 3.1312
✅ Epoch 16 for database_schema_pa