<a href="https://colab.research.google.com/github/DUptain1993/1/blob/main/wizard_coder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Set the pydevd switch before anything else runs so the debugger's
# "frozen modules" warning is not emitted in this kernel.
os.environ.update({"PYDEVD_DISABLE_FILE_VALIDATION": "1"})
print("Debugger file validation disabled to suppress frozen modules warning.")

In [None]:
# Install required packages
!pip install -q transformers>=4.33.0 datasets accelerate peft bitsandbytes
!pip install -q google-cloud-storage torch
!pip install -q auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
!pip install -q wandb

In [None]:
import os
import json
from google.cloud import storage
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset

# --- Authenticate with GCS using Kaggle Secrets ---
# The service-account key JSON is read from the GCS_SERVICE_ACCOUNT environment
# variable and written to a file, whose path is then exported through
# GOOGLE_APPLICATION_CREDENTIALS for the Google client library to pick up.
service_account_path = "/kaggle/working/gcs_service_account.json"

# A zero-byte key file counts as missing: a previous run without the secret
# would otherwise leave an empty file behind that is reused on every later run.
key_file_missing = (
    not os.path.exists(service_account_path)
    or os.path.getsize(service_account_path) == 0
)
if key_file_missing:
    service_account_json = os.environ.get("GCS_SERVICE_ACCOUNT", "")
    if not service_account_json.strip():
        # Fail here with an actionable message instead of writing an empty
        # credentials file and failing later inside the GCS client.
        raise RuntimeError(
            "GCS_SERVICE_ACCOUNT is not set or is empty; provide the service-account "
            f"key JSON in that variable or place the key file at {service_account_path}."
        )
    # The directory may not exist outside Kaggle (e.g. on Colab); create it.
    os.makedirs(os.path.dirname(service_account_path), exist_ok=True)
    with open(service_account_path, "w") as f:
        f.write(service_account_json)
    # The file holds key material: restrict it to the current user.
    os.chmod(service_account_path, 0o600)

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_path

# Initialize GCS client and bucket
BUCKET_NAME = 'wizardlm-training-1759276927'
DATASET_PATH = 'datasets/combined_training_data.jsonl'

client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
print(f"✅ Connected to GCS bucket: {BUCKET_NAME}")

In [None]:
def load_wizardlm_dataset():
    """Download the JSONL training file from GCS and parse it.

    Reads the blob at ``DATASET_PATH`` from the module-level ``bucket`` as
    text and parses every non-blank line as one JSON document.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    raw_text = bucket.blob(DATASET_PATH).download_as_text()

    data = []
    for raw_line in raw_text.strip().split('\n'):
        if not raw_line.strip():
            continue  # blank separator line, nothing to parse
        data.append(json.loads(raw_line))

    print(f"📊 Loaded {len(data)} WizardLM training examples")
    return data

# Download and parse the training set once; `dataset` is the list of parsed
# JSONL records returned by load_wizardlm_dataset().
dataset = load_wizardlm_dataset()

# Pretty-print the first record as a quick look at the data.
# Note: dataset[0] raises IndexError if the file contained no records.
print("\n📝 Sample WizardLM data:")
print(json.dumps(dataset[0], indent=2))

In [None]:
# Model identifier used for both the tokenizer and the model weights.
MODEL_NAME = "QuixiAI/WizardLM-1.0-Uncensored-CodeLlama-34b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Weights are loaded in float16 with device_map="auto".
# NOTE(review): trust_remote_code=True permits execution of Python code shipped
# in the model repository — confirm this model actually requires it.
# NOTE(review): the model name suggests ~34B parameters; in float16 that is
# presumably far more memory than a single Colab/Kaggle GPU offers — verify
# the target hardware.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True
)

print(f"✅ WizardLM model loaded with {model.num_parameters():,} parameters")

In [None]:
# Names of the sub-modules that receive LoRA adapters.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Adapter hyperparameters: rank 16, alpha 32, dropout 0.1, causal-LM task.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=LORA_TARGET_MODULES,
    task_type=TaskType.CAUSAL_LM,
)

# Rebinds `model` to the PEFT-wrapped model and reports how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✅ LoRA configuration applied")

In [None]:
def prepare_wizardlm_data(data, max_samples=500):
    """Extract usable training strings from the raw records.

    Only the first ``max_samples`` records are examined. Because the cut
    happens before filtering, the result may hold fewer than ``max_samples``
    entries.

    Args:
        data: Sequence of dict-like records; the ``'text'`` key is read.
        max_samples: Number of leading records to consider.

    Returns:
        list: The ``'text'`` values that are non-empty and longer than
        50 characters, in their original order.
    """
    kept_texts = []
    for record in data[:max_samples]:
        text = record.get('text', '')
        # Drop missing/empty texts and anything of 50 characters or fewer.
        if text and len(text) > 50:
            kept_texts.append(text)
    return kept_texts

# Only the first 500 records are considered; the printed count can be lower
# because records with short or missing text are filtered out afterwards.
training_texts = prepare_wizardlm_data(dataset, max_samples=500)
print(f"📝 Prepared {len(training_texts)} training examples")

def tokenize_wizardlm(examples):
    """Tokenize text with the notebook-level ``tokenizer``.

    Args:
        examples: Text input forwarded unchanged to the tokenizer call.

    Returns:
        The tokenizer's output, requested as PyTorch tensors, with
        truncation enabled at ``max_length=1024`` and ``padding=True``.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=1024,
        return_tensors="pt",
    )
    return tokenizer(examples, **tokenizer_options)

train_dataset = Dataset.from_dict({"text": training_texts})
# With batched=True the mapped function receives a dict of columns, so
# batch["text"] is already a list of strings and is passed straight through.
# Wrapping it in another list would hand the tokenizer a single nested input
# instead of one input per example.
train_dataset = train_dataset.map(
    lambda batch: tokenize_wizardlm(batch["text"]),
    batched=True,
    # The raw "text" column is dropped; only the tokenizer outputs remain.
    remove_columns=train_dataset.column_names
)

print("✅ Dataset tokenized")

In [None]:
# Effective batch size is per_device_train_batch_size * gradient_accumulation_steps.
GRADIENT_ACCUMULATION_STEPS = 16

# Size the warmup from the actual number of optimizer steps (about 10%, capped
# at the previous fixed value of 50). A fixed 50-step warmup is longer than the
# whole run for a few hundred examples, so the learning rate would never reach
# its configured value.
# Assumes one epoch on a single training process — TODO confirm for multi-GPU.
total_update_steps = max(1, -(-len(train_dataset) // GRADIENT_ACCUMULATION_STEPS))
warmup_steps = min(50, max(1, total_update_steps // 10))

training_args = TrainingArguments(
    output_dir="./wizardlm_fine_tuned",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=warmup_steps,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    # No evaluation is configured. The explicit `evaluation_strategy="no"` was
    # dropped: "no" is the library default, and the keyword was renamed to
    # `eval_strategy` in newer transformers releases, where the old name fails.
    save_total_limit=2,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    # The string "none" disables all logging integrations. Python None falls
    # back to "all" installed integrations, which would start wandb here.
    report_to="none",
    gradient_checkpointing=True,
)

# Causal-LM collation (mlm=False): batches are padded by the collator and
# labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# With gradient checkpointing over frozen base weights, checkpointed blocks may
# receive no input that requires grad, in which case backward fails. Making the
# embedding output require grad avoids that; it is harmless otherwise.
model.enable_input_require_grads()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

print("✅ Training configuration ready")

In [None]:
# Run the training loop configured on `trainer`; the status lines bracket it.
print("🚀 Starting fine-tuning...")
trainer.train()
print("✅ Fine-tuning completed!")

In [None]:
import shutil

def save_wizardlm_to_gcs(
    output_dir="./wizardlm_fine_tuned",
    blob_name="model_output/wizardlm_fine_tuned_kaggle.zip",
):
    """Save the model and tokenizer locally, zip them, and upload the zip to GCS.

    Uses the notebook-level ``model``, ``tokenizer`` and ``bucket``.

    Args:
        output_dir: Local directory the model and tokenizer are saved into.
        blob_name: Object name of the uploaded archive inside the bucket.
    """
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # The archive is named after the output directory and written to the
    # current working directory. make_archive returns the path it actually
    # wrote, so that path is uploaded rather than a re-derived file name.
    archive_base = os.path.basename(os.path.normpath(output_dir))
    archive_path = shutil.make_archive(archive_base, "zip", output_dir)

    blob = bucket.blob(blob_name)
    blob.upload_from_filename(archive_path)

    print("✅ Fine-tuned model saved to GCS at:")
    # Report the bucket that was really used instead of a hard-coded name.
    print(f"gs://{bucket.name}/{blob_name}")

# Save the model and tokenizer, zip the directory, and upload it to the bucket.
save_wizardlm_to_gcs()

In [None]:
def test_wizardlm(prompt="### Human: Explain how to implement a neural network in Python\n### Assistant:"):
    """Generate a sampled completion for ``prompt`` and print it.

    Uses the notebook-level ``model`` and ``tokenizer``. Up to 200 new tokens
    are sampled (temperature 0.7, top-p 0.9, top-k 50). The decoded output
    contains the prompt followed by the continuation.

    Args:
        prompt: Text the model should continue.
    """
    # Switch to eval mode so dropout (including LoRA dropout) is inactive
    # during generation; the model may still be in train mode after training.
    model.eval()

    # The model was loaded with device_map="auto", so it may live on an
    # accelerator; the inputs must be moved to the same device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Same budget as max_length = prompt length + 200, stated directly.
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            top_p=0.9,
            top_k=50
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("🧙‍♂️ WizardLM Response:")
    print(f"Prompt: {prompt}")
    print(f"Response: {response}")

# Smoke-test generation with the default prompt.
test_wizardlm()