In [3]:
# Cell 2: Install Required Libraries
!pip install -q transformers datasets torch accelerate sentencepiece
!pip install -q sentence-transformers faiss-cpu
!pip install -q flask flask-cors pyngrok
!pip install -q bitsandbytes

print("✅ All libraries installed successfully!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Cell 3: Upload Your data.json File
# Upload your data.json file
from google.colab import files
import json

print("Please upload your data.json file...")
uploaded = files.upload()

# Save the uploaded file to data folder
for filename in uploaded.keys():
    with open(f'data/{filename}', 'wb') as f:
        f.write(uploaded[filename])
    print(f"✅ {filename} saved to data folder")

# Load and verify data
with open('data/data.json', 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

print(f"📊 Total Q&A pairs loaded: {len(qa_data)}")
print(f"🔍 Sample data: {qa_data[0] if qa_data else 'No data found'}")

Please upload your data.json file...


Saving data.json to data.json
✅ data.json saved to data folder
📊 Total Q&A pairs loaded: 1097
🔍 Sample data: {'prompt': '<GALAXY>What is Galaxy Organisation?', 'completion': 'Galaxy Organisation is an IT-based non-profit established in 2015, dedicated to driving economic growth through educational programs and digital empowerment initiatives for children, youth, women, and marginalized communities. It focuses on sustainable development through technology access, skills training, and community-driven programs.'}


In [8]:
# Cell 7: Process and Prepare Data
# Import our utilities
import sys
sys.path.append('/content/drive/MyDrive/AI-Chatbot')

from utils.data_processor import DataProcessor

# Initialize processor
processor = DataProcessor()

# Load data
with open('data/data.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Process data
processed_data, stats = processor.process_qa_pairs(raw_data)

print("📊 Data Statistics:")
print(f"Galaxy Organisation Q&As: {stats['GALAXY']}")
print(f"Alibaba Q&As: {stats['ALIBABA']}")
print(f"General Q&As: {stats['GENERAL']}")
print(f"Total Q&As: {len(processed_data)}")

# Create training format
training_texts = processor.create_training_format(processed_data)

# Save processed data
with open('data/processed_data.json', 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=2)

print("✅ Data processed and saved!")

📊 Data Statistics:
Galaxy Organisation Q&As: 147
Alibaba Q&As: 872
General Q&As: 78
Total Q&As: 1097
✅ Data processed and saved!


In [9]:
# Cell 8: Fine-tune Model Using QLoRA (Efficient Training)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import torch

# Model configuration
MODEL_NAME = "microsoft/phi-2"  # Efficient 2.7B parameter model
OUTPUT_DIR = "./models/galaxy_alibaba_chatbot"

# Configure 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare model for training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=32,  # rank
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add LoRA adapters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Prepare dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Create dataset
train_dataset = Dataset.from_dict({"text": training_texts})
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Split into train and eval
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

trainable params: 47,185,920 || all params: 2,826,869,760 || trainable%: 1.6692


Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

Training samples: 987
Evaluation samples: 110


In [13]:
# Cell 9: Configure and Start Training
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    logging_steps=25,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to="none",
    save_total_limit=3,
    push_to_hub=False,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    bf16=True,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Start training
print("🚀 Starting training...")
trainer.train()

# Save the final model
print("💾 Saving model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ Training completed successfully!")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🚀 Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
100,0.2965,0.284284


💾 Saving model...
✅ Training completed successfully!


In [14]:
# Cell 10: Test the Model
from utils.similarity_checker import SimilarityChecker
from utils.response_generator import ResponseGenerator

# Initialize components
similarity_checker = SimilarityChecker()
response_generator = ResponseGenerator(OUTPUT_DIR)

# Load Q&A database
with open('data/processed_data.json', 'r') as f:
    qa_database = json.load(f)

# Test function
def test_chatbot(question):
    print(f"\\n❓ Question: {question}")

    # Find similar Q&As
    similar_qa = similarity_checker.find_similar_qa(question, qa_database)

    if similar_qa:
        print(f"✅ Found {len(similar_qa)} similar Q&As")
        # Use the most similar answer if confidence is very high
        if similar_qa[0][1] > 0.95:
            print(f"💬 Answer: {similar_qa[0][0]['completion']}")
            return similar_qa[0][0]['completion']

    # Generate response
    response = response_generator.generate_response(question, similar_qa)
    print(f"💬 Answer: {response}")
    return response

# Test questions
test_questions = [
    "What is Galaxy Organisation?",
    "Tell me about Alibaba Cloud certifications",
    "Where is Galaxy Organisation located?",
    "What is the ACA certification?",
    "Does Galaxy accept electronic donations?",
    "When was Alibaba founded?",
    "Hello",
    "What is the weather?"  # Irrelevant question
]

for q in test_questions:
    test_chatbot(q)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading model on cuda...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model loaded successfully!
\n❓ Question: What is Galaxy Organisation?
✅ Found 3 similar Q&As
💬 Answer: Galaxy Organisation is an IT-based non-profit established in 2015, dedicated to driving economic growth through educational programs and digital empowerment initiatives for children, youth, women, and marginalized communities. It focuses on sustainable development through technology access, skills training, and community-driven programs.
\n❓ Question: Tell me about Alibaba Cloud certifications
✅ Found 3 similar Q&As
💬 Answer: We offer over 250 certifications that focus on practical skills in areas like cloud migration and serverless computing. Plus, they're short and easy to complete. Learn more at <a href='https://edu.alibabacloud.com'>Alibaba Academy</a>.
\n❓ Question: Where is Galaxy Organisation located?
✅ Found 3 similar Q&As
💬 Answer: Galaxy's headquarters is in Amman, Jordan at: Khair Al-Din Al-Ma'ani Street, P.O. Box 962420, Amman 11196. Mobile: +962782979999. They serve com