In [2]:
# Cell 1: Mount Google Drive and Setup
# Mount Google Drive to save our work
from google.colab import drive
drive.mount('/content/drive')

# Create project directory
import os
project_path = '/content/drive/MyDrive/AI-Chatbot'
os.makedirs(project_path, exist_ok=True)
os.chdir(project_path)

# Create folder structure
folders = ['data', 'models', 'utils', 'templates', 'static']
for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("✅ Project structure created successfully!")
print(f"📁 Working directory: {os.getcwd()}")

Mounted at /content/drive
✅ Project structure created successfully!
📁 Working directory: /content/drive/MyDrive/AI-Chatbot


In [3]:
# Cell 2: Install Required Libraries
!pip install -q transformers datasets torch accelerate sentencepiece
!pip install -q sentence-transformers faiss-cpu
!pip install -q flask flask-cors pyngrok
!pip install -q bitsandbytes

print("✅ All libraries installed successfully!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Cell 3: Upload Your data.json File
# Upload your data.json file
from google.colab import files
import json

print("Please upload your data.json file...")
uploaded = files.upload()

# Save the uploaded file to data folder
for filename in uploaded.keys():
    with open(f'data/{filename}', 'wb') as f:
        f.write(uploaded[filename])
    print(f"✅ {filename} saved to data folder")

# Load and verify data
with open('data/data.json', 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

print(f"📊 Total Q&A pairs loaded: {len(qa_data)}")
print(f"🔍 Sample data: {qa_data[0] if qa_data else 'No data found'}")

Please upload your data.json file...


Saving data.json to data.json
✅ data.json saved to data folder
📊 Total Q&A pairs loaded: 1097
🔍 Sample data: {'prompt': '<GALAXY>What is Galaxy Organisation?', 'completion': 'Galaxy Organisation is an IT-based non-profit established in 2015, dedicated to driving economic growth through educational programs and digital empowerment initiatives for children, youth, women, and marginalized communities. It focuses on sustainable development through technology access, skills training, and community-driven programs.'}


In [5]:
# Cell 4: Create Data Processor Utility
# Create data_processor.py
data_processor_code = '''
import json
import re
from typing import List, Dict, Tuple

class DataProcessor:
    def __init__(self):
        self.galaxy_keywords = [
            'galaxy', 'galaxy organisation', 'galaxy organization',
            'computers for schools', 'cfsp', 'women empowerment',
            'kids empowerment', 'electronics recycling', 'amman jordan'
        ]
        self.alibaba_keywords = [
            'alibaba', 'alibaba cloud', 'alibaba academy', 'aca', 'acp', 'ace',
            'cloud computing', 'maxcompute', 'dataworks', 'ecs', 'pai',
            'double 11', 'singles day', 'jack ma'
        ]

    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        # Remove extra whitespace
        text = ' '.join(text.split())
        # Fix common typos
        text = text.replace('orgaanisation', 'organisation')
        text = text.replace('GENERAL', 'Alibaba')
        return text.strip()

    def categorize_question(self, question: str) -> str:
        """Determine if question is about Galaxy, Alibaba, or General"""
        question_lower = question.lower()

        galaxy_score = sum(1 for kw in self.galaxy_keywords if kw in question_lower)
        alibaba_score = sum(1 for kw in self.alibaba_keywords if kw in question_lower)

        if galaxy_score > alibaba_score:
            return "GALAXY"
        elif alibaba_score > galaxy_score:
            return "ALIBABA"
        else:
            return "GENERAL"

    def process_qa_pairs(self, qa_data: List[Dict]) -> Tuple[List[Dict], Dict]:
        """Process and categorize Q&A pairs"""
        processed_data = []
        statistics = {"GALAXY": 0, "ALIBABA": 0, "GENERAL": 0}

        for item in qa_data:
            prompt = self.clean_text(item.get('prompt', ''))
            completion = self.clean_text(item.get('completion', ''))

            # Remove category tags from prompt
            prompt = re.sub(r'<(GALAXY|ALIBABA|GENERAL)>', '', prompt).strip()

            category = self.categorize_question(prompt)
            statistics[category] += 1

            processed_data.append({
                'prompt': prompt,
                'completion': completion,
                'category': category,
                'text': f"Human: {prompt}\\nAssistant: {completion}"
            })

        return processed_data, statistics

    def create_training_format(self, processed_data: List[Dict]) -> List[str]:
        """Create training format for the model"""
        training_texts = []

        for item in processed_data:
            # Format: <|system|>You are a helpful assistant...<|user|>question<|assistant|>answer
            text = f"""<|system|>
You are a specialized assistant for Galaxy Organisation and Alibaba.
Galaxy Organisation is an IT-based non-profit in Jordan focused on digital empowerment.
Alibaba is a global technology company with cloud computing and e-commerce services.
Answer questions accurately based only on the information you've been trained on.
<|user|>
{item['prompt']}
<|assistant|>
{item['completion']}"""
            training_texts.append(text)

        return training_texts
'''

with open('utils/data_processor.py', 'w', encoding='utf-8') as f:
    f.write(data_processor_code)

print("✅ Data processor utility created!")

✅ Data processor utility created!


In [6]:
# Cell 5: Create Similarity Checker Utility
# Create similarity_checker.py
similarity_code = '''
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
import torch

class SimilarityChecker:
    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.embeddings_cache = {}

    def encode_questions(self, questions: List[str]) -> np.ndarray:
        """Encode questions to embeddings"""
        # Use cache for efficiency
        uncached = [q for q in questions if q not in self.embeddings_cache]
        if uncached:
            new_embeddings = self.model.encode(uncached, convert_to_tensor=True)
            for q, emb in zip(uncached, new_embeddings):
                self.embeddings_cache[q] = emb

        embeddings = [self.embeddings_cache[q] for q in questions]
        return torch.stack(embeddings) if embeddings else torch.tensor([])

    def find_similar_qa(self, user_question: str, qa_database: List[dict],
                       top_k: int = 3, threshold: float = 0.75) -> List[Tuple[dict, float]]:
        """Find most similar Q&A pairs"""
        if not qa_database:
            return []

        # Encode user question
        user_embedding = self.model.encode(user_question, convert_to_tensor=True)

        # Encode all database questions
        db_questions = [item['prompt'] for item in qa_database]
        db_embeddings = self.encode_questions(db_questions)

        # Calculate similarities
        similarities = torch.nn.functional.cosine_similarity(
            user_embedding.unsqueeze(0), db_embeddings
        )

        # Get top matches above threshold
        top_indices = torch.argsort(similarities, descending=True)[:top_k]
        results = []

        for idx in top_indices:
            score = similarities[idx].item()
            if score >= threshold:
                results.append((qa_database[idx], score))

        return results

    def is_relevant_question(self, question: str, keywords: List[str]) -> bool:
        """Check if question is relevant to our domains"""
        question_lower = question.lower()

        # Direct keyword matching
        if any(keyword in question_lower for keyword in keywords):
            return True

        # Check for related terms
        galaxy_related = ['jordan', 'amman', 'nonprofit', 'recycling', 'empowerment']
        alibaba_related = ['cloud', 'computing', 'certification', 'training', 'china']

        all_related = galaxy_related + alibaba_related
        return any(term in question_lower for term in all_related)
'''

with open('utils/similarity_checker.py', 'w', encoding='utf-8') as f:
    f.write(similarity_code)

print("✅ Similarity checker utility created!")

✅ Similarity checker utility created!


In [7]:
# Cell 6: Create Response Generator Utility
# Create response_generator.py
response_generator_code = '''
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict, Optional

class ResponseGenerator:
    def __init__(self, model_path: str, device: str = None):
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Loading model on {self.device}...")

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto" if self.device == "cuda" else None
        )

        # Set padding token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("✅ Model loaded successfully!")

    def generate_response(self, prompt: str, similar_qa: List[Dict] = None,
                         max_length: int = 256, temperature: float = 0.7) -> str:
        """Generate response using the model with context from similar Q&As"""

        # Build context from similar Q&As
        context = ""
        if similar_qa:
            context = "Here are some relevant examples:\\n"
            for qa, score in similar_qa[:2]:  # Use top 2 most similar
                context += f"Q: {qa['prompt']}\\nA: {qa['completion']}\\n\\n"

        # Format the prompt
        formatted_prompt = f"""<|system|>
You are a specialized assistant for Galaxy Organisation and Alibaba.
Galaxy Organisation is an IT-based non-profit in Jordan focused on digital empowerment.
Alibaba is a global technology company with cloud computing and e-commerce services.
{context}
<|user|>
{prompt}
<|assistant|>"""

        # Tokenize
        inputs = self.tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode response
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the assistant's response
        if "<|assistant|>" in response:
            response = response.split("<|assistant|>")[-1].strip()

        return response

    def batch_generate(self, prompts: List[str], **kwargs) -> List[str]:
        """Generate responses for multiple prompts"""
        responses = []
        for prompt in prompts:
            response = self.generate_response(prompt, **kwargs)
            responses.append(response)
        return responses
'''

with open('utils/response_generator.py', 'w', encoding='utf-8') as f:
    f.write(response_generator_code)

# Create __init__.py
with open('utils/__init__.py', 'w') as f:
    f.write('# Utils package')

print("✅ Response generator utility created!")

✅ Response generator utility created!


In [8]:
# Cell 7: Process and Prepare Data
# Import our utilities
import sys
sys.path.append('/content/drive/MyDrive/AI-Chatbot')

from utils.data_processor import DataProcessor

# Initialize processor
processor = DataProcessor()

# Load data
with open('data/data.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Process data
processed_data, stats = processor.process_qa_pairs(raw_data)

print("📊 Data Statistics:")
print(f"Galaxy Organisation Q&As: {stats['GALAXY']}")
print(f"Alibaba Q&As: {stats['ALIBABA']}")
print(f"General Q&As: {stats['GENERAL']}")
print(f"Total Q&As: {len(processed_data)}")

# Create training format
training_texts = processor.create_training_format(processed_data)

# Save processed data
with open('data/processed_data.json', 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=2)

print("✅ Data processed and saved!")

📊 Data Statistics:
Galaxy Organisation Q&As: 147
Alibaba Q&As: 872
General Q&As: 78
Total Q&As: 1097
✅ Data processed and saved!


In [9]:
# Cell 8: Fine-tune Model Using QLoRA (Efficient Training)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import torch

# Model configuration
MODEL_NAME = "microsoft/phi-2"  # Efficient 2.7B parameter model
OUTPUT_DIR = "./models/galaxy_alibaba_chatbot"

# Configure 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare model for training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=32,  # rank
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add LoRA adapters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Prepare dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Create dataset
train_dataset = Dataset.from_dict({"text": training_texts})
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Split into train and eval
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

trainable params: 47,185,920 || all params: 2,826,869,760 || trainable%: 1.6692


Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

Training samples: 987
Evaluation samples: 110


In [13]:
# Cell 9: Configure and Start Training
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    logging_steps=25,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to="none",
    save_total_limit=3,
    push_to_hub=False,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    bf16=True,
    # tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Start training
print("🚀 Starting training...")
trainer.train()

# Save the final model
print("💾 Saving model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ Training completed successfully!")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🚀 Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
100,0.2965,0.284284


💾 Saving model...
✅ Training completed successfully!


In [14]:
# Cell 10: Test the Model
from utils.similarity_checker import SimilarityChecker
from utils.response_generator import ResponseGenerator

# Initialize components
similarity_checker = SimilarityChecker()
response_generator = ResponseGenerator(OUTPUT_DIR)

# Load Q&A database
with open('data/processed_data.json', 'r') as f:
    qa_database = json.load(f)

# Test function
def test_chatbot(question):
    print(f"\\n❓ Question: {question}")

    # Find similar Q&As
    similar_qa = similarity_checker.find_similar_qa(question, qa_database)

    if similar_qa:
        print(f"✅ Found {len(similar_qa)} similar Q&As")
        # Use the most similar answer if confidence is very high
        if similar_qa[0][1] > 0.95:
            print(f"💬 Answer: {similar_qa[0][0]['completion']}")
            return similar_qa[0][0]['completion']

    # Generate response
    response = response_generator.generate_response(question, similar_qa)
    print(f"💬 Answer: {response}")
    return response

# Test questions
test_questions = [
    "What is Galaxy Organisation?",
    "Tell me about Alibaba Cloud certifications",
    "Where is Galaxy Organisation located?",
    "What is the ACA certification?",
    "Does Galaxy accept electronic donations?",
    "When was Alibaba founded?",
    "Hello",
    "What is the weather?"  # Irrelevant question
]

for q in test_questions:
    test_chatbot(q)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading model on cuda...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model loaded successfully!
\n❓ Question: What is Galaxy Organisation?
✅ Found 3 similar Q&As
💬 Answer: Galaxy Organisation is an IT-based non-profit established in 2015, dedicated to driving economic growth through educational programs and digital empowerment initiatives for children, youth, women, and marginalized communities. It focuses on sustainable development through technology access, skills training, and community-driven programs.
\n❓ Question: Tell me about Alibaba Cloud certifications
✅ Found 3 similar Q&As
💬 Answer: We offer over 250 certifications that focus on practical skills in areas like cloud migration and serverless computing. Plus, they're short and easy to complete. Learn more at <a href='https://edu.alibabacloud.com'>Alibaba Academy</a>.
\n❓ Question: Where is Galaxy Organisation located?
✅ Found 3 similar Q&As
💬 Answer: Galaxy's headquarters is in Amman, Jordan at: Khair Al-Din Al-Ma'ani Street, P.O. Box 962420, Amman 11196. Mobile: +962782979999. They serve com

In [15]:
# Cell 11: Create Flask App
# Create app.py
app_code = '''
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
import json
import os
import sys

# Add project path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils.similarity_checker import SimilarityChecker
from utils.response_generator import ResponseGenerator

app = Flask(__name__)
CORS(app)

# Configuration
MODEL_PATH = './models/galaxy_alibaba_chatbot'
QA_DATABASE_PATH = './data/processed_data.json'

# Initialize components
print("Initializing chatbot components...")
similarity_checker = SimilarityChecker()
response_generator = ResponseGenerator(MODEL_PATH)

# Load Q&A database
with open(QA_DATABASE_PATH, 'r', encoding='utf-8') as f:
    qa_database = json.load(f)

print(f"✅ Loaded {len(qa_database)} Q&A pairs")

# Keywords for relevance checking
RELEVANT_KEYWORDS = [
    'galaxy', 'organisation', 'organization', 'alibaba', 'cloud',
    'certification', 'jordan', 'amman', 'recycling', 'empowerment',
    'training', 'aca', 'acp', 'ace', 'nonprofit', 'women', 'kids'
]

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/chat', methods=['POST'])
def chat():
    try:
        data = request.json
        user_message = data.get('message', '').strip()

        if not user_message:
            return jsonify({
                'response': 'Please ask a question about Galaxy Organisation or Alibaba.',
                'status': 'error'
            })

        # Check if question is relevant
        if not similarity_checker.is_relevant_question(user_message.lower(), RELEVANT_KEYWORDS):
            return jsonify({
                'response': 'I specialize in questions about Galaxy Organisation and Alibaba. Please ask about their programs, services, or certifications.',
                'status': 'irrelevant'
            })

        # Find similar Q&As
        similar_qa = similarity_checker.find_similar_qa(
            user_message, qa_database, top_k=3, threshold=0.7
        )

        # If very high similarity, use the stored answer
        if similar_qa and similar_qa[0][1] > 0.9:
            response = similar_qa[0][0]['completion']
            status = 'exact_match'
        else:
            # Generate response
            response = response_generator.generate_response(user_message, similar_qa)
            status = 'generated'

        return jsonify({
            'response': response,
            'status': status,
            'confidence': similar_qa[0][1] if similar_qa else 0.0
        })

    except Exception as e:
        print(f"Error: {str(e)}")
        return jsonify({
            'response': 'Sorry, I encountered an error. Please try again.',
            'status': 'error',
            'error': str(e)
        })

@app.route('/health', methods=['GET'])
def health():
    return jsonify({'status': 'healthy', 'model_loaded': True})

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
'''

with open('app.py', 'w') as f:
    f.write(app_code)

print("✅ Flask app created!")

✅ Flask app created!


In [16]:
# Cell 12: Create HTML Template
# Create index.html
html_code = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Galaxy & Alibaba Assistant</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
    <div class="container">
        <header>
            <h1>Galaxy & Alibaba Assistant</h1>
            <p>Ask me about Galaxy Organisation or Alibaba Cloud services!</p>
        </header>

        <div class="chat-container">
            <div id="chat-messages" class="chat-messages">
                <div class="message bot-message">
                    <div class="message-content">
                        Hello! I'm your specialized assistant for Galaxy Organisation and Alibaba.
                        I can help you with:
                        <ul>
                            <li>Galaxy Organisation's programs and services</li>
                            <li>Alibaba Cloud certifications (ACA, ACP, ACE)</li>
                            <li>Training opportunities and courses</li>
                            <li>Contact information and locations</li>
                        </ul>
                        How can I assist you today?
                    </div>
                </div>
            </div>

            <div class="input-container">
                <input
                    type="text"
                    id="user-input"
                    placeholder="Type your question here..."
                    autofocus
                >
                <button id="send-button">Send</button>
            </div>
        </div>

        <footer>
            <p>Powered by AI | Galaxy Organisation & Alibaba Cloud</p>
        </footer>
    </div>

    <script src="{{ url_for('static', filename='script.js') }}"></script>
</body>
</html>'''

with open('templates/index.html', 'w') as f:
    f.write(html_code)

print("✅ HTML template created!")

✅ HTML template created!


In [18]:
# Cell 13: Create Complete CSS Styles
css_code = '''/* Modern and Professional Styling for Galaxy & Alibaba Chatbot */
* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

body {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    min-height: 100vh;
    display: flex;
    align-items: center;
    justify-content: center;
    padding: 20px;
}

.container {
    background: white;
    border-radius: 20px;
    box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
    width: 100%;
    max-width: 800px;
    height: 600px;
    display: flex;
    flex-direction: column;
    overflow: hidden;
}

header {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    text-align: center;
}

header h1 {
    font-size: 28px;
    margin-bottom: 5px;
}

header p {
    font-size: 16px;
    opacity: 0.9;
}

.chat-container {
    flex: 1;
    display: flex;
    flex-direction: column;
}

.chat-messages {
    flex: 1;
    overflow-y: auto;
    padding: 20px;
    background: #f8f9fa;
    scroll-behavior: smooth;
}

.message {
    margin-bottom: 15px;
    display: flex;
    animation: fadeIn 0.3s ease-out;
}

.user-message {
    justify-content: flex-end;
}

.bot-message {
    justify-content: flex-start;
}

.message-content {
    max-width: 70%;
    padding: 12px 16px;
    border-radius: 18px;
    word-wrap: break-word;
}

.user-message .message-content {
    background: #667eea;
    color: white;
}

.bot-message .message-content {
    background: white;
    color: #333;
    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
    transition: box-shadow 0.3s ease;
}

.bot-message .message-content:hover {
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.15);
}

.message-content ul {
    margin: 10px 0 0 20px;
}

.message-content ul li {
    margin: 5px 0;
    line-height: 1.5;
}

.message-content a {
    color: #667eea;
    text-decoration: none;
    font-weight: 500;
}

.message-content a:hover {
    text-decoration: underline;
    color: #764ba2;
}

.input-container {
    padding: 20px;
    background: white;
    border-top: 1px solid #e0e0e0;
    display: flex;
    gap: 10px;
}

#user-input {
    flex: 1;
    padding: 12px 20px;
    border: 2px solid #e0e0e0;
    border-radius: 25px;
    font-size: 16px;
    outline: none;
    transition: border-color 0.3s;
}

#user-input:focus {
    border-color: #667eea;
}

#send-button {
    padding: 12px 30px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    border-radius: 25px;
    font-size: 16px;
    font-weight: 600;
    cursor: pointer;
    transition: transform 0.2s, box-shadow 0.2s;
}

#send-button:hover {
    transform: scale(1.05);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}

#send-button:active {
    transform: scale(0.95);
}

#send-button:disabled {
    opacity: 0.6;
    cursor: not-allowed;
    transform: scale(1);
}

footer {
    padding: 15px;
    text-align: center;
    color: #666;
    font-size: 14px;
    border-top: 1px solid #e0e0e0;
    background: #fafafa;
}

/* Loading animation */
.typing-indicator {
    display: inline-block;
}

.typing-indicator span {
    display: inline-block;
    width: 8px;
    height: 8px;
    border-radius: 50%;
    background: #667eea;
    animation: typing 1.4s infinite;
    margin: 0 2px;
}

.typing-indicator span:nth-child(2) {
    animation-delay: 0.2s;
}

.typing-indicator span:nth-child(3) {
    animation-delay: 0.4s;
}

@keyframes typing {
    0%, 60%, 100% {
        transform: translateY(0);
    }
    30% {
        transform: translateY(-10px);
    }
}

/* Scrollbar styling */
.chat-messages::-webkit-scrollbar {
    width: 6px;
}

.chat-messages::-webkit-scrollbar-track {
    background: #f1f1f1;
}

.chat-messages::-webkit-scrollbar-thumb {
    background: #888;
    border-radius: 3px;
}

.chat-messages::-webkit-scrollbar-thumb:hover {
    background: #555;
}

/* Responsive design */
@media (max-width: 768px) {
    body {
        padding: 0;
    }

    .container {
        border-radius: 0;
        height: 100vh;
        max-width: 100%;
    }

    header h1 {
        font-size: 24px;
    }

    header p {
        font-size: 14px;
    }

    .message-content {
        max-width: 85%;
    }

    #send-button {
        padding: 12px 20px;
    }
}

/* Error and success messages */
.error-message {
    background: #fee;
    border-left: 4px solid #f44336;
    color: #c00;
    padding: 10px;
    margin: 10px 0;
    border-radius: 4px;
}

.success-message {
    background: #efe;
    border-left: 4px solid #4caf50;
    color: #080;
    padding: 10px;
    margin: 10px 0;
    border-radius: 4px;
}

/* Confidence indicator */
.confidence-indicator {
    font-size: 12px;
    color: #999;
    margin-top: 5px;
}

.high-confidence {
    color: #4caf50;
}

.medium-confidence {
    color: #ff9800;
}

.low-confidence {
    color: #f44336;
}

/* Loading state */
.message.loading {
    opacity: 0.6;
}

/* Code styling */
.message-content code {
    background: #f4f4f4;
    padding: 2px 4px;
    border-radius: 3px;
    font-family: 'Courier New', monospace;
    font-size: 14px;
}

.message-content pre {
    background: #f4f4f4;
    padding: 10px;
    border-radius: 5px;
    overflow-x: auto;
    margin: 10px 0;
}

/* Text formatting */
.message-content strong {
    font-weight: 600;
    color: #333;
}

.message-content em {
    font-style: italic;
}

/* Welcome message */
.welcome-message {
    background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
    border: none;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}

/* Fade in animation */
@keyframes fadeIn {
    from {
        opacity: 0;
        transform: translateY(10px);
    }
    to {
        opacity: 1;
        transform: translateY(0);
    }
}

/* Selection color */
::selection {
    background: #667eea;
    color: white;
}

::-moz-selection {
    background: #667eea;
    color: white;
}
'''

# Save the CSS file
with open('static/style.css', 'w') as f:
    f.write(css_code)

print("✅ Complete CSS styles created successfully!")
print("📁 Saved to: static/style.css")

✅ Complete CSS styles created successfully!
📁 Saved to: static/style.css


In [39]:
# Cell 18: Install ngrok and required packages
!pip install pyngrok flask-ngrok



In [31]:
# # Cell 19: Run the Chatbot with Ngrok
# import os
# import sys
# from pyngrok import ngrok
# import threading
# import time
# import subprocess

# # IMPORTANT: Get your free authtoken from https://dashboard.ngrok.com/auth
# # Replace 'YOUR_AUTHTOKEN_HERE' with your actual token
# NGROK_AUTH_TOKEN = "2ymfgcS9L6VEoH0RKOhgizsNA0a_2Hj4hMbpHGFmBhyWDuqad"  # <-- CHANGE THIS!

# # Set your authtoken
# ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# # Change to project directory
# os.chdir('/content/drive/MyDrive/AI-Chatbot')

# # Function to run Flask app
# def run_flask_app():
#     """Run the Flask application"""
#     print("🚀 Starting Flask app...")
#     subprocess.run([sys.executable, "app.py"])

# # Start Flask in a separate thread
# flask_thread = threading.Thread(target=run_flask_app, daemon=True)
# flask_thread.start()

# # Wait for Flask to start
# print("⏳ Waiting for Flask app to initialize...")
# time.sleep(10)

# # Create ngrok tunnel
# try:
#     # Open a tunnel on port 5000
#     public_url = ngrok.connect(5000, "http")
#     print("\n" + "="*50)
#     print("✅ CHATBOT IS LIVE!")
#     print("="*50)
#     print(f"\n🌐 Public URL: {public_url}")
#     print(f"\n📱 Share this link to access your chatbot from anywhere!")
#     print(f"\n⚠️  Note: This URL will expire when you stop this cell")
#     print("\n💡 Tips:")
#     print("- Click the URL to open in a new tab")
#     print("- Share with others to test")
#     print("- Keep this cell running to keep the chatbot active")
#     print("\n" + "="*50)

#     # Display as clickable link
#     from IPython.display import HTML, display
#     display(HTML(f'<h2>Your Chatbot is Live! 🎉</h2><p>Click here to open: <a href="{public_url}" target="_blank" style="color: #667eea; font-size: 18px;">{public_url}</a></p>'))

#     # Keep the tunnel open
#     print("\n⏰ Press 'Stop' button or interrupt kernel to shut down...")
#     while True:
#         time.sleep(1)

# except Exception as e:
#     print(f"❌ Error: {str(e)}")
#     print("\n🔧 Troubleshooting:")
#     print("1. Make sure you replaced 'YOUR_AUTHTOKEN_HERE' with your ngrok token")
#     print("2. Get your free token from: https://dashboard.ngrok.com/auth")
#     print("3. Check if Flask app started correctly")

# finally:
#     # Cleanup
#     ngrok.kill()

🚀 Starting Flask app...⏳ Waiting for Flask app to initialize...


✅ CHATBOT IS LIVE!

🌐 Public URL: NgrokTunnel: "https://cd20-34-82-56-246.ngrok-free.app" -> "http://localhost:5000"

📱 Share this link to access your chatbot from anywhere!

⚠️  Note: This URL will expire when you stop this cell

💡 Tips:
- Click the URL to open in a new tab
- Share with others to test
- Keep this cell running to keep the chatbot active




⏰ Press 'Stop' button or interrupt kernel to shut down...




KeyboardInterrupt: 