In [None]:
# KERNEL CHECK AND IMPORTS
import sys
import subprocess
import time
import os
import threading
from pathlib import Path

print("🔧 Kernel Check")
print("=" * 50)
print(f"Python version: {sys.version}")
print(f"Working directory: {os.getcwd()}")

# Verify that every third-party package the notebook relies on is importable.
# Plain import statements are used so the names stay bound at notebook scope.
try:
    import torch
    import transformers
    import gradio as gr
    import streamlit
    import flask
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please run: pip install -r requirements.txt")
else:
    # Read versions from the installed package metadata instead of each module's
    # ``__version__`` attribute: Flask has deprecated ``flask.__version__``, and an
    # AttributeError raised here would not be caught by the ImportError handler.
    from importlib.metadata import PackageNotFoundError, version

    print("✅ All imports successful!")
    for label, module, dist_name in (
        ("PyTorch", torch, "torch"),
        ("Transformers", transformers, "transformers"),
        ("Gradio", gr, "gradio"),
        ("Streamlit", streamlit, "streamlit"),
        ("Flask", flask, "flask"),
    ):
        try:
            installed = version(dist_name)
        except PackageNotFoundError:
            # No distribution metadata found; fall back to the module attribute.
            installed = getattr(module, "__version__", "unknown")
        print(f"   {label}: {installed}")


## 🧠 Real TinyLlama Model Loading

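We'll load TinyLlama in the notebook first to confirm that it works. Each application launched later runs in its own process and loads its own copy of the same model, so all of them behave consistently.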


In [None]:
# STEP 1: LOAD REAL TINYLLAMA MODEL
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

print("🚀 STEP 1: LOAD REAL TINYLLAMA MODEL")
print("=" * 50)

# Checkpoint identifier handed to both ``from_pretrained`` calls below.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print(f"Loading {model_name}...")

# Probe for CUDA once; the answer picks the dtype, the device map and the label
# printed in the summary.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
else:
    load_options = {"torch_dtype": torch.float32, "device_map": None}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device_label = "CUDA" if cuda_ready else "CPU"
print(f"✅ {model_name} loaded successfully!")
print(f"   Device: {device_label}")
print("   Parameters: 1.1B")
print("   Provider: TinyLlama")

def chat_with_tinyllama(message, max_length=200):
    """Generate a single-turn reply from the notebook-level TinyLlama model.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        message: User text inserted between the ``<|user|>`` and
            ``<|assistant|>`` markers of the prompt.
        max_length: Upper bound on the number of newly generated tokens; the
            prompt length is not counted against it.

    Returns:
        The generated reply as a string, with any role markers removed and
        surrounding whitespace stripped.
    """
    prompt = f"<|user|>\n{message}\n<|assistant|>\n"

    # Calling the tokenizer (instead of ``encode``) also returns an attention mask.
    # Passing it explicitly matters here because ``pad_token_id`` is set to the EOS
    # id, so ``generate`` cannot infer the mask reliably on its own.
    encoded = tokenizer(prompt, return_tensors="pt")
    # Freshly tokenized tensors live on the CPU; move them to wherever the model's
    # weights are so generation also works when the model was placed on a GPU.
    encoded = encoded.to(model.device)
    prompt_len = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_new_tokens=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3
        )

    # Decode only the tokens produced after the prompt. This avoids locating the
    # reply by splitting on a marker that the user's own message could contain.
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Drop any role markers the model may have emitted itself.
    return reply.replace("<|user|>", "").replace("<|assistant|>", "").strip()

# Smoke test: send one fixed greeting through the model and show the reply.
print("\n🧪 Testing model...")
test_response = chat_with_tinyllama("Hello! How are you?")
print("🤖 TinyLlama:", test_response)
print("\n✅ Model ready for all applications!")


## 🌐 Launch Applications

Now we'll create and launch each application in separate processes. Each will have its own URL.


In [None]:
# STEP 2: CREATE APPLICATION FILES
print("🛠️ STEP 2: CREATING APPLICATION FILES")
print("=" * 50)

# Source of the standalone Gradio app. It is written to disk and later run in its
# own process, so it loads its own copy of the model.
gradio_code = '''#!/usr/bin/env python3
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def chat_with_tinyllama(message, max_length=200):
    """Return the model's reply to a single user message."""
    prompt = f"<|user|>\\n{message}\\n<|assistant|>\\n"
    # Tokenize with an attention mask and move the tensors to the model's device
    # (the weights are on the GPU when device_map="auto" is used).
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **encoded, max_new_tokens=max_length,
            num_return_sequences=1, temperature=0.7, do_sample=True,
            pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1, no_repeat_ngram_size=3
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return reply.replace("<|user|>", "").replace("<|assistant|>", "").strip()

def gradio_chat(message, history):
    response = chat_with_tinyllama(message)
    return response

# Create interface
with gr.Blocks(title="Real TinyLlama AI Assistant") as demo:
    gr.Markdown("# 🤖 Real TinyLlama On-Device AI Assistant")
    gr.Markdown("**Model**: TinyLlama/TinyLlama-1.1B-Chat-v1.0 | **Parameters**: 1.1B | **Provider**: TinyLlama")
    
    with gr.Tab("💬 Chat"):
        chatbot = gr.Chatbot(label="Real AI Conversation", height=400, type="messages")
        msg = gr.Textbox(label="Your Message", placeholder="Type your message here...")
        clear = gr.Button("Clear")
        
        def user(user_message, history):
            return "", history + [{"role": "user", "content": user_message}]
        
        def bot(history):
            user_message = history[-1]["content"]
            bot_message = chat_with_tinyllama(user_message)
            history.append({"role": "assistant", "content": bot_message})
            return history
        
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
        clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
'''

# The script contains emoji, so write it as UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open('gradio_app.py', 'w', encoding='utf-8') as f:
    f.write(gradio_code)

# Source of the standalone Streamlit app. It is written to disk and later run in
# its own process; the model is loaded once per server via st.cache_resource.
streamlit_code = '''#!/usr/bin/env python3
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

@st.cache_resource
def load_model():
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

def chat_with_tinyllama(message, model, tokenizer, max_length=200):
    """Return the model's reply to a single user message."""
    prompt = f"<|user|>\\n{message}\\n<|assistant|>\\n"
    # Tokenize with an attention mask and move the tensors to the model's device
    # (the weights are on the GPU when device_map="auto" is used).
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **encoded, max_new_tokens=max_length,
            num_return_sequences=1, temperature=0.7, do_sample=True,
            pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1, no_repeat_ngram_size=3
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return reply.replace("<|user|>", "").replace("<|assistant|>", "").strip()

st.set_page_config(page_title="Real TinyLlama AI Assistant", page_icon="🤖", layout="wide")
st.title("🤖 Real TinyLlama On-Device AI Assistant")
st.markdown("**Model**: TinyLlama/TinyLlama-1.1B-Chat-v1.0 | **Parameters**: 1.1B | **Provider**: TinyLlama")

model, tokenizer = load_model()

if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("What would you like to know?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            response = chat_with_tinyllama(prompt, model, tokenizer)
        st.markdown(response)
    
    st.session_state.messages.append({"role": "assistant", "content": response})
'''

# The script contains emoji, so write it as UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open('streamlit_app.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_code)

# Source of the standalone Flask API. It is written to disk and later run in its
# own process, so it loads its own copy of the model.
flask_code = '''#!/usr/bin/env python3
from flask import Flask, request, jsonify
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def chat_with_tinyllama(message, max_length=200):
    """Return the model's reply to a single user message."""
    prompt = f"<|user|>\\n{message}\\n<|assistant|>\\n"
    # Tokenize with an attention mask and move the tensors to the model's device
    # (the weights are on the GPU when device_map="auto" is used).
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **encoded, max_new_tokens=max_length,
            num_return_sequences=1, temperature=0.7, do_sample=True,
            pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1, no_repeat_ngram_size=3
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return reply.replace("<|user|>", "").replace("<|assistant|>", "").strip()

app = Flask(__name__)

@app.route('/')
def home():
    return jsonify({
        "message": "Real TinyLlama API",
        "status": "online",
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "endpoints": ["/chat - POST: Send messages to the AI", "/health - GET: Check API health"]
    })

@app.route('/health')
def health():
    # Advertised by the index route; reports that the process is up.
    return jsonify({"status": "healthy", "model": model_name})

@app.route('/chat', methods=['POST'])
def chat():
    # silent=True yields None for a missing or malformed JSON body instead of
    # aborting, so the client always gets the JSON error below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'message' not in data:
        return jsonify({"error": "Message is required"}), 400
    
    response = chat_with_tinyllama(data['message'])
    return jsonify({"message": data['message'], "response": response, "model": "TinyLlama"})

if __name__ == '__main__':
    # Debug mode stays off: the server listens on all interfaces and the interactive
    # debugger allows code execution; the reloader would also load the model twice.
    app.run(host="0.0.0.0", port=5001, debug=False)
'''

# Write as UTF-8 explicitly instead of relying on the platform's default encoding.
with open('flask_api.py', 'w', encoding='utf-8') as f:
    f.write(flask_code)

print("✅ Application files created:")
print("   📄 gradio_app.py")
print("   📄 streamlit_app.py")
print("   📄 flask_api.py")


In [None]:
# STEP 3: LAUNCH GRADIO APPLICATION
import socket

print("🌐 STEP 3: LAUNCHING GRADIO APPLICATION")
print("=" * 50)

GRADIO_PORT = 7860
GRADIO_STARTUP_TIMEOUT = 180  # seconds; the child has to load the model before serving

# Stop a previous run of this app. Processes are matched by script name (not by
# port), and only this notebook's script is targeted.
try:
    subprocess.run(['pkill', '-f', 'gradio_app.py'], check=False, capture_output=True)
except OSError:
    # pkill is not available on this platform; continue without the cleanup.
    pass
else:
    time.sleep(2)

# Launch Gradio in the background with the kernel's own interpreter so it sees the
# same installed packages. Output goes to a log file: PIPEs that are never read
# fill up and can block the child. The child keeps its own handle to the log after
# the parent closes this one.
with open('gradio.log', 'w', encoding='utf-8') as gradio_log:
    gradio_process = subprocess.Popen(
        [sys.executable, 'gradio_app.py'],
        stdout=gradio_log,
        stderr=subprocess.STDOUT
    )

print("🚀 Gradio launching...")

# Poll until the server accepts connections, the process exits, or the timeout hits.
gradio_ready = False
gradio_deadline = time.monotonic() + GRADIO_STARTUP_TIMEOUT
while time.monotonic() < gradio_deadline and gradio_process.poll() is None:
    try:
        with socket.create_connection(('127.0.0.1', GRADIO_PORT), timeout=1):
            gradio_ready = True
            break
    except OSError:
        time.sleep(1)

if gradio_ready:
    print("\n✅ GRADIO APPLICATION READY!")
    print("🌐 URL: http://localhost:7860")
    print("💡 Features: Real AI chat with TinyLlama")
    print("🎯 Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    print("\n📝 Note: Gradio is running in the background")
    print("   Click the URL above to test the application")
elif gradio_process.poll() is not None:
    print(f"\n❌ Gradio exited with code {gradio_process.returncode}")
    print("   See gradio.log for details")
else:
    print(f"\n⚠️ Gradio is not accepting connections after {GRADIO_STARTUP_TIMEOUT}s")
    print("   It may still be loading the model; see gradio.log for progress")


In [None]:
# STEP 4: LAUNCH STREAMLIT APPLICATION
import socket

print("🖥️ STEP 4: LAUNCHING STREAMLIT APPLICATION")
print("=" * 50)

STREAMLIT_PORT = 8501
STREAMLIT_STARTUP_TIMEOUT = 180  # seconds to wait for the server to accept connections

# Stop a previous run of this app. Processes are matched by script name (not by
# port), and only this notebook's script is targeted.
try:
    subprocess.run(['pkill', '-f', 'streamlit_app.py'], check=False, capture_output=True)
except OSError:
    # pkill is not available on this platform; continue without the cleanup.
    pass
else:
    time.sleep(2)

# Launch Streamlit in the background through the kernel's own interpreter
# (``python -m streamlit``) so it runs in the same environment. Output goes to a
# log file: PIPEs that are never read fill up and can block the child. The child
# keeps its own handle to the log after the parent closes this one.
with open('streamlit.log', 'w', encoding='utf-8') as streamlit_log:
    streamlit_process = subprocess.Popen(
        [sys.executable, '-m', 'streamlit', 'run', 'streamlit_app.py',
         f'--server.port={STREAMLIT_PORT}', '--server.headless=true'],
        stdout=streamlit_log,
        stderr=subprocess.STDOUT
    )

print("🚀 Streamlit launching...")

# Poll until the server accepts connections, the process exits, or the timeout hits.
streamlit_ready = False
streamlit_deadline = time.monotonic() + STREAMLIT_STARTUP_TIMEOUT
while time.monotonic() < streamlit_deadline and streamlit_process.poll() is None:
    try:
        with socket.create_connection(('127.0.0.1', STREAMLIT_PORT), timeout=1):
            streamlit_ready = True
            break
    except OSError:
        time.sleep(1)

if streamlit_ready:
    print("\n✅ STREAMLIT APPLICATION READY!")
    print("🌐 URL: http://localhost:8501")
    print("💡 Features: Real AI chat with TinyLlama")
    print("🎯 Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    print("\n📝 Note: Streamlit is running in the background")
    print("   Click the URL above to test the application")
elif streamlit_process.poll() is not None:
    print(f"\n❌ Streamlit exited with code {streamlit_process.returncode}")
    print("   See streamlit.log for details")
else:
    print(f"\n⚠️ Streamlit is not accepting connections after {STREAMLIT_STARTUP_TIMEOUT}s")
    print("   See streamlit.log for progress")


In [None]:
# STEP 5: LAUNCH FLASK API
import socket

print("🔌 STEP 5: LAUNCHING FLASK API")
print("=" * 50)

FLASK_PORT = 5001
FLASK_STARTUP_TIMEOUT = 180  # seconds; the child has to load the model before serving

# Stop a previous run of this app. Processes are matched by script name (not by
# port), and only this notebook's script is targeted.
try:
    subprocess.run(['pkill', '-f', 'flask_api.py'], check=False, capture_output=True)
except OSError:
    # pkill is not available on this platform; continue without the cleanup.
    pass
else:
    time.sleep(2)

# Launch Flask in the background with the kernel's own interpreter so it sees the
# same installed packages. Output goes to a log file: PIPEs that are never read
# fill up and can block the child. The child keeps its own handle to the log after
# the parent closes this one.
with open('flask.log', 'w', encoding='utf-8') as flask_log:
    flask_process = subprocess.Popen(
        [sys.executable, 'flask_api.py'],
        stdout=flask_log,
        stderr=subprocess.STDOUT
    )

print("🚀 Flask API launching...")

# Poll until the server accepts connections, the process exits, or the timeout hits.
flask_ready = False
flask_deadline = time.monotonic() + FLASK_STARTUP_TIMEOUT
while time.monotonic() < flask_deadline and flask_process.poll() is None:
    try:
        with socket.create_connection(('127.0.0.1', FLASK_PORT), timeout=1):
            flask_ready = True
            break
    except OSError:
        time.sleep(1)

if flask_ready:
    print("\n✅ FLASK API READY!")
    print("🌐 URL: http://localhost:5001")
    print("💡 Features: REST API with real TinyLlama")
    print("🎯 Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    print("\n📝 Test with:")
    print("   curl -X POST http://localhost:5001/chat \\")
    print("        -H 'Content-Type: application/json' \\")
    print("        -d '{\"message\": \"Hello!\"}'")
    print("\n📝 Note: Flask API is running in the background")
elif flask_process.poll() is not None:
    print(f"\n❌ Flask API exited with code {flask_process.returncode}")
    print("   See flask.log for details")
else:
    print(f"\n⚠️ Flask API is not accepting connections after {FLASK_STARTUP_TIMEOUT}s")
    print("   It may still be loading the model; see flask.log for progress")


## 🎉 Deployment Complete!

All three applications are now running with real TinyLlama AI:

### 🌐 Application URLs:
1. **Gradio Interface**: http://localhost:7860
2. **Streamlit App**: http://localhost:8501  
3. **Flask API**: http://localhost:5001

### 🧠 Model Information:
- **Model**: TinyLlama/TinyLlama-1.1B-Chat-v1.0
- **Parameters**: 1.1B
- **Provider**: TinyLlama
- **Type**: Real AI (no mock models)

### 🎯 What You've Accomplished:
- ✅ Loaded a real TinyLlama model
- ✅ Created a Gradio web interface
- ✅ Built a Streamlit application
- ✅ Implemented a Flask REST API
- ✅ All apps use the same real AI model
- ✅ No mock models anywhere!

### 🧪 Test Your Applications:
1. **Gradio**: Click http://localhost:7860 for interactive chat
2. **Streamlit**: Click http://localhost:8501 for a chat interface with conversation history
3. **Flask API**: Use curl or Postman to test http://localhost:5001/chat

**All applications are running real AI - no simulations!**
