In [None]:
!pip install ngrok

In [None]:
import ngrok

import os

# Set the authtoken.
# Prefer the NGROK_AUTHTOKEN environment variable so the secret does not have
# to be saved inside the notebook. If it is unset or empty, fall back to the
# placeholder below, which must be replaced with a real token.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN") or "Put your Ngrok Token Here"
ngrok.set_auth_token(NGROK_TOKEN)

# Start the tunnel. Port 5000 must match the port the Flask server listens on.
public_url = ngrok.connect(5000)
# NOTE(review): ngrok.connect may return a listener object rather than a plain
# string; if this prints an object repr, print its URL accessor instead -- confirm
# against the installed ngrok package.
print(f"Public URL: {public_url}")

Load the model and run the Flask app

In [None]:
import threading
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
from transformers import BitsAndBytesConfig
import torch

# Initialize Flask app
app = Flask(__name__)

# Directory the tokenizer and the LoRA adapter are both loaded from
output_dir = "/content/drive/MyDrive/LlamaResultsSaved"  # Replace with your actual output directory

# Load the tokenizer from the same directory as the adapter
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)

# Quantization settings: 4-bit NF4 weights, float16 compute, no double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False
)

# Load the base model with quantization. device_map="auto" delegates device
# placement to the library, with "./offload" supplied as the offload folder.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your base model if necessary
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    offload_folder="./offload"  # Ensure you have this folder created
)

# Attach the LoRA adapter stored in output_dir to the quantized base model
finetuned_model = PeftModel.from_pretrained(model, output_dir)

# Initialize pipeline for text generation.
# No `device` argument is passed: the model was already placed by
# device_map="auto", and the pipeline refuses (or mishandles) an explicit
# device for a model dispatched that way. It follows the model's own placement.
pipe = pipeline(
    task="text-generation",
    model=finetuned_model,
    tokenizer=tokenizer,
    max_length=500,  # total length budget: prompt tokens plus generated tokens
)

# Flask route for text generation
@app.route('/generate', methods=['POST'])
def generate_response():
    """Generate a model reply for the prompt supplied in a JSON POST body.

    Expects a JSON object of the form ``{"prompt": "<text>"}``. The prompt is
    wrapped in ``[INST] ... [/INST]`` tags before it is handed to the
    text-generation pipeline.

    Returns:
        A JSON response ``{"response": <generated text>}`` on success, or
        ``{"error": "Prompt is required"}`` with HTTP status 400 when the body
        is missing, is not a JSON object, or does not contain a non-empty
        string under ``"prompt"``.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so those requests get the same JSON 400 as a missing prompt.
    data = request.get_json(silent=True)
    # A valid JSON body can still be a list, number or null; only an object
    # can carry a "prompt" key.
    prompt = data.get('prompt', '') if isinstance(data, dict) else ''

    if not prompt or not isinstance(prompt, str):
        return jsonify({"error": "Prompt is required"}), 400

    formatted_prompt = f"[INST] {prompt} [/INST]"
    result = pipe(formatted_prompt)

    # generated_text starts with the text that was fed in; slice it off using
    # the formatted prompt's own length so the offset tracks the template
    # instead of relying on a hard-coded character count.
    response_text = result[0]['generated_text'][len(formatted_prompt):]
    return jsonify({"response": response_text})

# Run the Flask app
def run_flask():
    """Serve the Flask app on port 5000 (blocks until the server stops).

    Debug mode is off because this server is reachable through a public
    tunnel, and debug mode enables an interactive debugger that must not be
    exposed to untrusted clients. The reloader is disabled as well, since
    this function is run from a background thread rather than the main one.
    """
    app.run(port=5000, use_reloader=False, debug=False)

# Start Flask app in a separate thread so this cell returns while the
# server keeps handling requests in the background
flask_thread = threading.Thread(target=run_flask)
flask_thread.start()