In [1]:
'''
use transformers to load model --> AutoModelForCausalLM module to load model and tokenizer

- Apply quantization(e.g int8) to reduce RAM usage and optimize performance

- module 
'''
'''
# Libraries
transformers: To load and use Hugging Face models.
accelerate: For optimization in CPU environments.
bitsandbytes: To enable quantized models (if supported).
datasets: If any datasets are needed for testing.
torch: Install a CPU-compatible version.
'''

'''
pip install transformers accelerate bitsandbytes datasets flask 
pip install torch --index-url https://download.pytorch.org/whl/cpu
'''



'\nuse transformers to load model --> AutoModelForCausalLM module to load model and tokenizer\n\n- Apply quantization(e.g int8) to reduce RAM usage and optimize performance\n\n- module \n'

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from flask import Flask, request, jsonify


In [8]:
# Load model and tokenizer
model_name = "Qwen/Qwen2.5-Math-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    torch_dtype=torch.float32
)


In [13]:
# initialize flask api
app = Flask(__name__)

In [14]:
@app.route("/generate", methods=["POST"])
def generate_response():
    data = request.json
    prompt = data.get("prompt", "")
    if not prompt:
        return jsonify({"error": "Prompt is required"}), 400
    
    # Generate a response
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids) 
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=100,
        temperature=0.7,
        do_sample=True)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return jsonify({"response": response})

In [15]:
# run flask
if __name__ == "__main__":
    app.run(host="192.168.4.31", port=5000) 

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://192.168.4.31:5000
[33mPress CTRL+C to quit[0m
192.168.4.20 - - [26/Jan/2025 16:29:33] "[31m[1mPOST /generate HTTP/1.1[0m" 415 -
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
192.168.4.20 - - [26/Jan/2025 16:32:09] "POST /generate HTTP/1.1" 200 -


In [None]:
'''curl -X POST http://192.168.4.31:5000/generate \
-H "Content-Type: application/json" \
-d '{"prompt": "How can I debug a Python error?"}'
'''

In [None]:
'''
pip install nbconvert

jupyter nbconvert --to script install_deepseek.ipynb

jupyter nbconvert --to notebook --execute install_deepseek.ipynb

jupyter nbconvert --to notebook --execute install_deepseek.ipynb --output executed_install_deepseek.ipynb

ls -d ~/.cache/huggingface/hub/models--* | xargs du -sh
2.9G    /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-Math-1.5B

'''