In [1]:
import sys
from google.colab import drive
import os

# --- Persistent storage setup (Google Drive) ---
# Mount Drive inside the Colab filesystem.
drive.mount('/content/drive')

# Drive folder used as an extra package location; the folder name
# 'my_colab_packages' is arbitrary and can be changed.
custom_package_path = '/content/drive/MyDrive/my_colab_packages'

# Drive folder where large language models are stored persistently.
model_storage_path = '/content/drive/MyDrive/colab_models'

# Put the package folder at the front of the import search path, skipping
# the insert when it is already listed (e.g. when this cell is re-run).
if sys.path.count(custom_package_path) == 0:
    sys.path.insert(0, custom_package_path)

print(f"Custom package path added to sys.path: {custom_package_path}")
print(f"Current sys.path: {sys.path}")

Mounted at /content/drive
Custom package path added to sys.path: /content/drive/MyDrive/my_colab_packages
Current sys.path: ['/content/drive/MyDrive/my_colab_packages', '/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.12/dist-packages/IPython/extensions', '/root/.ipython']


In [2]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

In [6]:
# Load TinyLlama (from the Drive model folder, downloading it there on first
# use) and wrap it as a LangChain chat model bound to the name `model`.
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
local_model_dir = os.path.join(model_storage_path, model_id.split('/')[-1])

if os.path.isdir(local_model_dir):
    # A copy already exists under model_storage_path: load it from there.
    tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
    base_model = AutoModelForCausalLM.from_pretrained(local_model_dir)
else:
    # No local copy yet: fetch by model id and save it under
    # model_storage_path so a fresh "Restart & Run All" does not depend on
    # a separate download cell having been run earlier.
    print(f"{local_model_dir} not found; downloading {model_id} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    base_model = AutoModelForCausalLM.from_pretrained(model_id)
    tokenizer.save_pretrained(local_model_dir)
    base_model.save_pretrained(local_model_dir)
print("Loading complete.")

# Create the HuggingFace text-generation pipeline from the loaded model and tokenizer.
hf_pipeline = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    do_sample=True,    # temperature only takes effect in sampling mode
    temperature=0.1,   # Reduced for more direct answers
    max_new_tokens=30  # Reduced to encourage shorter, exact answers
)

# `model` is the chat wrapper used by later cells; the raw causal LM is kept
# under `base_model` so one name never refers to two different kinds of object.
llm = HuggingFacePipeline(pipeline=hf_pipeline)
model = ChatHuggingFace(llm=llm)

Device set to use cuda:0


Loading complete.


In [7]:

import gradio as gr



def _clean_model_output(raw_text: str) -> str:
    """Return only the assistant's reply from raw chat-model text.

    Keeps the text after the last '<|assistant|>' tag (or the whole text when
    the tag is absent), then drops the first '</s>' and everything after it.
    Surrounding whitespace is stripped.
    """
    # rpartition yields ('', '', raw_text) when the tag is missing, so index 2
    # is always the part we want.
    reply = raw_text.rpartition('<|assistant|>')[2].strip()
    # partition yields the whole string at index 0 when '</s>' is missing.
    return reply.partition('</s>')[0].strip()


# Function to generate a response from the model
def generate_response(prompt_txt: str) -> str:
    """Generate a cleaned chatbot answer for ``prompt_txt``.

    Args:
        prompt_txt: The user's question as plain text.

    Returns:
        The model's reply with chat-template tags removed, or an empty string
        when ``prompt_txt`` is empty or whitespace-only (the model is not
        invoked in that case).
    """
    # An empty submission carries no question; skip the model call.
    if not prompt_txt or not prompt_txt.strip():
        return ""

    raw_response = model.invoke(prompt_txt)
    return _clean_model_output(raw_response.content)

# Create Gradio interface: one text box in, one text box out, backed by
# generate_response.
chat_application = gr.Interface(
    fn=generate_response,
    # NOTE(review): newer Gradio releases appear to rename this option to
    # `flagging_mode` - confirm against the installed version.
    allow_flagging="never",
    inputs=gr.Textbox(label="Input", lines=2, placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Output"),
    title="LLM.ai Chatbot",
    description="Ask any question and the chatbot will try to answer."
)

# Launch the app
# Shut down any app left running by a previous execution of this cell;
# otherwise the fixed port below is still occupied and launch() cannot bind.
gr.close_all()
# share=True is stated explicitly: on a hosted notebook Gradio reports that it
# is required and would otherwise switch it on automatically.
chat_application.launch(server_name="127.0.0.1", server_port=7868, share=True)



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://93b48470fe81bc3fc8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


