In [None]:
!pip install transformers gradio bitsandbytes accelerate

### Inferencing

In [2]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "SkyR/linkedin-8bit-phi4"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_8bit=True,  # or use 8-bit
    torch_dtype=torch.float16
)

model = torch.compile(model)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/423 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/4.45G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

In [3]:
user_input = "Create a light-hearted linkedin post in importance of mental health at workplace."

prompt = f"<|user|>\n{user_input}\n<|assistant|>\n"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

output_ids = model.generate(
    input_ids,
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
response = output_text[len(user_input):].strip()
print(response)


Title: Unplugging for Productivity - A Mental Health Reminder!

Hey Team,

As we navigate our busy workdays, let's not forget that the mind is just as important to productivity and success. In fact, taking care of your mental well-being isn't optional; it's essential.

Just like you'd never skip lunch or ignore an oil change on your car, don't neglect self-care! Remember:

1️⃣ Regular breaks can recharge batteries 🧠💡
2️⃣ Open communication creates support systems 🤝
3️⃣ Exercise releases endorphins 📈

Let's make time today NOT JUST for meetings but also for moments where you laugh, stretch out those legs (or arms), breathe deeply, or simply take pride in what you're accomplishing together!!!

Together we're unstoppable – because when it comes down to it... We're all part of this incredible team working towards something amazing 🌟 

Stay healthy,
[Your Name]

P.S.: Feel free to share any tips you've found helpful with fellow coworkers using #MentalHealthMat


### Final App

In [22]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "SkyR/linkedin-8bit-phi4"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_8bit=True,  # or use 8-bit
    torch_dtype=torch.float16
)

model = torch.compile(model)

def generate_response(history, user_input):
    # Ensure history is a list of tuples (user_input, response)
    if not isinstance(history, list):
        history = []
    # Filter out any items that are not tuples of length 2
    history = [item for item in history if isinstance(item, tuple) and len(item) == 2]

    user_input = "Create a linked post of around 120 words on this: " + user_input

    prompt = f"<|user|>\n{user_input}\n<|assistant|>\n"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    output_ids = model.generate(
        input_ids,
        max_new_tokens=175,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Assuming the model output starts with the prompt, remove it
    # A more robust approach might involve finding the start of the assistant's response
    if output_text.startswith(user_input):
        response = output_text[len(user_input):].strip()
    else:
        response = output_text.strip()

    # Append the user input and response as a tuple to history
    history.append((user_input, response))

    return history

# Function to clear the chat
def clear_chat():
    return [], "" # Return empty list for history and empty string for textbox


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(label="Ask me to generate a LinkedIn post...")
    submit_btn = gr.Button("Submit")
    clear_btn = gr.Button("Clear") # Added clear button

    # Link components and events
    submit_btn.click(
        generate_response,
        inputs=[chatbot, msg],
        outputs=[chatbot],
        queue=False
    )
    msg.submit(
        generate_response,
        inputs=[chatbot, msg],
        outputs=[chatbot],
        queue=False
    )
    clear_btn.click(
        clear_chat,
        inputs=[],
        outputs=[chatbot, msg],
        queue=False
    )


demo.launch(share=True, debug=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  chatbot = gr.Chatbot(height=400)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://0c4d6b816bc1b74012.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://0c4d6b816bc1b74012.gradio.live


