In [None]:
!pip install torch transformers accelerate bitsandbytes numpy scipy

In [None]:
import transformers
import torch
import numpy as np
import os

# Checkpoint on the Hugging Face Hub that every later cell works with.
model_id = "thrishala/mental_health_chatbot"

# Access token for the Hub, read from the HF_TOKEN environment variable
# (None when the variable is unset). Export it before starting the kernel
# rather than pasting a token into the notebook.
hf_auth = os.getenv("HF_TOKEN")

# Device that input tensors are moved to: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🔹 Using device: {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=hf_auth,
)

# device_map="auto" leaves the placement of the weights to the library.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_auth,
    device_map="auto",
)

# The notebook only runs inference, so switch the module to evaluation mode.
model.eval()
print("✅ Model & Tokenizer Loaded Successfully!")

In [None]:
def extract_embedding(text):
    """Return the input token embeddings for ``text``.

    The text is tokenized with the notebook-level ``tokenizer`` and the
    resulting ids are looked up in the model's input embedding layer. The
    vectors therefore live in the space that ``model.generate`` expects for
    its ``inputs_embeds`` argument, so they can be perturbed and fed back in.

    Args:
        text: Prompt string to embed.

    Returns:
        The embedding tensor with the leading batch dimension squeezed out
        (for a single string: one row per token).
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        # Use the embedding table, not outputs.hidden_states[-1]: the last
        # hidden state is the output of the final transformer layer, and
        # passing it back as `inputs_embeds` hands the model vectors from the
        # wrong representation space.
        embeddings = model.get_input_embeddings()(inputs["input_ids"])

    return embeddings.squeeze(0)

# Smoke-test the extractor on a sample user prompt; `test_text` and
# `embeddings` are reused by the later cells.
test_text = "I feel anxious and stressed all the time. What should I do?"
embeddings = extract_embedding(text=test_text)

print("🔹 Embedding Shape:", embeddings.shape)

In [None]:
import torch.nn.functional as F

def perturb_embedding(embedding, epsilon=0.05):
    """Return ``embedding`` plus Gaussian noise scaled by ``epsilon``.

    The noise is drawn with ``torch.randn_like``, so it has the same shape,
    dtype and device as ``embedding``. The input tensor is not modified.

    Args:
        embedding: Tensor to perturb.
        epsilon: Multiplier applied to the standard-normal noise.

    Returns:
        A new tensor holding ``embedding`` with the scaled noise added.
    """
    scaled_noise = epsilon * torch.randn_like(embedding)
    return embedding + scaled_noise

# ✅ Test by perturbing extracted embeddings
# Seed PyTorch's global RNG first so the same noise is drawn every time this
# cell is executed on the same device; without it each run perturbs the
# embeddings differently and the results cannot be reproduced.
PERTURBATION_SEED = 0
torch.manual_seed(PERTURBATION_SEED)
perturbed_embeddings = perturb_embedding(embeddings)
print("✅ Perturbed Embeddings Created!")

In [None]:
def generate_with_embeddings(perturbed_embedding, max_new_tokens=50, do_sample=False, temperature=0.8):
    """Generate text from precomputed (possibly perturbed) input embeddings.

    Args:
        perturbed_embedding: Float tensor passed to ``model.generate`` as
            ``inputs_embeds``. A 2-D tensor is treated as a single unbatched
            sequence and gets a leading batch dimension; a tensor that already
            has a batch dimension is used as is.
        max_new_tokens: Maximum number of tokens to generate.
        do_sample: When False (default) decoding is deterministic, so a change
            in the output can be attributed to the perturbation rather than to
            sampling randomness. Set to True to sample.
        temperature: Sampling temperature. Only forwarded when ``do_sample``
            is True, because it has no effect on greedy decoding.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Add the batch dimension only when it is missing.
    if perturbed_embedding.dim() == 2:
        perturbed_embedding = perturbed_embedding.unsqueeze(0)
    perturbed_embedding = perturbed_embedding.to(device)

    # State the decoding mode explicitly instead of passing a temperature
    # that is silently ignored when sampling is off.
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        generation_kwargs["temperature"] = temperature

    with torch.no_grad():
        outputs = model.generate(
            inputs_embeds=perturbed_embedding,
            **generation_kwargs,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Feed the noisy embeddings to the model and show the text it produces.
response = generate_with_embeddings(perturbed_embedding=perturbed_embeddings)
print("⚠️ Adversarial Response:\n", response)

In [None]:
# Baseline: generate from the unperturbed prompt.
# - The tokenizer output must be unpacked into keyword arguments
#   (input_ids / attention_mask); generate() does not accept the encoding
#   object as its positional tensor input.
# - generate() returns a plain tensor of token ids unless
#   return_dict_in_generate=True is requested, so there is no "sequences" key.
# Greedy decoding with the same 50-new-token budget used for the adversarial
# generation keeps the two outputs comparable.
original_inputs = tokenizer(test_text, return_tensors="pt").to(device)
with torch.no_grad():
    original_output_ids = model.generate(
        **original_inputs,
        max_new_tokens=50,
        do_sample=False,
    )

# For an input_ids prompt the returned ids start with the prompt itself; drop
# it so only the model's continuation is shown.
# NOTE(review): generation from inputs_embeds is expected to return only the
# new tokens — confirm for the installed transformers version.
prompt_length = original_inputs["input_ids"].shape[1]
original_response = tokenizer.decode(
    original_output_ids[0][prompt_length:],
    skip_special_tokens=True,
)
adversarial_response = generate_with_embeddings(perturbed_embeddings)

print("📝 Original Response:\n", original_response)
print("\n⚠️ Adversarial Response:\n", adversarial_response)