In [21]:
!pip install -q transformers datasets sentence-transformers faiss-cpu streamlit pyngrok tqdm

In [2]:
import os
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st

In [3]:
# Download the "emotion" configuration of cardiffnlp/tweet_eval.
# The returned DatasetDict holds train / test / validation splits; the
# validation split is the one sampled from further down.
dataset = load_dataset("cardiffnlp/tweet_eval", "emotion")  # requires internet
# Show the split names, their features and their row counts.
print("Splits:", dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/105k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Splits: DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})


In [4]:
# Materialise the validation split as a pandas DataFrame (any split would do;
# 'validation' is the one used for sampling below).
df_val = pd.DataFrame(dataset["validation"])
# The split has 'text' and 'label' columns, where 'label' is an integer index;
# the matching label names are read from the split's feature metadata.
label_names = dataset["validation"].features["label"].names
print("Emotion labels:", label_names)

Emotion labels: ['anger', 'joy', 'optimism', 'sadness']


In [5]:
# Restrict to the tweet text and its integer label, then attach the readable
# label name (used later as the ground truth for evaluation).
df_val = df_val.loc[:, ["text", "label"]].copy()
df_val["label_name"] = [label_names[label_id] for label_id in df_val["label"]]

In [6]:
# Draw 10 validation tweets. The fixed random_state returns the same rows on
# every run, and reset_index(drop=True) renumbers them 0-9.
sample_df = df_val.sample(n=10, random_state=42).reset_index(drop=True)
print(sample_df[["text","label_name"]])

                                                text label_name
0  Fed up of false info from @user mini store, pl...      anger
1                  God this match is dull #Wimbledon    sadness
2  @user there are more #frightening things in li...      anger
3  @user @user can i ask im trying to pout a code...    sadness
4  @user @user @user @user We are also wating whe...      anger
5  Only halfway through #madeforlove by @user But...        joy
6  Add me on snap Whoa.Jay.  #snap #streaks #snap...        joy
7   @user A make up remover and insect sting relief!        joy
8  @user devour the unborn\nhuman rejection\nfrom...      anger
9  I'm pre happy with my Arcadian run, beat a few...        joy


In [7]:
# 2) Load emotion classifier (bhadresh)
# -----------------------
# NOTE(review): the predictions printed further down include 'fear' and
# 'surprise', which are not among the tweet_eval label names
# (anger, joy, optimism, sadness). Rows predicted as those labels can never
# match the ground truth in the evaluation cell.
EMOTION_MODEL = "bhadresh-savani/distilbert-base-uncased-emotion"
tokenizer_em = AutoTokenizer.from_pretrained(EMOTION_MODEL)
model_em = AutoModelForSequenceClassification.from_pretrained(EMOTION_MODEL)
# Text-classification pipeline configured to return a score for every label,
# so predict_emotion can pick the best one itself.
# NOTE(review): `return_all_scores` appears to be deprecated in newer
# transformers releases in favour of `top_k=None` - confirm the output shape
# before switching.
em_pipeline = pipeline("text-classification", model=model_em, tokenizer=tokenizer_em, return_all_scores=True)

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
# Helper to get top predicted label
def predict_emotion(text):
    """Return ``(label, score)`` for the highest-scoring emotion of ``text``.

    The label is lower-cased (some models emit names such as "LABEL_0", which
    are passed through as-is apart from the case change); the score is
    whatever the pipeline reported for that label.
    """
    # First element of the pipeline result: a list of {'label', 'score'} dicts.
    candidates = em_pipeline(text)[0]
    scores = [entry["score"] for entry in candidates]
    # index(max(...)) selects the first entry carrying the top score.
    winner = candidates[scores.index(max(scores))]
    return winner["label"].lower(), winner["score"]

In [10]:
# Quick test (optional)
print(predict_emotion("I am so happy and excited!"))

('joy', 0.9988194108009338)


In [11]:
# 3) Prepare RAG corpus (templates)
# -----------------------
# Small hand-written set of empathetic reply templates, each tagged with the
# emotion it is meant for. Extend the pairs below, or swap in a larger source
# such as the 'empathetic_dialogues' dataset, to grow the corpus.
_template_pairs = (
    ("joy", "That's wonderful to hear — what made you feel this way?"),
    ("joy", "Amazing! I'd love to hear more about what's bringing you joy."),
    ("sadness", "I'm so sorry you're feeling down. Do you want to tell me what's going on?"),
    ("sadness", "That sounds really tough — I'm here to listen if you want to share more."),
    ("anger", "I can hear how upset you are. Do you want to talk about what happened?"),
    ("anger", "It's understandable to feel angry about that — what's the worst part for you?"),
    ("surprise", "Oh — that's surprising. How do you feel about that?"),
    ("fear", "That sounds scary. Are you safe right now? Would you like to talk it through?"),
    ("neutral", "Thanks for sharing that — want to tell me more?"),
)
# One {"emotion", "text"} record per template, in the order listed above.
corpus = [{"emotion": emotion, "text": template} for emotion, template in _template_pairs]
corpus_df = pd.DataFrame(corpus)

In [12]:
# 4) Build embeddings + FAISS index for corpus (sentence-transformers)
# -----------------------
# Encode every template once and store the vectors in a flat inner-product
# index. The embeddings are L2-normalised at encode time, so inner product
# is equivalent to cosine similarity.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # fast
texts = corpus_df["text"].tolist()
embs = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

d = embs.shape[1]  # embedding dimensionality
index = faiss.IndexFlatIP(d)  # cosine if vectors normalized
index.add(embs)
# Row i of the index corresponds to row i of corpus_df.
print("FAISS index size:", index.ntotal)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index size: 9


In [13]:
# retrieval function: given user text + emotion, retrieve top-k templates prioritized by emotion match
def retrieve_templates(user_text, detected_emotion, top_k=3):
    """Return up to ``top_k`` reply templates for ``user_text``.

    The ``top_k`` nearest templates are fetched from the FAISS index, then
    reordered so that templates whose emotion equals ``detected_emotion``
    (case-insensitive) come first. Within each group the similarity order
    returned by FAISS is kept. Only those ``top_k`` neighbours are
    considered: a same-emotion template that is not among them is not
    pulled in.

    Args:
        user_text: Message to embed and search with.
        detected_emotion: Emotion label to prioritise.
        top_k: Maximum number of templates to return; values <= 0 yield ``[]``.

    Returns:
        A list of dicts with keys ``idx`` (row in ``corpus_df``, plain int),
        ``text``, ``emotion`` and ``same_emotion`` (bool).
    """
    if top_k <= 0:
        return []

    q_emb = embed_model.encode([user_text], convert_to_numpy=True, normalize_embeddings=True)
    _, neighbour_ids = index.search(q_emb, top_k)

    wanted = detected_emotion.lower()
    results = []
    for idx in neighbour_ids[0]:
        # FAISS pads with -1 when fewer than top_k vectors are available.
        if idx < 0:
            continue
        row = corpus_df.iloc[int(idx)]
        results.append({
            # Plain int rather than a NumPy scalar, so the record serialises cleanly.
            "idx": int(idx),
            "text": row["text"],
            "emotion": row["emotion"],
            "same_emotion": row["emotion"].lower() == wanted,
        })

    # list.sort is stable, so sorting on the emotion flag alone keeps the
    # best-first similarity order inside each group. (Adding the corpus row
    # number as a tie-breaker would reorder hits by corpus position instead.)
    results.sort(key=lambda item: not item["same_emotion"])
    return results[:top_k]

In [14]:
# 5) Few-shot prompt engineering function
# -----------------------
# This function builds a small prompt using detected emotion + retrieved templates.
# For simplicity we will produce the final answer by template re-framing (no heavy generation).
def few_shot_reply(user_text, detected_emotion):
    """Compose an empathetic reply from the best retrieved template.

    Fetches up to three templates for the message, takes the first one (or a
    generic fallback when nothing was retrieved) and prefixes it with a short
    echo of the user's own words: the first eight whitespace-separated words,
    followed by "..." when the message is longer than that.

    Returns:
        ``(reply, templates)`` - the reply text and the retrieved templates.
    """
    templates = retrieve_templates(user_text, detected_emotion, top_k=3)
    base = templates[0]["text"] if templates else "Thanks for sharing. Do you want to tell me more?"

    # Short "reflection" of the message: at most eight words of it.
    tokens = user_text.strip().split()
    refl = " ".join(tokens[:8])
    if len(tokens) > 8:
        refl += "..."

    reply = f"I hear you: \"{refl}\". {base} \n\n(Disclaimer: I'm not a therapist; seek professional help for serious issues.)"
    return reply, templates

In [15]:
# 6) Run prediction on the 10 samples -> produce Q/A pairs
# -----------------------
# One record per sampled tweet: the tweet, its gold label, the classifier's
# prediction and confidence, the generated reply and the templates behind it.
qa_results = []
for tweet, gold_label in zip(sample_df["text"], sample_df["label_name"]):
    emotion, confidence = predict_emotion(tweet)
    answer, matched_templates = few_shot_reply(tweet, emotion)
    qa_results.append(
        dict(
            text=tweet,
            true_label=gold_label,
            pred_label=emotion,
            pred_score=float(confidence),
            reply=answer,
            used_templates=matched_templates,
        )
    )

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [16]:
# Collect the per-tweet records into a DataFrame and show the gold label next
# to the predicted label and its score.
qa_df = pd.DataFrame(qa_results)
print(qa_df[["text","true_label","pred_label","pred_score"]])

                                                text true_label pred_label  \
0  Fed up of false info from @user mini store, pl...      anger      anger   
1                  God this match is dull #Wimbledon    sadness    sadness   
2  @user there are more #frightening things in li...      anger       fear   
3  @user @user can i ask im trying to pout a code...    sadness      anger   
4  @user @user @user @user We are also wating whe...      anger      anger   
5  Only halfway through #madeforlove by @user But...        joy        joy   
6  Add me on snap Whoa.Jay.  #snap #streaks #snap...        joy   surprise   
7   @user A make up remover and insect sting relief!        joy        joy   
8  @user devour the unborn\nhuman rejection\nfrom...      anger      anger   
9  I'm pre happy with my Arcadian run, beat a few...        joy       fear   

   pred_score  
0    0.988618  
1    0.995341  
2    0.992512  
3    0.734891  
4    0.771435  
5    0.995088  
6    0.730909  
7    0.985027

In [17]:
# 7) Quick local evaluation (how many correct among 10)
# -----------------------
# A prediction counts as correct when, ignoring case and surrounding
# whitespace, one label contains the other (simple, tolerant of prefixes
# such as "label_joy").
def is_correct(pred, true):
    """Return True when ``pred`` and ``true`` name the same emotion.

    Both labels are stripped and lower-cased, then compared by containment in
    either direction. An empty label never matches: the empty string is a
    substring of every string, so without this guard a missing prediction
    would be scored as correct.
    """
    pred_norm = pred.strip().lower()
    true_norm = true.strip().lower()
    if not pred_norm or not true_norm:
        return False
    return true_norm in pred_norm or pred_norm in true_norm

# Flag each row as correct/incorrect, then report the total.
qa_df["correct"] = [
    is_correct(predicted, expected)
    for predicted, expected in zip(qa_df["pred_label"], qa_df["true_label"])
]
correct_count = qa_df["correct"].sum()
print(f"Correct predictions out of {len(qa_df)}: {correct_count}")

Correct predictions out of 10: 6


In [18]:
# 8) Streamlit UI
# -----------------------
# Persist the Q/A table (without the index column) so it can be inspected or
# loaded elsewhere.
# NOTE(review): the app.py generated later in this notebook does not read
# qa_results.csv; it classifies and answers messages live.
qa_df.to_csv("qa_results.csv", index=False)

# The Streamlit app itself is written to app.py by a later cell.
print("Saved qa_results.csv - now create app.py using the Streamlit code in the repository.")

Saved qa_results.csv - now create app.py using the Streamlit code in the repository.


In [69]:
# 🚫 Kill any old Streamlit / ngrok processes
!pkill streamlit || echo "No old streamlit running"
!pkill ngrok || echo "No old ngrok running"

No old ngrok running


In [70]:
# ============================================================
# 📦 Install required packages
# ============================================================
!pip install -q transformers datasets sentence-transformers faiss-cpu streamlit pyngrok torch

# ============================================================
# 🔑 Ngrok Authentication
# ============================================================
NGROK_AUTH = "32W0jQ9gdgoCYUmcjw6MPIb0mA1_4d5as2uYGziumhtcD3mB2"   # <-- put your token here
!ngrok config add-authtoken $NGROK_AUTH

# ============================================================
# 📝 Write app.py (Chatbot with Advanced UI)
# ============================================================
# Source of the Streamlit app, written to app.py further down.
# Changes relative to a naive script:
#   * chat text is HTML-escaped before being injected with unsafe_allow_html,
#     so a message cannot smuggle markup/scripts into the page;
#   * the classifier, the embedder and the FAISS index are built once through
#     st.cache_resource instead of on every Streamlit rerun;
#   * FAISS padding ids (-1) are skipped instead of indexing corpus[-1].
# Backslashes are doubled because this is a regular (non-raw) string; the
# generated file must not contain a triple double-quote.
app_code = """
import html

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
import faiss

# Page config is issued first so that it precedes every other Streamlit call,
# including the cached loaders below.
st.set_page_config(page_title="EmpathyBot Chat", page_icon="🤖", layout="wide")

# -----------------------
# 1) Load Emotion Classifier
# -----------------------
EMOTION_MODEL = "bhadresh-savani/distilbert-base-uncased-emotion"

# Streamlit re-runs this script on every widget interaction; st.cache_resource
# keeps the loaded model around instead of rebuilding it on each rerun.
@st.cache_resource
def load_emotion_pipeline():
    tokenizer_em = AutoTokenizer.from_pretrained(EMOTION_MODEL)
    model_em = AutoModelForSequenceClassification.from_pretrained(EMOTION_MODEL)
    return pipeline("text-classification", model=model_em, tokenizer=tokenizer_em, return_all_scores=True)

em_pipeline = load_emotion_pipeline()

def predict_emotion(text):
    # Returns (lower-cased label, score) of the best-scoring emotion.
    out = em_pipeline(text)[0]
    best = max(out, key=lambda x: x["score"])
    return best["label"].lower(), best["score"]

# -----------------------
# 2) RAG Corpus + FAISS
# -----------------------
corpus = [
    {"emotion":"joy",    "text":"That's wonderful to hear — what made you feel this way?"},
    {"emotion":"joy",    "text":"Amazing! I'd love to hear more about what's bringing you joy."},
    {"emotion":"sadness","text":"I'm so sorry you're feeling down. Do you want to tell me what's going on?"},
    {"emotion":"sadness","text":"That sounds really tough — I'm here to listen if you want to share more."},
    {"emotion":"anger",  "text":"I can hear how upset you are. Do you want to talk about what happened?"},
    {"emotion":"anger",  "text":"It's understandable to feel angry about that — what's the worst part for you?"},
    {"emotion":"surprise","text":"Oh — that's surprising. How do you feel about that?"},
    {"emotion":"fear",    "text":"That sounds scary. Are you safe right now? Would you like to talk it through?"},
    {"emotion":"neutral", "text":"Thanks for sharing that — want to tell me more?"}
]

@st.cache_resource
def load_retriever():
    # Embed the templates once and index them for inner-product search.
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    texts = [c["text"] for c in corpus]
    embs = embed_model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs)
    return embed_model, index

embed_model, index = load_retriever()

def retrieve_templates(user_text, detected_emotion, top_k=3):
    q_emb = embed_model.encode([user_text], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb, top_k)
    results = []
    for idx in I[0]:
        # FAISS pads with -1 when it has fewer than top_k hits; corpus[-1]
        # would silently return the last template.
        if idx < 0:
            continue
        row = corpus[int(idx)]
        same_em = (row["emotion"].lower() == detected_emotion.lower())
        results.append({"text": row["text"], "emotion": row["emotion"], "same_emotion": same_em})
    # Stable sort: same-emotion templates first, similarity order kept otherwise.
    results = sorted(results, key=lambda x: (not x["same_emotion"]))
    return results[:top_k]

def few_shot_reply(user_text, detected_emotion):
    templates = retrieve_templates(user_text, detected_emotion)
    if len(templates) == 0:
        base = "Thanks for sharing. Do you want to tell me more?"
    else:
        base = templates[0]["text"]
    reply = f"I hear you. {base}\\n\\n(Disclaimer: I'm not a therapist; please seek professional help for serious issues.)"
    return reply

# -----------------------
# 3) Streamlit UI (Advanced Chat Design)
# -----------------------
st.title("💬 EmpathyBot - Sentiment Aware Chat 🤖")

# Keep chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Input field
user_input = st.text_input("Type your message:", key="user_message", placeholder="Say something...")

# Send button
if st.button("Send"):
    if user_input.strip() != "":
        pred_label, score = predict_emotion(user_input)
        reply = few_shot_reply(user_input, pred_label)

        st.session_state.chat_history.append({"role": "user", "content": user_input})
        st.session_state.chat_history.append({"role": "bot", "content": f"[Emotion: {pred_label}, {score:.2f}] → {reply}"})
    else:
        st.warning("Please type a message before sending.")

# Display chat history
st.markdown("### Chat History")
chat_container = st.container()

with chat_container:
    for msg in st.session_state.chat_history:
        # The bubbles are rendered as raw HTML, so the text is escaped first;
        # newlines become <br> so the whole message stays inside one bubble.
        safe_content = html.escape(msg["content"]).replace("\\n", "<br>")
        if msg["role"] == "user":
            st.markdown(
                f"<div style='text-align:right; color:white; background:#0b93f6; padding:10px; "
                f"border-radius:15px; margin:5px; max-width:70%; float:right; clear:both;'>"
                f"<b>You:</b> {safe_content}</div>",
                unsafe_allow_html=True,
            )
        else:
            st.markdown(
                f"<div style='text-align:left; color:black; background:#e5e5ea; padding:10px; "
                f"border-radius:15px; margin:5px; max-width:70%; float:left; clear:both;'>"
                f"<b>Bot:</b> {safe_content}</div>",
                unsafe_allow_html=True,
            )
"""

# Write the generated app source to disk.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("✅ app.py created with Advanced UI")

# ============================================================
# 🚀 Run Streamlit + ngrok
# ============================================================
import socket, subprocess, time
from pyngrok import ngrok

STREAMLIT_PORT = 8501
STARTUP_TIMEOUT_S = 60  # upper bound on how long to wait for the server

process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port", str(STREAMLIT_PORT)])

# Wait until the server really accepts connections instead of sleeping for a
# fixed time: a fixed sleep is too short on a slow start and hides the case
# where Streamlit exits immediately (e.g. port already taken, import error).
deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if process.poll() is not None:
        raise RuntimeError(f"streamlit exited early with code {process.returncode}")
    try:
        with socket.create_connection(("localhost", STREAMLIT_PORT), timeout=1):
            break
    except OSError:
        if time.monotonic() >= deadline:
            process.terminate()
            raise TimeoutError(
                f"streamlit did not open port {STREAMLIT_PORT} within {STARTUP_TIMEOUT_S}s"
            )
        time.sleep(0.5)

# Expose the local server through an ngrok tunnel and show its public address.
public_url = ngrok.connect(STREAMLIT_PORT)
print("🌍 Public URL:", public_url)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
✅ app.py created with Advanced UI
🌍 Public URL: NgrokTunnel: "https://22538509e6d0.ngrok-free.app" -> "http://localhost:8501"
