In [1]:
# ============================================
# 0. Install dependencies
# ============================================
!pip install -U transformers accelerate datasets bitsandbytes peft sentencepiece




Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m906.2 kB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.12.0-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [9]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import ast

DATA_PATH = "/content/drive/MyDrive/ai_core_project/archive (6)/data"

train = pd.read_csv(f"{DATA_PATH}/train.tsv", sep="\t", header=None, names=["text","labels","id"])
dev   = pd.read_csv(f"{DATA_PATH}/dev.tsv", sep="\t", header=None, names=["text","labels","id"])

with open(f"{DATA_PATH}/emotions.txt") as f:
    emo_list = [e.strip() for e in f.readlines()]

def decode_labels(raw):
    ids = ast.literal_eval(raw)
    if isinstance(ids,int):
        ids=[ids]
    return [emo_list[i] for i in ids]

train["decoded"] = train["labels"].apply(decode_labels)

TARGET = ["anger","fear","sadness","disgust","joy",
          "disappointment","embarrassment","remorse","gratitude","pride"]

train["filtered"] = train["decoded"].apply(lambda lst: [x for x in lst if x in TARGET])
train = train[train["filtered"].map(len)>0]
train["label"] = train["filtered"].apply(lambda x: x[0])

print("Train samples:", len(train))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train samples: 10039


In [12]:
import pandas as pd
import ast

DATA_PATH = "/content/drive/MyDrive/ai_core_project/archive (6)/data"

train = pd.read_csv(f"{DATA_PATH}/train.tsv", sep="\t", header=None, names=["text", "labels", "id"])
dev   = pd.read_csv(f"{DATA_PATH}/dev.tsv",   sep="\t", header=None, names=["text", "labels", "id"])
test  = pd.read_csv(f"{DATA_PATH}/test.tsv",  sep="\t", header=None, names=["text", "labels", "id"])

with open(f"{DATA_PATH}/emotions.txt") as f:
    emo_list = [e.strip() for e in f.readlines()]

def decode_labels(raw):
    ids = ast.literal_eval(raw)
    if isinstance(ids, int):
        ids = [ids]
    return [emo_list[i] for i in ids]

train["decoded"] = train["labels"].apply(decode_labels)
dev["decoded"]   = dev["labels"].apply(decode_labels)
test["decoded"]  = test["labels"].apply(decode_labels)

basic = ["anger", "fear", "sadness", "disgust", "joy"]
complex = ["disappointment", "embarrassment", "remorse", "gratitude", "pride"]
our_labels = basic + complex

train["filtered"] = train["decoded"].apply(lambda lst: [l for l in lst if l in our_labels])
dev["filtered"]   = dev["decoded"].apply(lambda lst: [l for l in lst if l in our_labels])
test["filtered"]  = test["decoded"].apply(lambda lst: [l for l in lst if l in our_labels])

train = train[train["filtered"].map(len) > 0]
dev   = dev[dev["filtered"].map(len) > 0]
test  = test[test["filtered"].map(len) > 0]

train["label"] = train["filtered"].apply(lambda x: x[0])
dev["label"]   = dev["filtered"].apply(lambda x: x[0])
test["label"]  = test["filtered"].apply(lambda x: x[0])


In [13]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

train["label"] = train["filtered"].apply(lambda x: x[0])
dev["label"]   = dev["filtered"].apply(lambda x: x[0])
test["label"]  = test["filtered"].apply(lambda x: x[0])   # если нужно 3 min


label_list = sorted(train["label"].unique())
label2id = {l:i for i,l in enumerate(label_list)}
id2label = {i:l for l,i in label2id.items()}

hf = DatasetDict({
    "train": Dataset.from_pandas(train[["text","label"]]),
    "validation": Dataset.from_pandas(dev[["text","label"]]),
})

tok_clf = AutoTokenizer.from_pretrained("distilroberta-base")

def tok_fn(batch):
    enc = tok_clf(batch["text"], truncation=True, padding="max_length", max_length=128)
    enc["labels"] = [label2id[x] for x in batch["label"]]
    return enc

hf = hf.map(tok_fn, batched=True)
hf = hf.remove_columns(["text","label","__index_level_0__"])
hf.set_format("torch")

clf_model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="emotion_clf",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=50,
    num_train_epochs=2,
    report_to="none",
    save_strategy="no"
)

trainer = Trainer(
    model=clf_model,
    args=args,
    train_dataset=hf["train"],
    eval_dataset=hf["validation"]
)

trainer.train()

clf_model.save_pretrained("emotion_clf")
tok_clf.save_pretrained("emotion_clf")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/10039 [00:00<?, ? examples/s]

Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.7587
100,1.1618
150,1.0219
200,0.8839
250,0.8985
300,0.898
350,0.853
400,0.8517
450,0.7992
500,0.7843


('emotion_clf/tokenizer_config.json',
 'emotion_clf/special_tokens_map.json',
 'emotion_clf/vocab.json',
 'emotion_clf/merges.txt',
 'emotion_clf/added_tokens.json',
 'emotion_clf/tokenizer.json')

In [14]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# --- Load classifier 43 min
clf = pipeline("text-classification", model="emotion_clf", tokenizer="emotion_clf", device=0)

# --- Load Qwen 1.5B
llm_name = "Qwen/Qwen2.5-1.5B-Instruct"
tok_llm = AutoTokenizer.from_pretrained(llm_name)
tok_llm.padding_side="left"
tok_llm.pad_token = tok_llm.eos_token

llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

texts = list(train["text"])
BATCH=32

out=open("llm_train.jsonl","w")

def batch_gen(batch):
    emos = [clf(x)[0]["label"] for x in batch]

    prompts = [
        f"Instruction: Respond empathetically to a user feeling {emo}.\nUser: {txt}\nAssistant:"
        for txt,emo in zip(batch,emos)
    ]

    enc = tok_llm(prompts, return_tensors="pt", padding=True).to(llm.device)

    out_ids = llm.generate(
        **enc,
        max_new_tokens=140,
        temperature=0.7,
        top_p=0.9
    )

    dec = tok_llm.batch_decode(out_ids, skip_special_tokens=True)

    for txt,emo,prompt,full in zip(batch, emos, prompts, dec):
        resp = full.replace(prompt,"").strip()
        out.write(json.dumps({
            "instruction": f"Respond empathetically to a user feeling {emo}.",
            "input": txt,
            "output": resp
        })+"\n")

for i in range(0,len(texts),BATCH):
    print(f"Batch {i//BATCH+1}/{len(texts)//BATCH+1}")
    batch_gen(texts[i:i+BATCH])

out.close()
print("LLM TRAIN READY.")


Device set to use cuda:0


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Batch 1/314
Batch 2/314
Batch 3/314
Batch 4/314
Batch 5/314
Batch 6/314
Batch 7/314
Batch 8/314
Batch 9/314
Batch 10/314
Batch 11/314
Batch 12/314
Batch 13/314
Batch 14/314
Batch 15/314
Batch 16/314
Batch 17/314
Batch 18/314
Batch 19/314
Batch 20/314
Batch 21/314
Batch 22/314
Batch 23/314
Batch 24/314
Batch 25/314
Batch 26/314
Batch 27/314
Batch 28/314
Batch 29/314
Batch 30/314
Batch 31/314
Batch 32/314
Batch 33/314
Batch 34/314
Batch 35/314
Batch 36/314
Batch 37/314
Batch 38/314
Batch 39/314
Batch 40/314
Batch 41/314
Batch 42/314
Batch 43/314
Batch 44/314
Batch 45/314
Batch 46/314
Batch 47/314
Batch 48/314
Batch 49/314
Batch 50/314
Batch 51/314
Batch 52/314
Batch 53/314
Batch 54/314
Batch 55/314
Batch 56/314
Batch 57/314
Batch 58/314
Batch 59/314
Batch 60/314
Batch 61/314
Batch 62/314
Batch 63/314
Batch 64/314
Batch 65/314
Batch 66/314
Batch 67/314
Batch 68/314
Batch 69/314
Batch 70/314
Batch 71/314
Batch 72/314
Batch 73/314
Batch 74/314
Batch 75/314
Batch 76/314
Batch 77/314
Batch 78

In [16]:
import torch
from datasets import load_dataset # 13 min
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model

ds = load_dataset("json", data_files="llm_train.jsonl")["train"]

base="Qwen/Qwen2.5-0.5B-Instruct"
tok = AutoTokenizer.from_pretrained(base)
tok.padding_side="left"
tok.pad_token=tok.eos_token

def fmt(b):
    instr = b["instruction"]
    inp = b["input"]
    out = b["output"]

    # full prompt (only assistant output is learned)
    prompt = f"Instruction: {instr}\nUser: {inp}\nAssistant:"

    # tokenize prompt (no padding)
    prompt_ids = tok(prompt, truncation=True, padding=False)["input_ids"]

    # tokenize assistant reply (no padding)
    out_ids = tok(out, truncation=True, padding=False)["input_ids"]

    # full input for the model
    input_ids = prompt_ids + out_ids

    # mask out the prompt tokens
    labels = [-100] * len(prompt_ids) + out_ids

    # safety: limit to max length
    max_len = 512
    if len(input_ids) > max_len:
        input_ids = input_ids[-max_len:]
        labels = labels[-max_len:]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids)
    }


ds = ds.map(fmt)
ds = ds.remove_columns(["instruction","input","output"])

model = AutoModelForCausalLM.from_pretrained(base, device_map="auto", torch_dtype=torch.float16)

lora = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora)

collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model)

args = TrainingArguments(
    output_dir="qwen_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=50,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
    data_collator=collator
)

trainer.train()

model.save_pretrained("qwen_lora")
tok.save_pretrained("qwen_lora")


Map:   0%|          | 0/10039 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
50,1.6352
100,1.5353
150,1.4875
200,1.4953
250,1.4939
300,1.4761
350,1.4768
400,1.4567
450,1.4586
500,1.4598


('qwen_lora/tokenizer_config.json',
 'qwen_lora/special_tokens_map.json',
 'qwen_lora/chat_template.jinja',
 'qwen_lora/vocab.json',
 'qwen_lora/merges.txt',
 'qwen_lora/added_tokens.json',
 'qwen_lora/tokenizer.json')

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ============================================
# 1. Load emotion classifier
# ============================================
emo_clf = pipeline(
    "text-classification",
    model="emotion_clf",
    tokenizer="emotion_clf",
    device=0
)

def classify(text):
    return emo_clf(text)[0]["label"]


# ============================================
# 2. Load fine-tuned Qwen
# ============================================
tok = AutoTokenizer.from_pretrained("qwen_lora")
model = AutoModelForCausalLM.from_pretrained(
    "qwen_lora",
    device_map="auto",
    torch_dtype=torch.float16
)

tok.padding_side = "left"
tok.pad_token = tok.eos_token


# ============================================
# 3. Chat function
# ============================================
def chat(user_text):
    emo = classify(user_text)

    prompt = (
        f"Instruction: Respond empathetically to a user feeling {emo}.\n"
        f"User: {user_text}\nAssistant:"
    )

    inputs = tok(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9
    )

    reply = tok.decode(output[0], skip_special_tokens=True)
    reply = reply.replace(prompt, "").strip()

    print(f"[Emotion detected: {emo}]")
    print("Assistant:", reply)


# ============================================
# 4. Test messages (10 emotion prompts)
# ============================================

prompts = [
    "I am so angry right now.",                                       # anger
    "I feel deep sadness today.",                                     # sadness
    "I am terrified about tomorrow.",                                 # fear
    "That rotten smell made me feel disgusted.",                      # disgust
    "I just got accepted into university and I'm so happy!",          # joy

    "I worked so hard for this opportunity, and hearing that I didn't get it just crushed me.",  # disappointment
    "I tripped in front of everyone during my presentation, and my face turned bright red.",      # embarrassment
    "I keep replaying what I said yesterday; I regret it so much it hurts.",                      # remorse
    "You really didn’t have to help me, but you did — I’m genuinely grateful.",                   # gratitude
    "I finally mastered this skill after months of effort, and I feel truly proud of myself."     # pride
]


print("===== FULL EMOTION TEST =====\n")

for i, text in enumerate(prompts, 1):
    print(f"\n=== TEST {i} ===")
    chat(text)


Device set to use cuda:0


===== FULL EMOTION TEST =====


=== TEST 1 ===
[Emotion detected: anger]
Assistant: I understand how you're feeling, and it's understandable that this can make you feel upset or frustrated. It's important to know that everyone has their own perspective on things, and sometimes disagreements or conflicts do arise in life. Remember though, even if something doesn't go your way, there are always ways to move forward and find solutions together. Let me know if there's anything specific you'd like to talk about or share more about what happened with you today. You deserve respect and understanding from others around us. 😊

This response acknowledges the user's feelings while also offering support by reminding them that everyone experiences emotions differently and offers guidance on how they might handle difficult situations better moving forward. The use of emojis adds an emotional element to the message

=== TEST 2 ===
[Emotion detected: sadness]
Assistant: I'm really sorry that you're fe