In [None]:
import os
# Show what is present in the Colab working directory before loading any data.
content_entries = os.listdir('/content/')
print(content_entries)


['.config', 'dailydialog_train.json', 'sample_data']


In [None]:
import json
import re
import spacy

# spaCy English pipeline; split_into_clauses uses it for sentence segmentation.
nlp = spacy.load("en_core_web_sm")

# Words at which split_into_clauses cuts a sentence (whole-word, case-insensitive match).
# NOTE(review): entries such as 'for', 'so' and 'yet' are matched wherever they occur in a
# sentence, not only where they actually join two clauses.
CONJUNCTIONS = ['and', 'but', 'or', 'so', 'because', 'although', 'though', 'yet', 'for', 'nor']

def split_into_clauses(text):
    """Segment ``text`` into a flat list of clause strings.

    The text is first split into sentences with the module-level spaCy
    pipeline ``nlp``. Each sentence is then cut at every whole-word,
    case-insensitive occurrence of an entry in ``CONJUNCTIONS``. The matched
    conjunction is discarded, and pieces that are empty after stripping
    whitespace are dropped.

    Args:
        text: Raw utterance text.

    Returns:
        list[str]: Clause strings in reading order.
    """
    splitter = re.compile(r'\b(?:' + '|'.join(CONJUNCTIONS) + r')\b', flags=re.IGNORECASE)
    clauses = []
    for sent in nlp(text).sents:
        for piece in splitter.split(sent.text.strip()):
            piece = piece.strip()
            if piece:
                clauses.append(piece)
    return clauses

def process_file(path):
    """Load a dialogue JSON file and attach clause segmentation to every turn.

    The file is read as a mapping from conversation id to a list of turn
    lists. Each turn is a dict from which the keys ``'turn'``, ``'speaker'``
    and ``'utterance'`` are read.

    Args:
        path: Path of the JSON file to read (UTF-8).

    Returns:
        dict: Same nesting as the input, where every turn is replaced by a
        dict holding ``'turn'``, ``'speaker'``, ``'utterance'`` and the list
        of ``'clauses'`` produced by ``split_into_clauses``.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        conversations = json.load(handle)

    def _with_clauses(turn):
        # Segment first so a malformed turn fails on 'utterance' before anything else.
        clauses = split_into_clauses(turn['utterance'])
        return {
            'turn': turn['turn'],
            'speaker': turn['speaker'],
            'utterance': turn['utterance'],
            'clauses': clauses,
        }

    return {
        conv_id: [[_with_clauses(turn) for turn in turn_list] for turn_list in turn_lists]
        for conv_id, turn_lists in conversations.items()
    }

# Input locations on the Colab filesystem.
# NOTE(review): the directory listing printed by the first cell shows only the train file;
# the test file has to be uploaded under exactly this name before this cell is run.
train_path = '/content/dailydialog_train.json'
test_path = '/content/dailydialog_test (1).json'

# Segment both splits before anything is written to disk.
train_clauses = process_file(train_path)
test_clauses = process_file(test_path)

# Persist each split next to the notebook.
for clause_data, out_name in (
    (train_clauses, 'dailydialog_train_clauses.json'),
    (test_clauses, 'dailydialog_test_clauses.json'),
):
    with open(out_name, 'w', encoding='utf-8') as f:
        json.dump(clause_data, f, indent=2, ensure_ascii=False)

print("Clause segmentation complete. Results saved as *_clauses.json.")

Clause segmentation complete. Results saved as *_clauses.json.


In [None]:
import json
from sentence_transformers import SentenceTransformer, util


# Sentence encoder used by is_emotion_clause and annotate_clauses_semantic_better below.
# NOTE(review): the training cell later in this notebook rebinds the global name `model`
# to the GCN classifier. The functions in this cell look `model` up at call time, so they
# must not be called after that cell has run without re-executing this line first.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Lower-case words whose presence marks a clause as emotion-bearing.
EMOTION_KEYWORDS = {
    "happy", "happiness", "sad", "sadness", "angry", "anger", "fear", "afraid", "disgust", "surprise", "joy", "excited", "upset", "worried", "anxious"
}

def is_emotion_clause(clause, utterance, emotion, sim_threshold=0.7):
    """Decide whether ``clause`` should be labelled as an emotion clause.

    Two signals are used, in order:

    1. Keyword match: the clause contains one of ``EMOTION_KEYWORDS`` as a
       whole word (case-insensitive). Matching on whole words avoids false
       positives from substrings, e.g. "anger" inside "danger" or "sad"
       inside "disadvantage".
    2. Semantic match: when the turn's ``emotion`` is set and is not
       ``'neutral'``, the clause counts as an emotion clause if its embedding
       has cosine similarity above ``sim_threshold`` with the embedding of
       the full utterance.

    Args:
        clause: Clause text.
        utterance: Full utterance the clause was taken from.
        emotion: Emotion label of the turn, or ``None``.
        sim_threshold: Cosine-similarity cut-off for the semantic match.

    Returns:
        bool: ``True`` if either signal fires, otherwise ``False``.
    """
    import re  # local import so the function does not depend on another cell's imports

    clause_words = set(re.findall(r"[a-z]+", clause.lower()))
    if not EMOTION_KEYWORDS.isdisjoint(clause_words):
        return True

    if emotion and emotion.lower() != 'neutral':
        emb_clause = model.encode(clause, convert_to_tensor=True)
        emb_utter = model.encode(utterance, convert_to_tensor=True)
        sim = util.cos_sim(emb_clause, emb_utter).item()
        if sim > sim_threshold:
            return True
    return False

def annotate_clauses_semantic_better(clause_json_path, original_json_path, output_path, sim_threshold=0.6):
    """Label every clause as 'cause', 'emotion', both, or 'neither' and save the result.

    The clause file and the original dataset file are walked in parallel
    (conversation id -> list of turn lists -> turns). For each turn the
    original record supplies ``'emotion'``, ``'utterance'`` and the list under
    ``'expanded emotion cause span'``.

    * A clause gets the label ``'cause'`` when its embedding has cosine
      similarity above ``sim_threshold`` with at least one cause span.
    * A clause gets the label ``'emotion'`` when ``is_emotion_clause`` says so.
    * A clause with no label gets ``['neither']``.

    Args:
        clause_json_path: JSON file holding the segmented clauses per turn.
        original_json_path: Original dataset JSON with emotion / cause annotations.
        output_path: Where the annotated JSON is written (UTF-8).
        sim_threshold: Cosine-similarity cut-off for the cause label only.

    Raises:
        KeyError: If a conversation id of the clause file is missing from the
            original file.

    Note:
        The two files are paired with ``zip``, so if they disagree on the
        number of turn lists or turns, the extra entries are skipped silently.
    """
    with open(clause_json_path, 'r', encoding='utf-8') as f:
        clause_data = json.load(f)
    with open(original_json_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    annotated = {}

    for conv_id, utterance_list in clause_data.items():
        annotated[conv_id] = []
        original_utterances = original_data[conv_id]
        for utt_clauses, utt_original in zip(utterance_list, original_utterances):
            annotated_turns = []
            for turn_clauses, turn_original in zip(utt_clauses, utt_original):
                clauses = turn_clauses['clauses']
                emotion = turn_original.get('emotion', None)
                utterance = turn_original.get('utterance', "")

                # Normalise the cause spans once per turn; blank spans carry no
                # information and must not take part in the similarity test.
                raw_spans = turn_original.get('expanded emotion cause span') or []
                cause_spans = [span.strip() for span in raw_spans if span and span.strip()]
                # None (rather than an empty list) marks "no cause spans for this turn".
                cause_embeddings = (
                    model.encode(cause_spans, convert_to_tensor=True) if cause_spans else None
                )

                annotated_clauses = []
                for clause in clauses:
                    labels = []

                    if cause_embeddings is not None:
                        clause_emb = model.encode(clause.strip(), convert_to_tensor=True)
                        # Row 0 holds the similarities of this clause to every cause span.
                        sims = util.cos_sim(clause_emb, cause_embeddings)[0]
                        if sims.max().item() > sim_threshold:
                            labels.append('cause')

                    if is_emotion_clause(clause, utterance, emotion):
                        labels.append('emotion')
                    if not labels:
                        labels = ['neither']
                    annotated_clauses.append({
                        'clause': clause,
                        'labels': labels
                    })
                annotated_turns.append({
                    'turn': turn_clauses['turn'],
                    'speaker': turn_clauses['speaker'],
                    'utterance': turn_clauses['utterance'],
                    'clauses': annotated_clauses,
                    'emotion': emotion
                })
            annotated[conv_id].append(annotated_turns)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(annotated, f, indent=2, ensure_ascii=False)

    print(f"Improved semantic annotation complete. Results saved to {output_path}")

# Annotate both splits: (segmented clauses, original dataset file, annotated output).
for clause_json, original_json, annotated_json in (
    (
        'dailydialog_train_clauses.json',
        '/content/dailydialog_train.json',
        'dailydialog_train_annotated_clauses_semantic_better.json',
    ),
    (
        'dailydialog_test_clauses.json',
        '/content/dailydialog_test (1).json',
        'dailydialog_test_annotated_clauses_semantic_better.json',
    ),
):
    annotate_clauses_semantic_better(
        clause_json_path=clause_json,
        original_json_path=original_json,
        output_path=annotated_json
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Improved semantic annotation complete. Results saved to dailydialog_train_annotated_clauses_semantic_better.json
Improved semantic annotation complete. Results saved to dailydialog_test_annotated_clauses_semantic_better.json


In [None]:
# Pinned to the release this notebook was run with (see the install log below);
# %pip targets the environment of the running kernel.
%pip install -q torch_geometric==2.6.1

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import json
import torch
from torch_geometric.data import Data
from sentence_transformers import SentenceTransformer
import spacy


# NOTE(review): `nlp` is not referenced by any code visible in this cell; the same spaCy
# pipeline is already loaded in the clause-segmentation cell.
nlp = spacy.load("en_core_web_sm")
# Sentence encoder that supplies the node features in build_graph_from_conversation.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Class index of each node label.
LABEL2IDX = {'emotion': 1, 'cause': 2, 'both': 3, 'neither': 0}

def label_to_idx(labels):
    """Collapse a clause's label list into a single class index.

    Only the presence of ``'emotion'`` and ``'cause'`` is inspected: both
    present -> 'both', one present -> that label, none present -> 'neither'.
    Any other entry in ``labels`` is ignored.

    Args:
        labels: Iterable of label strings attached to a clause.

    Returns:
        int: The matching value from ``LABEL2IDX``.
    """
    tags = set(labels)
    # (has emotion, has cause) -> class name
    class_by_flags = {
        (True, True): 'both',
        (True, False): 'emotion',
        (False, True): 'cause',
        (False, False): 'neither',
    }
    return LABEL2IDX[class_by_flags[('emotion' in tags, 'cause' in tags)]]

def build_graph_from_conversation(conv_turns):
    """Turn one annotated conversation into a PyTorch Geometric ``Data`` graph.

    Every non-blank clause becomes a node. Edges are:

    * a bidirectional edge between consecutive clauses of the same turn, and
    * a self-loop on every node.

    No edges are created between clauses of different turns.

    Args:
        conv_turns: List of turn dicts; each has a ``'clauses'`` list whose
            items carry ``'clause'`` (text) and optionally ``'labels'``.

    Returns:
        ``Data`` with ``x`` (clause embeddings), ``edge_index``, ``y`` (class
        index per node), ``utter_idx`` (index of the turn each node belongs
        to) and ``texts`` (clause strings), or ``None`` when the conversation
        has no non-blank clause.
    """
    node_texts = []
    node_labels = []
    node_utter_idx = []
    chain_src = []
    chain_dst = []

    # Single pass: collect nodes and link each clause to the previous clause
    # of the same turn, so node numbering and edge offsets cannot drift apart.
    for turn_idx, turn in enumerate(conv_turns):
        prev_node = None
        for clause in turn['clauses']:
            text = clause.get('clause', '').strip()
            if not text:
                continue
            node = len(node_texts)
            node_texts.append(text)
            node_labels.append(label_to_idx(clause.get('labels', ['neither'])))
            node_utter_idx.append(turn_idx)
            if prev_node is not None:
                chain_src.extend((prev_node, node))
                chain_dst.extend((node, prev_node))
            prev_node = node

    if not node_texts:
        return None

    # Keep the features on CPU like every other tensor in the graph: the
    # encoder may return them on its own device, and the graph is written to
    # disk by the caller, so it should load on a CPU-only runtime as well.
    node_features = embedder.encode(node_texts, convert_to_tensor=True).cpu()

    self_loops = list(range(len(node_texts)))
    edge_index = torch.tensor(
        [chain_src + self_loops, chain_dst + self_loops],
        dtype=torch.long,
    )

    return Data(
        x=node_features,
        edge_index=edge_index,
        y=torch.tensor(node_labels, dtype=torch.long),
        utter_idx=torch.tensor(node_utter_idx, dtype=torch.long),
        texts=node_texts
    )

def build_graphs_from_file(annotated_json_path, output_path_prefix):
    """Build one graph per annotated turn list and save each as a ``.pt`` file.

    Files are written as ``{output_path_prefix}_{key}.pt``. The key is the
    conversation id for the first turn list of a conversation. If a
    conversation holds further turn lists, they get the key
    ``{conv_id}__part{i}`` instead of overwriting the first graph.

    Args:
        annotated_json_path: Annotated clause JSON (conversation id -> list of
            turn lists).
        output_path_prefix: Prefix of the output file names.
    """
    with open(annotated_json_path, 'r', encoding='utf-8') as f:
        annotated = json.load(f)

    graphs = {}
    for conv_id, conv in annotated.items():
        for part_idx, conv_turns in enumerate(conv):
            graph = build_graph_from_conversation(conv_turns)
            if graph is None:
                continue
            # Unique key per turn list so no graph is silently replaced.
            graph_key = conv_id if part_idx == 0 else f"{conv_id}__part{part_idx}"
            graphs[graph_key] = graph

    for graph_key, graph in graphs.items():
        torch.save(graph, f"{output_path_prefix}_{graph_key}.pt")
    print(f"Saved {len(graphs)} graphs as PyTorch Geometric Data objects.")

# Arguments: annotated clause file to read, then the prefix of the saved graph files.
build_graphs_from_file(
    'dailydialog_train_annotated_clauses_semantic_better.json',
    'train_graph',
)

Saved 834 graphs as PyTorch Geometric Data objects.


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class EvolveGCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers producing per-node class logits.

    The module only chains ``GCNConv`` layers with ReLU in between; nothing in
    it changes the layer weights across time steps or graphs.

    Args:
        in_channels: Size of the input node features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Number of output classes.
        num_layers: Total number of ``GCNConv`` layers (>= 1). With 1 the
            model is a single ``in_channels -> out_channels`` layer.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=2):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.num_layers = num_layers
        # Layer widths: input, (num_layers - 1) hidden widths, output.
        dims = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [GCNConv(dim_in, dim_out) for dim_in, dim_out in zip(dims[:-1], dims[1:])]
        )

    def forward(self, x, edge_index):
        """Return the logits of the last layer; ReLU follows every layer but the last."""
        for conv in self.convs[:-1]:
            x = F.relu(conv(x, edge_index))
        x = self.convs[-1](x, edge_index)
        return x

In [None]:
import torch
from torch_geometric.data import DataLoader
import glob

# glob returns files in an unspecified order; sorting makes the 80/20 split below
# identical on every run and every machine.
graph_files = sorted(glob.glob("train_graph_*.pt"))
if not graph_files:
    raise FileNotFoundError("No train_graph_*.pt files found; run the graph-building cell first.")

# weights_only=False unpickles arbitrary Python objects; only load files written by this notebook.
graphs = [torch.load(f, weights_only=False) for f in graph_files]

# First 80% of the graphs for training, the remainder for validation.
split_idx = int(0.8*len(graphs))
train_graphs = graphs[:split_idx]
val_graphs = graphs[split_idx:]

# NOTE(review): this cell imports DataLoader from torch_geometric.data; recent PyG releases
# provide it from torch_geometric.loader — consider switching the import.
train_loader = DataLoader(train_graphs, batch_size=1, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=1)



In [None]:
# Seed torch's global RNG so weight initialisation and the training loader's shuffling
# start from a fixed state on every run.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE: this rebinds the global name `model`, which the annotation cell used for the
# sentence encoder; from here on `model` is the GCN classifier.
model = EvolveGCN(
    in_channels=graphs[0].x.size(1),  # feature width taken from the first loaded graph
    hidden_channels=64,
    out_channels=4  # one logit per entry of LABEL2IDX
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``device``.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        logits = model(batch.x, batch.edge_index)
        batch_loss = criterion(logits, batch.y)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(train_loader)

def evaluate(loader):
    """Return the node-level accuracy of the module-level ``model`` on ``loader``.

    Args:
        loader: Iterable of graph batches exposing ``x``, ``edge_index`` and ``y``.

    Returns:
        float: Correctly classified nodes divided by all nodes seen.
    """
    model.eval()
    n_correct, n_nodes = 0, 0
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            predicted = model(batch.x, batch.edge_index).argmax(dim=1)
            n_correct += int((predicted == batch.y).sum())
            n_nodes += batch.y.shape[0]
    return n_correct / n_nodes

# 20 epochs: one training pass followed by a validation check, reported per epoch.
for epoch in range(1, 21):
    loss, acc = train(), evaluate(val_loader)
    print(f"Epoch {epoch}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")

Epoch 1, Loss: 0.9958, Val Acc: 0.7153
Epoch 2, Loss: 0.9250, Val Acc: 0.7133
Epoch 3, Loss: 0.9015, Val Acc: 0.7080
Epoch 4, Loss: 0.8850, Val Acc: 0.7147
Epoch 5, Loss: 0.8667, Val Acc: 0.7183
Epoch 6, Loss: 0.8526, Val Acc: 0.7189
Epoch 7, Loss: 0.8381, Val Acc: 0.7150
Epoch 8, Loss: 0.8240, Val Acc: 0.7224
Epoch 9, Loss: 0.8114, Val Acc: 0.7189
Epoch 10, Loss: 0.7967, Val Acc: 0.7162
Epoch 11, Loss: 0.7829, Val Acc: 0.7044
Epoch 12, Loss: 0.7716, Val Acc: 0.7209
Epoch 13, Loss: 0.7583, Val Acc: 0.7088
Epoch 14, Loss: 0.7455, Val Acc: 0.7038
Epoch 15, Loss: 0.7348, Val Acc: 0.7077
Epoch 16, Loss: 0.7218, Val Acc: 0.7145
Epoch 17, Loss: 0.7106, Val Acc: 0.7130
Epoch 18, Loss: 0.6995, Val Acc: 0.7062
Epoch 19, Loss: 0.6883, Val Acc: 0.7159
Epoch 20, Loss: 0.6790, Val Acc: 0.7159


In [None]:
# Print the predicted class of every node, one line per validation graph.
# NOTE(review): this prints a line for every graph in val_loader; consider limiting it to a
# few graphs to keep the cell output readable.
model.eval()
with torch.no_grad():
    for data in val_loader:
        data = data.to(device)
        out = model(data.x, data.edge_index)
        # Highest logit per node gives the predicted class.
        pred = out.argmax(dim=1)
        # pred: 0=neither, 1=emotion, 2=cause, 3=both
        print("Predicted labels:", pred.cpu().numpy())

Predicted labels: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
Predicted labels: [3 0 0 0 0 0 0 0 0 0 0 0]
Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Predicted labels: [2 0 0 0 0 0 3 0 0]
Predicted labels: [0 0 0 3 3 1 1]
Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 3]
Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 1 1 0 0]
Predicted labels: [3 3 0 0 0 0 0 0 0 0 0 3 0 0 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]
Predicted labels: [3 0 0 0 0 3 3 3 3 3 0 0 1 0 0 0 0]
Predicted labels: [3 3 0 0 0 0 0 3 3 0 0 0 0 0 0 0]
Predicted labels: [3 0 3 0 3 0 0 0 0 3 3 0 0 0]
Predicted labels: [0 0 0 3 3 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 0]
Predicted labels: [0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
Predicted labels: [0 0 0 0 0 0 1 3 0 0 0 0 0 1 0 0 0 1]
Predicted labels: [0 0 0 0 1 3 0]
Predicted labels: [3 0 0 0 0 0]
Predicted labels: [0 0 0 0 0 0 0 

## Evaluating the trained model on the held-out test set

In [None]:
# Arguments: annotated clause file to read, then the prefix of the saved graph files.
build_graphs_from_file(
    'dailydialog_test_annotated_clauses_semantic_better.json',
    'test_graph',
)

Saved 225 graphs as PyTorch Geometric Data objects.


In [None]:
import glob
import torch

# Sorted so the graphs (and the predictions collected from them later) come in the same
# order on every run; glob itself returns files in an unspecified order.
test_graph_files = sorted(glob.glob("test_graph_*.pt"))
# weights_only=False unpickles arbitrary Python objects; only load files written by this notebook.
test_graphs = [torch.load(f, weights_only=False) for f in test_graph_files]

In [None]:
from torch_geometric.data import DataLoader

# One graph per batch, in the order the graphs were loaded.
test_loader = DataLoader(
    test_graphs,
    batch_size=1,
    shuffle=False,
)



In [None]:
from sklearn.metrics import classification_report, accuracy_score

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
all_preds = []
all_labels = []

# Collect the predicted and the annotated class of every node of every test graph.
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        all_preds.extend(pred.cpu().numpy())
        all_labels.extend(data.y.cpu().numpy())

print("Test Accuracy:", accuracy_score(all_labels, all_preds))

# Per-class precision / recall / F1. Accuracy alone hides how the model does on the
# individual classes. Index order follows LABEL2IDX: 0=neither, 1=emotion, 2=cause, 3=both.
labels = [0, 1, 2, 3]
print(classification_report(
    all_labels,
    all_preds,
    labels=labels,
    target_names=['neither', 'emotion', 'cause', 'both'],
    zero_division=0,  # a class that is never predicted scores 0 instead of raising a warning
))


Test Accuracy: 0.7370274277242401
