In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error, confusion_matrix
from IPython.display import display

from pipeline import train_seq2seq_model, predict_scores_for_spans

# --- Path configuration -------------------------------------------------
# Every location is derived from the kernel's working directory, which is
# expected to be the folder that contains this notebook.
nb_folder = os.getcwd()

# The project root (ai_pipeline_testing/) is four directory levels above.
project_root = os.path.abspath(os.path.join(nb_folder, *([os.pardir] * 4)))

# Training and validation JSONL files live under <project_root>/data/processed/
processed_dir = os.path.join(project_root, "data", "processed")
train_jsonl = os.path.join(processed_dir, "training", "whatsapp_aspect_training_300.jsonl")
val_jsonl = os.path.join(processed_dir, "validation", "validation_gpt_50.jsonl")

# LLM preprocessing results: two levels up from the notebook folder
# (mapping/ -> cs_scoring/), then llm_preprocessing/results/.
flat_spans_path = os.path.abspath(
    os.path.join(
        nb_folder,
        os.pardir,
        os.pardir,
        "llm_preprocessing",
        "results",
        "b2b_feedback_attribute_spans_flat.csv",
    )
)

# Seq2seq outputs are written next to the notebook
results_dir = os.path.join(nb_folder, "results")

# Report the resolved locations, flagging whether each input file is present
print("Notebook folder:", nb_folder)
print("Project root:", project_root)
for label, path in (
    ("Train JSONL:", train_jsonl),
    ("Val   JSONL:", val_jsonl),
    ("Flat spans CSV:", flat_spans_path),
):
    print(label, path, "exists:", os.path.exists(path))

os.makedirs(results_dir, exist_ok=True)
print("Results dir:", results_dir)

  from .autonotebook import tqdm as notebook_tqdm


Notebook folder: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq
Project root: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing
Train JSONL: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\training\whatsapp_aspect_training_300.jsonl exists: True
Val   JSONL: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\validation\validation_gpt_50.jsonl exists: True
Flat spans CSV: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\llm_preprocessing\results\b2b_feedback_attribute_spans_flat.csv exists: True
Results dir: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results


In [2]:
# llm_preprocessing output (all spans)
flat_spans_df = pd.read_csv(flat_spans_path)


def _first_message_content(messages, role):
    """Return the content of the first message whose "role" equals `role`.

    Returns None when no such message exists, and "" when the message has
    no "content" key; callers treat both as "missing".
    """
    for message in messages:
        if message.get("role") == role:
            return message.get("content", "")
    return None


# Validation set (ground truth): chat-style JSONL -> flat rows, one row per
# (message, aspect) pair with columns attribute / text_span / score.
val_rows = []

# Tally of everything dropped while flattening, keyed by reason, so that
# data loss is visible instead of silent.
skip_counts = {}


def _note_skip(reason):
    """Increment the drop counter for `reason`."""
    skip_counts[reason] = skip_counts.get(reason, 0) + 1


with open(val_jsonl, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        rec = json.loads(line)

        msgs = rec.get("messages", [])

        # 1) user message text
        user_msg = _first_message_content(msgs, "user")
        if not user_msg:
            _note_skip("no user message")
            continue

        # Keep only the text after "Message:" when that marker is present
        if "Message:" in user_msg:
            span_text = user_msg.split("Message:", 1)[1].strip()
        else:
            span_text = user_msg.strip()

        # 2) assistant message, expected to be a JSON object with "aspects"
        assistant_msg = _first_message_content(msgs, "assistant")
        if not assistant_msg:
            _note_skip("no assistant message")
            continue

        try:
            label_obj = json.loads(assistant_msg)
        except json.JSONDecodeError:
            _note_skip("assistant content is not valid JSON")
            continue

        # Guard against labels that parse but are not shaped like
        # {"aspects": {"product": 5, ...}} (e.g. a list, or "aspects": null),
        # which would otherwise raise AttributeError.
        aspects = label_obj.get("aspects", {}) if isinstance(label_obj, dict) else None
        if not isinstance(aspects, dict):
            _note_skip("label has no 'aspects' mapping")
            continue

        for aspect_name, score in aspects.items():
            # A null / non-numeric score drops that single aspect only,
            # rather than aborting the whole load.
            try:
                score_value = int(score)
            except (TypeError, ValueError):
                _note_skip("non-integer score")
                continue

            val_rows.append({
                # Capitalise to match LLM preprocessing ("Product", "Service", etc.)
                "attribute": aspect_name.capitalize(),
                "text_span": span_text,
                "score": score_value,
            })

# Explicit columns keep the schema stable even when no rows were produced,
# so later cells fail on "no data" rather than on a missing column.
val_df = pd.DataFrame(val_rows, columns=["attribute", "text_span", "score"])

print("flat_spans_df shape:", flat_spans_df.shape)
print("val_df shape (flattened from validation_gpt_50.jsonl):", val_df.shape)
if skip_counts:
    print("Dropped while flattening (reason -> count):", skip_counts)
display(flat_spans_df.head())
display(val_df.head())


flat_spans_df shape: (123, 4)
val_df shape (flattened from validation_gpt_50.jsonl): (46, 3)


Unnamed: 0,row_index,comment,attribute,text_span
0,0,Status?,Delivery,Status?
1,1,As per our call just now pls rush the 6 inch A...,Product,6 inch ANSI 150 flanges
2,1,As per our call just now pls rush the 6 inch A...,Delivery,rush the 6 inch ANSI 150 flanges to Tuas site ...
3,3,Can faster or not? Client side keep asking me ...,Product,DI fittings
4,3,Can faster or not? Client side keep asking me ...,Delivery,Need the DI fittings by COB today


Unnamed: 0,attribute,text_span,score
0,Product,"Pipe casting quality is excellent, no defects ...",5
1,Product,Material specs sometimes do not match the draw...,2
2,Service,Sales team is very responsive on WhatsApp and ...,5
3,Service,Customer service can be defensive when we rais...,2
4,Delivery,Delivery is usually on time but sometimes the ...,3


In [3]:
# Evaluation is done directly on the validation spans. Work on an independent
# copy so that nothing done to eval_df can alter val_df.
eval_df = val_df.copy(deep=True)

print("eval_df shape (validation spans used for eval):", eval_df.shape)
display(eval_df.head(5))


eval_df shape (validation spans used for eval): (46, 3)


Unnamed: 0,attribute,text_span,score
0,Product,"Pipe casting quality is excellent, no defects ...",5
1,Product,Material specs sometimes do not match the draw...,2
2,Service,Sales team is very responsive on WhatsApp and ...,5
3,Service,Customer service can be defensive when we rais...,2
4,Delivery,Delivery is usually on time but sometimes the ...,3


In [4]:
# Persist the flattened validation rows as JSONL (one object per line with
# keys attribute / text_span / score); this file is later passed to
# train_seq2seq_model as val_jsonl.
flat_val_dir = os.path.join(project_root, "data", "processed", "validation")
val_jsonl_flat = os.path.join(flat_val_dir, "validation_flat_for_seq2seq.jsonl")

os.makedirs(flat_val_dir, exist_ok=True)

with open(val_jsonl_flat, "w", encoding="utf-8") as f:
    for attribute, text_span, score in zip(
        val_df["attribute"], val_df["text_span"], val_df["score"]
    ):
        rec = {
            "attribute": attribute,
            "text_span": text_span,
            # plain Python int so json.dumps can serialise it
            "score": int(score),
        }
        # ensure_ascii=False keeps non-ASCII text readable in the file
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Wrote flattened validation JSONL:", val_jsonl_flat)
print("Rows:", len(val_df))

Wrote flattened validation JSONL: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\validation\validation_flat_for_seq2seq.jsonl
Rows: 46


In [5]:
# Checkpoint names to fine-tune. Each one is passed to train_seq2seq_model as
# `model_name`, and also used (with "/" replaced) to name its output files.
models_to_train = list((
    "google/flan-t5-base",
    "facebook/bart-base",
))
models_to_train

['google/flan-t5-base', 'facebook/bart-base']

In [6]:
# Maps each checkpoint name to the directory its fine-tuned model is written to
trained_model_dirs = {}

# Settings shared by every checkpoint, so the runs are directly comparable
shared_train_kwargs = dict(
    train_jsonl=train_jsonl,
    val_jsonl=val_jsonl_flat,   # the flat validation file, not the chat-style one
    num_train_epochs=3,
    per_device_batch_size=8,
    learning_rate=5e-5,
)

for model_name in models_to_train:
    # "/" is not usable inside a single directory name
    safe_name = "_".join(model_name.split("/"))
    output_dir = os.path.join(results_dir, safe_name)

    print(f"\nüöÄ Training {model_name}")
    print("Output dir:", output_dir)

    # The returned objects are not used below; later cells pass output_dir to
    # predict_scores_for_spans as model_path instead.
    trainer, tokenizer, model = train_seq2seq_model(
        model_name=model_name,
        output_dir=output_dir,
        **shared_train_kwargs,
    )

    trained_model_dirs[model_name] = output_dir

trained_model_dirs


üöÄ Training google/flan-t5-base
Output dir: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\google_flan-t5-base
üîß Loading tokenizer and model: google/flan-t5-base
üì• Building train dataset from: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\training\whatsapp_aspect_training_300.jsonl


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 454/454 [00:00<00:00, 14370.34 examples/s]


üì• Building validation dataset from: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\validation\validation_flat_for_seq2seq.jsonl


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 46/46 [00:00<00:00, 7180.96 examples/s]
  trainer = Seq2SeqTrainer(


Step,Training Loss



üöÄ Training facebook/bart-base
Output dir: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\facebook_bart-base
üîß Loading tokenizer and model: facebook/bart-base
üì• Building train dataset from: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\training\whatsapp_aspect_training_300.jsonl


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 454/454 [00:00<00:00, 28270.47 examples/s]


üì• Building validation dataset from: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\data\processed\validation\validation_flat_for_seq2seq.jsonl


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 46/46 [00:00<00:00, 11498.78 examples/s]
  trainer = Seq2SeqTrainer(


Step,Training Loss




{'google/flan-t5-base': 'c:\\Users\\tengc\\Downloads\\develop_ai_pipelines_testing\\ai_pipeline_testing\\notebooks\\cs_scoring\\mapping\\seq2seq\\results\\google_flan-t5-base',
 'facebook/bart-base': 'c:\\Users\\tengc\\Downloads\\develop_ai_pipelines_testing\\ai_pipeline_testing\\notebooks\\cs_scoring\\mapping\\seq2seq\\results\\facebook_bart-base'}

In [7]:
# Per-model DataFrame of predictions joined with ground truth
# (columns in the saved CSV: attribute, text_span, pred_score, score).
spans_predictions = {}

# Columns that identify a span in both the predictions and the ground truth
span_key_cols = ["attribute", "text_span"]

# Ground truth with exactly one row per key. Merging against eval_df directly
# would multiply rows whenever a (attribute, text_span) pair occurs more than
# once (k predictions x k truths), silently inflating every metric computed
# from the merged frame.
truth_df = eval_df.drop_duplicates(subset=span_key_cols, keep="first")
n_duplicate_keys = len(eval_df) - len(truth_df)
if n_duplicate_keys:
    print(
        f"Warning: {n_duplicate_keys} duplicate (attribute, text_span) row(s) in eval_df; "
        "using the first score for each key"
    )

for model_name, model_dir in trained_model_dirs.items():
    print(f"\nüîÆ Predicting with {model_name} on validation spans")

    # Only the input columns are passed on; the ground-truth score is withheld.
    # A fresh copy per model keeps eval_df untouched if the callee mutates its input.
    pred_input_df = eval_df[span_key_cols].copy()

    pred_df = predict_scores_for_spans(
        model_path=model_dir,
        spans_df=pred_input_df,
        attribute_col="attribute",
        text_col="text_span",
        batch_size=16,
    )

    # Attach ground-truth scores. validate="many_to_one" makes pandas raise if
    # the right side ever has repeated keys, so a left merge keeps exactly
    # one output row per prediction.
    merged = pred_df.merge(
        truth_df,
        on=span_key_cols,
        how="left",
        validate="many_to_one",
    )

    spans_predictions[model_name] = merged

    safe_name = model_name.replace("/", "_")
    out_csv = os.path.join(results_dir, f"{safe_name}_pred_on_validation.csv")
    merged.to_csv(out_csv, index=False, encoding="utf-8-sig")

    # Also save JSONL in validation-like schema, but with the predicted score
    # in the "score" field (null when the model produced no usable score).
    out_jsonl = os.path.join(results_dir, f"{safe_name}_pred_on_validation.jsonl")
    with open(out_jsonl, "w", encoding="utf-8") as f:
        for _, row in merged.iterrows():
            rec = {
                "attribute": row["attribute"],
                "text_span": row["text_span"],
                "score": int(row["pred_score"]) if not pd.isna(row["pred_score"]) else None,
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"‚úÖ Saved CSV:   {out_csv}")
    print(f"‚úÖ Saved JSONL: {out_jsonl}")


üîÆ Predicting with google/flan-t5-base on validation spans
üñ• Using device: cpu
‚úÖ Saved CSV:   c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\google_flan-t5-base_pred_on_validation.csv
‚úÖ Saved JSONL: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\google_flan-t5-base_pred_on_validation.jsonl

üîÆ Predicting with facebook/bart-base on validation spans
üñ• Using device: cpu
‚úÖ Saved CSV:   c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\facebook_bart-base_pred_on_validation.csv
‚úÖ Saved JSONL: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\facebook_bart-base_pred_on_validation.jsonl


In [8]:
from pathlib import Path

# Reload each model's saved predictions from disk and preview them.
print("Results dir:", results_dir)

# Checkpoint name -> DataFrame read back from its prediction CSV
pred_dfs = {}

# Columns worth previewing, in display order; absent ones are skipped
preview_cols = ["attribute", "text_span", "pred_score", "score"]

for model_name in models_to_train:
    safe_name = "_".join(model_name.split("/"))
    pred_csv = os.path.join(results_dir, f"{safe_name}_pred_on_validation.csv")

    if os.path.exists(pred_csv):
        df = pd.read_csv(pred_csv)
        pred_dfs[model_name] = df

        print(f"\nModel: {model_name}")
        print("CSV path:", pred_csv)
        print("Rows:", df.shape[0])
        print("Columns:", df.columns.tolist())

        # Show a few rows of the key columns
        cols_to_show = [col for col in preview_cols if col in df.columns]
        display(df[cols_to_show].head())
    else:
        # Nothing to load for this model; move on to the next one
        print(f"‚ö†Ô∏è Prediction CSV not found for {model_name}: {pred_csv}")


Results dir: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results

Model: google/flan-t5-base
CSV path: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\google_flan-t5-base_pred_on_validation.csv
Rows: 46
Columns: ['attribute', 'text_span', 'pred_score', 'score']


Unnamed: 0,attribute,text_span,pred_score,score
0,Product,"Pipe casting quality is excellent, no defects ...",5,5
1,Product,Material specs sometimes do not match the draw...,2,2
2,Service,Sales team is very responsive on WhatsApp and ...,5,5
3,Service,Customer service can be defensive when we rais...,2,2
4,Delivery,Delivery is usually on time but sometimes the ...,4,3



Model: facebook/bart-base
CSV path: c:\Users\tengc\Downloads\develop_ai_pipelines_testing\ai_pipeline_testing\notebooks\cs_scoring\mapping\seq2seq\results\facebook_bart-base_pred_on_validation.csv
Rows: 46
Columns: ['attribute', 'text_span', 'pred_score', 'score']


Unnamed: 0,attribute,text_span,pred_score,score
0,Product,"Pipe casting quality is excellent, no defects ...",5,5
1,Product,Material specs sometimes do not match the draw...,3,2
2,Service,Sales team is very responsive on WhatsApp and ...,4,5
3,Service,Customer service can be defensive when we rais...,2,2
4,Delivery,Delivery is usually on time but sometimes the ...,5,3


In [9]:
# Score each model's predictions against the ground-truth validation scores.
print("\n===== FINAL METRICS (vs WhatsApp validation) =====\n")

# Fixed label order for the confusion-matrix axes
score_labels = list(range(1, 6))

for model_name, merged in spans_predictions.items():
    # merged has: attribute, text_span, pred_score, score.
    # Keep only rows where both the prediction and the ground truth are present.
    has_both = merged["pred_score"].notna() & merged["score"].notna()
    eval_subset = merged[has_both].copy()

    if eval_subset.empty:
        print(f"‚ö†Ô∏è No valid predictions for model {model_name}")
        continue

    y_true = eval_subset["score"].astype(int).to_numpy()
    y_pred = eval_subset["pred_score"].astype(int).to_numpy()

    # Exact-match rate, mean absolute score error, and the full error breakdown
    acc = accuracy_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=score_labels)

    print(f"üéØ MODEL: {model_name}")
    print(f"  Samples used: {len(eval_subset)}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  MAE:      {mae:.4f}")
    print("  Confusion matrix (rows=true, cols=pred):")
    print(cm)
    print()


===== FINAL METRICS (vs WhatsApp validation) =====

üéØ MODEL: google/flan-t5-base
  Samples used: 46
  Accuracy: 0.5000
  MAE:      0.5217
  Confusion matrix (rows=true, cols=pred):
[[ 0  4  0  0  0]
 [ 0 12  1  1  0]
 [ 0  4  1  3  0]
 [ 0  0  0  1  9]
 [ 0  0  0  1  9]]

üéØ MODEL: facebook/bart-base
  Samples used: 46
  Accuracy: 0.4565
  MAE:      0.6739
  Confusion matrix (rows=true, cols=pred):
[[1 3 0 0 0]
 [0 9 2 2 1]
 [0 4 0 2 2]
 [0 0 0 2 8]
 [0 0 0 1 9]]

