In [15]:
# =================== CELL 1: INSTALLS ===================
!pip install -q transformers==4.40.0 datasets sentence-transformers nltk scikit-learn sacrebleu rouge_score sentencepiece accelerate safetensors

print("Installs done.")


Installs done.


In [16]:
# =================== CELL 2: USER PATHS & FLAGS ===================
import os, glob, zipfile, json, shutil, uuid, subprocess, sys, re, math
from tqdm import tqdm
import xml.etree.ElementTree as ET

# -----------------------
# FLAGS / USER OPTIONS
# -----------------------
DO_EXTRACT_IF_ZIP_FOUND = True
DO_TRAIN_LORA = False   # leave False (LoRA code optional)
DO_EVAL = False

FORCE_EXTRACTIVE_PRED = False

# -----------------------
# USER PATHS - change these if needed
# -----------------------
AMI_ZIP = "/content/ami_public_manual_1.6.2.zip"
AMI_ROOT = "/content/ami_corpus_manual"
TRANSCRIPT_FILE = "/content/ami_corpus_manual/words/ES2009d.A.words.xml"
ABSSUM_FILE = "/content/ami_corpus_manual/abstractive/ES2002a.abssumm.xml"
OUT_DIR = "/content/ami_project_out"

# Derived
LORE_DIR = os.path.join(OUT_DIR, "flan_t5_lora_ami")
TEXT_SUM_DIR = os.path.join(OUT_DIR, "ami_summaries_txt")
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(TEXT_SUM_DIR, exist_ok=True)

print("USER PATHS:")
print("AMI_ZIP:", AMI_ZIP)
print("AMI_ROOT:", AMI_ROOT)
print("OUT_DIR:", OUT_DIR)
# =================== CELL 3: HELPERS & AUTO-LOCATE ZIP ===================
def safe_print_header(msg):
    print("\n" + "="*6 + " " + msg + " " + "="*6)

def try_extract_zip_to(zip_path, target_dir):
    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            z.extractall(target_dir)
        return True
    except Exception as e:
        print("Zip extraction failed:", e)
        return False

def find_file_by_basename(basename, search_root="/content"):
    matches = sorted(glob.glob(os.path.join(search_root, "**", basename), recursive=True))
    return matches[0] if matches else None

safe_print_header("FILE CHECK & AUTO-LOCATE")
print("Trying to locate AMI files...")

transcript_exists = os.path.exists(TRANSCRIPT_FILE)
abssum_exists = os.path.exists(ABSSUM_FILE)

if not transcript_exists:
    c = find_file_by_basename(os.path.basename(TRANSCRIPT_FILE), "/content")
    if c:
        TRANSCRIPT_FILE = c; transcript_exists = True; print("Located transcript:", c)
if not abssum_exists:
    c = find_file_by_basename(os.path.basename(ABSSUM_FILE), "/content")
    if c:
        ABSSUM_FILE = c; abssum_exists = True; print("Located abstractive summary:", c)

if (not transcript_exists or not abssum_exists) and DO_EXTRACT_IF_ZIP_FOUND and os.path.exists(AMI_ZIP):
    print("AMI zip exists — extracting to", AMI_ROOT)
    if try_extract_zip_to(AMI_ZIP, AMI_ROOT):
        c = find_file_by_basename(os.path.basename(TRANSCRIPT_FILE), AMI_ROOT)
        if c: TRANSCRIPT_FILE = c; transcript_exists = True; print("Found transcript after extract:", c)
        c = find_file_by_basename(os.path.basename(ABSSUM_FILE), AMI_ROOT)
        if c: ABSSUM_FILE = c; abssum_exists = True; print("Found abssum after extract:", c)

print("Final exists -> transcript:", os.path.exists(TRANSCRIPT_FILE), "| abssum:", os.path.exists(ABSSUM_FILE))
if not (os.path.exists(TRANSCRIPT_FILE) and os.path.exists(ABSSUM_FILE)):
    print("WARNING: Required file(s) missing. The pipeline can still search inside AMI_ROOT or exit.")
# =================== CELL 4: XML PARSERS ===================
safe_print_header("PARSERS")
def tag_name(elem):
    t = elem.tag
    if isinstance(t, str) and '}' in t:
        return t.split('}',1)[1].lower()
    return str(t).lower()

def parse_abssumm_xml(path):
    if not path or not os.path.exists(path):
        print("Abstractive summary file missing:", path); return ""
    try:
        tree = ET.parse(path); root = tree.getroot()
    except Exception as e:
        print("Parse error (summary):", e); return ""
    texts=[]
    candidates = ['abssumm','abstractive','abstract','summary','s','p']
    for elem in root.iter():
        tag = tag_name(elem)
        for c in candidates:
            if tag.endswith(c):
                if elem.text and elem.text.strip(): texts.append(elem.text.strip())
                for ch in elem:
                    if ch.text and ch.text.strip(): texts.append(ch.text.strip())
    if not texts:
        def rec(el):
            if el.text and el.text.strip(): texts.append(el.text.strip())
            for ch in el: rec(ch)
        rec(root)
    return " ".join(texts).strip()

def parse_words_xml_speaker_turns(path):
    if not path or not os.path.exists(path):
        print("Transcript file missing:", path); return []
    try:
        tree = ET.parse(path); root = tree.getroot()
    except Exception as e:
        print("Parse error (words):", e); return []
    turns=[]
    for u in root.iter():
        tag = tag_name(u)
        if tag.endswith('u') or tag.endswith('turn') or 'segment' in tag or 'utterance' in tag:
            speaker = u.attrib.get('who') or u.attrib.get('pname') or u.attrib.get('speaker') or "UNK"
            parts=[]; start=None; end=None
            for w in u.iter():
                wt = (w.text or "").strip()
                if wt: parts.append(wt)
                s = w.attrib.get('starttime') or w.attrib.get('stime') or w.attrib.get('start')
                e = w.attrib.get('endtime') or w.attrib.get('etime') or w.attrib.get('end')
                if s and start is None:
                    try: start=float(s)
                    except: pass
                if e:
                    try: end=float(e)
                    except: pass
            text = " ".join(parts).strip()
            if text:
                turns.append({"speaker":speaker,"start":start,"end":end,"text":text})
    if not turns:
        tokens=[]
        for elem in root.iter():
            if tag_name(elem).endswith('w'):
                t=(elem.text or "").strip()
                if t: tokens.append(t)
        if tokens:
            turns=[{"speaker":"UNK","start":None,"end":None,"text":" ".join(tokens)}]
    return turns
# =================== CELL 5: BUILD PAIRS ===================
safe_print_header("BUILD PAIRS")
pairs=[]
if os.path.exists(TRANSCRIPT_FILE) and os.path.exists(ABSSUM_FILE):
    summary_text = parse_abssumm_xml(ABSSUM_FILE)
    turns = parse_words_xml_speaker_turns(TRANSCRIPT_FILE)
    dialogue_tagged = "\n".join([f"[{t['speaker']}] {t['text']}" for t in turns])
    dialogue_raw = " ".join([t['text'] for t in turns])
    if summary_text and (dialogue_raw or dialogue_tagged):
        pairs.append({"meeting_id": os.path.basename(TRANSCRIPT_FILE).split('.')[0],
                      "dialogue": dialogue_raw,
                      "dialogue_tagged": dialogue_tagged,
                      "turns": turns,
                      "summary": summary_text})
        print("Built 1 pair from provided files.")
    else:
        print("Parsed summary or transcript is empty; skipping.")
else:
    if os.path.exists(AMI_ROOT):
        found_summaries = sorted(glob.glob(os.path.join(AMI_ROOT,"**","*.abssumm.xml"), recursive=True) + glob.glob(os.path.join(AMI_ROOT,"**","*abstractive*.xml"), recursive=True))
        found_transcripts = sorted(glob.glob(os.path.join(AMI_ROOT,"**","*.words.xml"), recursive=True))
        if found_summaries and found_transcripts:
            def meeting_id_from_path(p):
                bn=os.path.basename(p)
                m=re.search(r"([A-Z]{2}\d{4}[a-d]?)",bn)
                if m: return m.group(1)
                return os.path.splitext(bn)[0]
            trans_map = {meeting_id_from_path(p): p for p in found_transcripts}
            for s in found_summaries:
                mid = meeting_id_from_path(s); t = trans_map.get(mid)
                if t:
                    summary_text = parse_abssumm_xml(s)
                    turns = parse_words_xml_speaker_turns(t)
                    dialogue_tagged = "\n".join([f"[{tt['speaker']}] {tt['text']}" for tt in turns])
                    dialogue_raw = " ".join([tt['text'] for tt in turns])
                    if summary_text and (dialogue_raw or dialogue_tagged):
                        pairs.append({"meeting_id":mid,"dialogue":dialogue_raw,"dialogue_tagged":dialogue_tagged,"turns":turns,"summary":summary_text})
            if pairs:
                print(f"Built {len(pairs)} pair(s) from AMI_ROOT fallback search.")
            else:
                print("Could not pair found files.")
        else:
            print("No suitable files found under AMI_ROOT.")
    else:
        print("AMI_ROOT missing:", AMI_ROOT)

if pairs:
    with open(os.path.join(OUT_DIR,"ami_pairs_preview.json"), "w", encoding="utf-8") as f:
        json.dump(pairs[:5], f, ensure_ascii=False, indent=2)
    print("Saved preview ->", os.path.join(OUT_DIR,"ami_pairs_preview.json"))
else:
    print("No pairs created. Skipping pipeline.")
# =================== CELL 6: NLTK punkt download & SBERT optional ===================
safe_print_header("SETUP NLP DEPENDENCIES")
import nltk
# download punkt; if network prevents it, we'll fallback later
try:
    nltk.download('punkt', quiet=False)
except Exception as e:
    print("NLTK punkt download failed (will use fallback sentence splitter):", e)

# safe sentence tokenizer function (uses nltk if available, otherwise simple regex)
import re
def safe_sent_tokenize(text):
    try:
        from nltk.tokenize import sent_tokenize as _nltk_sent
        return _nltk_sent(text)
    except Exception:
        # fallback: naive split on sentence-ending punctuation
        parts = re.split(r'(?<=[.!?])\s+', text.strip())
        parts = [p.strip() for p in parts if len(p.strip())>0]
        return parts

# Try to load SBERT (optional)
sbert_model = None
try:
    from sentence_transformers import SentenceTransformer
    import torch
    device_sbert = "cuda" if torch.cuda.is_available() else "cpu"
    sbert_model = SentenceTransformer("all-MiniLM-L6-v2", device=device_sbert)
    print("Loaded SBERT on device:", device_sbert)
except Exception as e:
    print("SBERT unavailable or failed to load:", e)
    sbert_model = None
# =================== CELL 7: Extractive helpers, cleaning, scoring ===================
safe_print_header("EXTRACTIVE CLEANING HELPERS")
def sent_tokenize_safe(text):
    return safe_sent_tokenize(text)

FILLERS_RE = re.compile(r"\b(um+|uh+|erm+|ah+|you know|i mean|like|sort of|kind of|okay|ok|so)\b", flags=re.I)
MULTI_SPACE_RE = re.compile(r"\s{2,}")
KEYWORDS = [
    "decide", "decision", "decided", "action", "task", "assign", "deadline", "due", "will",
    "agree", "agreed", "approve", "approved", "budget", "cost", "price", "deliver", "deliverable",
    "complete by", "milestone", "requirement", "require", "responsible", "owner", "follow up",
    "follow-up", "todo", "to-do", "estimate", "risk", "issue", "deliverable", "deadline", "schedule"
]
NUMERIC_RE = re.compile(r"\b(\d{1,4}(?:[.,]\d{1,3})?|\d+%|\b\d+\b)\b")
CURRENCY_RE = re.compile(r"\b(€|\$|usd|eur|inr|rs|rupees|pound|£)\b", flags=re.I)
DATE_WORDS = re.compile(r"\b(today|tomorrow|monday|tuesday|wednesday|thursday|friday|saturday|sunday|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|week|month|year)\b", flags=re.I)

def clean_sentence(text):
    s = text.strip()
    s = FILLERS_RE.sub("", s)
    s = re.sub(r"[,;:\-]{2,}", ",", s)
    s = MULTI_SPACE_RE.sub(" ", s)
    s = s.strip(" ,.;:-")
    s = re.sub(r"\b(\w+)\s+\1\b", r"\1", s)
    if len(s.split()) < 3:
        return ""
    return s

def rule_score_sentence(text):
    low = text.lower()
    score = 0.0
    for kw in KEYWORDS:
        if kw in low:
            score += 1.8
    if NUMERIC_RE.search(text):
        score += 1.0
    if CURRENCY_RE.search(text):
        score += 1.2
    if DATE_WORDS.search(text):
        score += 0.8
    if len(text.split()) < 4:
        score -= 0.6
    if text.strip().endswith("?") and not any(kw in low for kw in KEYWORDS):
        score -= 0.5
    if re.search(r"\b(will|must|should|need|required|deliver|deliverable|due)\b", low):
        score += 0.6
    return score

def extract_key_points_chronological_clean(turns, top_k=12, min_score=1.0):
    chronological = []
    for t in turns:
        sents = sent_tokenize_safe(t.get("text",""))
        for s in sents:
            s_cleaned = clean_sentence(s)
            if not s_cleaned:
                continue
            score = rule_score_sentence(s_cleaned)
            if re.search(r"\b(i will|we will|I'll|we will|i'll|we'll)\b", s_cleaned, flags=re.I):
                score += 0.6
            chronological.append({"speaker": t.get("speaker","UNK"), "start": t.get("start"), "text": s_cleaned, "score": round(score,3)})
    picked = [p for p in chronological if p["score"] >= min_score]
    if len(picked) == 0:
        sorted_by_score = sorted(chronological, key=lambda x: (x["score"], len(x["text"])), reverse=True)
        candidates = sorted_by_score[:top_k]
        # maintain original order
        idx_map = {id(x): i for i,x in enumerate(chronological)}
        picked = sorted(candidates, key=lambda x: idx_map[id(x)])
    else:
        picked = picked[:top_k]
    return picked

def format_extractive_points_chronological(points):
    lines=[]
    for p in points:
        start = f"{p['start']:.1f}s" if p['start'] is not None else "?"
        lines.append(f"[{p['speaker']}] (at {start}): {p['text']}")
    return "\n".join(lines)
# =================== CELL 8: Tokenizer chunking helpers (safe by sentence) ===================
safe_print_header("TOKENIZER CHUNK HELPERS")
from transformers import AutoTokenizer

def get_tokenizer(model_name="google/flan-t5-base"):
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if not hasattr(tok, "model_max_length") or tok.model_max_length is None or tok.model_max_length > 1_000_000_000:
        tok.model_max_length = 512
    return tok

def chunk_text_by_tokenizer_safe(text, tokenizer_obj, max_tokens=None, stride=0):
    """
    Break text into chunks that respect tokenizer_obj.model_max_length.
    We split text into sentences, then package sentences into chunks while
    checking token length on per-sentence basis. This avoids encoding huge full text.
    """
    if tokenizer_obj is None:
        # fallback: naive sentence chunks
        sents = sent_tokenize_safe(text)
        out=[]
        cur=[]
        cur_len=0
        limit = max_tokens or 512
        for s in sents:
            l = len(s.split())
            if cur_len + l > limit:
                out.append(" ".join(cur))
                cur=[s]; cur_len=l
            else:
                cur.append(s); cur_len += l
        if cur: out.append(" ".join(cur))
        return out

    sents = sent_tokenize_safe(text)
    max_m = max_tokens if max_tokens else (tokenizer_obj.model_max_length - 32)
    chunks=[]
    cur_ids_count = 0
    cur_ids = []
    cur_sents = []
    for s in sents:
        if not s.strip(): continue
        # encode sentence alone to measure tokens (short)
        ids = tokenizer_obj(s, add_special_tokens=False)["input_ids"]
        l = len(ids)
        if l > max_m:
            # sentence itself longer than max — force-split by words (rare)
            words = s.split()
            tmp=[]
            tmp_count=0
            for w in words:
                w_ids = tokenizer_obj(w, add_special_tokens=False)["input_ids"]
                wc = len(w_ids)
                if tmp_count + wc > max_m:
                    if tmp: chunks.append(" ".join(tmp))
                    tmp=[w]; tmp_count = wc
                else:
                    tmp.append(w); tmp_count += wc
            if tmp: chunks.append(" ".join(tmp))
            # reset current
            cur_ids_count = 0; cur_ids = []; cur_sents=[]
            continue
        # if adding this sentence exceeds max, flush current chunk
        if cur_ids_count + l > max_m:
            chunks.append(" ".join(cur_sents))
            # prepare new chunk (with overlap/stride if desired)
            if stride>0:
                # keep last few sents for overlap (not implemented fully here to keep simple)
                cur_sents = [s]
                cur_ids_count = l
            else:
                cur_sents = [s]
                cur_ids_count = l
        else:
            cur_sents.append(s)
            cur_ids_count += l
    if cur_sents:
        chunks.append(" ".join(cur_sents))
    return chunks
# =================== CELL 9: Extractive clustering helper (SBERT optional) ===================
safe_print_header("EXTRACTIVE CLUSTERING (SBERT optional)")
def extractive_by_clustering(text, num_sentences=6):
    try:
        if sbert_model is None:
            raise Exception("SBERT missing")
        sents = sent_tokenize_safe(text)
        if len(sents) <= num_sentences:
            return " ".join(sents)
        emb = sbert_model.encode(sents, convert_to_numpy=True, show_progress_bar=False)
        from sklearn.cluster import KMeans
        import numpy as np
        k = min(num_sentences, len(sents))
        kmeans = KMeans(n_clusters=k, random_state=42).fit(emb)
        centers = kmeans.cluster_centers_
        chosen=[]
        for c in centers:
            idx=int(((emb-c)**2).sum(axis=1).argmin())
            chosen.append((idx, sents[idx]))
        chosen = sorted(chosen, key=lambda x:x[0])
        return " ".join([c for _,c in chosen])
    except Exception:
        return " ".join(text.split(".")[:num_sentences])
# =================== CELL 10: Prepare examples (extractive) ===================
safe_print_header("PREPARE EXAMPLES")
tokenizer = None
try:
    tokenizer = get_tokenizer("google/flan-t5-base")
    print("Tokenizer model_max_length:", tokenizer.model_max_length)
except Exception as e:
    print("Tokenizer load failed:", e)
    tokenizer = None

def create_training_examples(pairs_list, tokenizer_obj=None, chunk_tokens=None):
    examples={"input_texts":[],"target_texts":[]}
    for p in pairs_list:
        prefix="Summarize this meeting. Preserve speaker attribution.\n\n"
        if tokenizer_obj:
            chunks = chunk_text_by_tokenizer_safe(p["dialogue_tagged"], tokenizer_obj, max_tokens=chunk_tokens or (tokenizer_obj.model_max_length-32))
        else:
            chunks=[p["dialogue_tagged"]]
        chunk_summaries=[extractive_by_clustering(ch, num_sentences=5) for ch in chunks]
        combined_input=prefix + "\n\n".join(chunk_summaries)
        examples["input_texts"].append(combined_input)
        examples["target_texts"].append(p["summary"])
    return examples

train_examples = create_training_examples(pairs, tokenizer, chunk_tokens=512)
print("Prepared", len(train_examples["input_texts"]), "example(s).")


# =================== CELL 11: LoRA placeholder ===================
safe_print_header("LoRA (optional)")
print("LoRA disabled by default (DO_TRAIN_LORA = False). Enable & configure if you want adapter training.")


# =================== CELL 12: MODEL LOADING & INFERENCE (safe chunking) ===================
safe_print_header("INFERENCE & CHRONOLOGICAL EXTRACTION")
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

MODEL_DIR = LORE_DIR if os.path.exists(LORE_DIR) else "google/flan-t5-base"

try:
    tokenizer = get_tokenizer(MODEL_DIR if os.path.exists(MODEL_DIR) else "google/flan-t5-base")
except Exception as e:
    print("Fallback tokenizer load failed:", e)
    tokenizer = get_tokenizer("google/flan-t5-base")

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

try:
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to(device)
    model.eval()
except Exception as e:
    print("Model load failed:", e)
    model = None

def run_abstractive(prompt_text, max_len=200):
    if model is None or tokenizer is None:
        return ""
    # chunk prompt into safe-sized chunks
    max_in = tokenizer.model_max_length - 32
    chunks = chunk_text_by_tokenizer_safe(prompt_text, tokenizer, max_tokens=max_in)
    preds=[]
    for ch in chunks:
        inputs = tokenizer(ch, return_tensors="pt", truncation=True, max_length=min(max_in, tokenizer.model_max_length)).to(device)
        try:
            out = model.generate(**inputs, max_length=max_len, num_beams=4, early_stopping=True)
            preds.append(tokenizer.decode(out[0], skip_special_tokens=True))
        except Exception as e:
            print("Generation error for chunk:", e)
            preds.append("")
    return " ".join([p for p in preds if p]).strip()

# =================== CELL 13: RUN INFERENCE & SAVE (fixed) ===================
safe_print_header("RUN INFERENCE & SAVE")
inference_results = []
for p in tqdm(pairs, desc="process-meetings"):
    prefix="Summarize this meeting. Preserve speaker attribution.\n\n"
    if tokenizer:
        chunks = chunk_text_by_tokenizer_safe(p["dialogue_tagged"], tokenizer, max_tokens=min(1024, tokenizer.model_max_length-32))
    else:
        chunks = [p["dialogue_tagged"]]
    chunk_summaries = [extractive_by_clustering(ch, num_sentences=5) for ch in chunks]
    combined_input = prefix + "\n\n".join(chunk_summaries)

    model_pred = ""
    try:
        model_pred = run_abstractive(combined_input, max_len=300)
    except Exception as e:
        print("Model generation failed:", e)
        model_pred = ""

    if not model_pred:
        model_pred = " ".join(chunk_summaries[:3])

    chrono_points = extract_key_points_chronological_clean(p["turns"], top_k=12, min_score=1.0)
    chrono_text = format_extractive_points_chronological(chrono_points)

    def importance_score_text(text):
        ssum = 0.0
        for sent in sent_tokenize_safe(text):
            ssum += rule_score_sentence(sent)
        return ssum

    model_score = importance_score_text(model_pred)
    extractive_score = importance_score_text(chrono_text)
    use_extractive = FORCE_EXTRACTIVE_PRED or (extractive_score >= model_score) or (len(model_pred.split()) < 12)
    final_pred = chrono_text.strip() if use_extractive and chrono_text.strip() else model_pred.strip()
    final_pred = re.sub(r"\n{2,}", "\n", final_pred).strip()

    inference_results.append({
        "meeting_id": p["meeting_id"],
        "pred": final_pred,
        "ref": p["summary"],
        "model_pred_raw": model_pred,
        "extractive_chronological": chrono_text,
        "extractive_points": chrono_points
    })

out_inf = os.path.join(OUT_DIR,"ami_inference_results.json")
with open(out_inf,"w",encoding="utf-8") as f:
    json.dump(inference_results, f, ensure_ascii=False, indent=2)
print("Saved inference results ->", out_inf)


# =================== CELL 14: SAVE readable summaries & preview ===================
safe_print_header("FORMATTED OUTPUTS AND SAVING")
try:
    from IPython.display import display, HTML
    html_parts=[]
    for r in inference_results:
        mid = r.get("meeting_id","meeting")
        pred = r.get("pred","").strip()
        ref = r.get("ref","").strip()
        chrono = r.get("extractive_chronological","").strip()
        print(f"SAMPLE MEETING: {mid}\n")
        print("CHRONOLOGICAL IMPORTANT POINTS (speaker -> text):\n")
        print(chrono if chrono else "(none extracted)")
        print("\n---\n")
        print(f"PRED (final):\n{pred}\n")
        print(f"REF (ground-truth):\n{ref}\n")
        print("-"*100)
        safe_name = "".join([c if c.isalnum() or c in "-_." else "_" for c in str(mid)])
        txt_path = os.path.join(TEXT_SUM_DIR, f"summary_{safe_name}_important.txt")
        with open(txt_path,"w",encoding="utf-8") as tf:
            tf.write("CHRONOLOGICAL IMPORTANT POINTS (speaker -> text)\n\n")
            tf.write(chrono + "\n\n")
            tf.write("PRED (final):\n" + pred + "\n\n")
            tf.write("REF (ground-truth):\n" + ref + "\n")
        rel = os.path.relpath(txt_path, start="/content")
        link = f'<a href="/{rel}" download>Download text</a>'
        preview_html = f"<h4>Meeting: {mid} — {link}</h4><pre style='white-space:pre-wrap;background:#f6f6f6;padding:8px;border-radius:6px'><strong>CHRONOLOGICAL IMPORTANT POINTS:</strong>\n{chrono}\n\n<strong>PRED:</strong>\n{pred}\n\n<strong>REF:</strong>\n{ref}</pre>"
        html_parts.append(preview_html)
    display(HTML("<div style='font-family:sans-serif'>" + "\n<hr/>\n".join(html_parts) + "</div>"))
    print("Saved chronological text summaries to:", TEXT_SUM_DIR)
except Exception as e:
    print("HTML preview failed:", e)
    print("Saved text files to:", TEXT_SUM_DIR)
# =================== CELL 15: Zip LoRA (if present) & done ===================
safe_print_header("EXPORT LoRA IF EXISTS")
if os.path.exists(LORE_DIR):
    zip_path=os.path.join(OUT_DIR,"flan_t5_lora_ami.zip")
    if os.path.exists(zip_path): os.remove(zip_path)
    shutil.make_archive(os.path.join(OUT_DIR,"flan_t5_lora_ami"), 'zip', LORE_DIR)
    print("Created LoRA zip ->", zip_path)
else:
    print("No LoRA dir present at:", LORE_DIR)

safe_print_header("DONE")
print("Outputs (JSON, text files) are in:", OUT_DIR)



USER PATHS:
AMI_ZIP: /content/ami_public_manual_1.6.2.zip
AMI_ROOT: /content/ami_corpus_manual
OUT_DIR: /content/ami_project_out

Trying to locate AMI files...
Final exists -> transcript: True | abssum: True


Built 1 pair from provided files.
Saved preview -> /content/ami_project_out/ami_pairs_preview.json



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loaded SBERT on device: cpu




Tokenizer model_max_length: 512
Prepared 1 example(s).

LoRA disabled by default (DO_TRAIN_LORA = False). Enable & configure if you want adapter training.

Using device: cpu



process-meetings: 100%|██████████| 1/1 [00:13<00:00, 13.86s/it]

Saved inference results -> /content/ami_project_out/ami_inference_results.json

SAMPLE MEETING: ES2009d

CHRONOLOGICAL IMPORTANT POINTS (speaker -> text):

[UNK] (at ?): here we have our detailed design meeting where we will look at the prototype and right , I finally figured out what this whole second bullet point is about in my that my coach was sending to me
[UNK] (at ?): Otherwise it's just saying I'm the secretary and I'm therefore I'm taking the minutes , s just to go just real briefly to go over minutes from last meeting , , I will open them slowly , no ?
[UNK] (at ?): basically the moral of the story from our last minute last meeting was that we that we had meetings from we had presentations done by the Industrial Designer , or from Nathan , and Ron and from Sarah about what we can do here and what limitations we're operating with excuse me what limitations we're operating under , what risk we'd be looking at with some of the various approaches we were discussing and we essenti




Saved chronological text summaries to: /content/ami_project_out/ami_summaries_txt

No LoRA dir present at: /content/ami_project_out/flan_t5_lora_ami

Outputs (JSON, text files) are in: /content/ami_project_out
