In [7]:
import pandas as pd
from pathlib import Path

def read_meta(path, has_mmse):
    df = pd.read_csv(
        path, sep=';', names=['ID','age','gender'] + (['mmse'] if has_mmse else []),
        skiprows=1, engine='python'
    )
    df.ID = df.ID.str.strip()
    return df

cc = read_meta(Path('train/transcription/cc_meta_data.txt'), True).assign(label=0)
cd = read_meta(Path('train/transcription/cd_meta_data.txt'), True).assign(label=1)
train_meta = pd.concat([cc, cd], ignore_index=True)

test_meta = read_meta(Path('test/meta_data.txt'), False)

train_paths = list(Path('train/transcription').rglob('*.cha'))
test_paths  = list(Path('test/transcription').rglob('*.cha'))

train_df = (
    pd.DataFrame({'path': [str(p) for p in train_paths]})
      .assign(ID=lambda d: d.path.map(lambda p: Path(p).stem))
      .merge(train_meta[['ID','label']], on='ID')
      .assign(split='train')
)
test_df = (
    pd.DataFrame({'path': [str(p) for p in test_paths]})
      .assign(ID=lambda d: d.path.map(lambda p: Path(p).stem))
      .merge(test_meta[['ID']], on='ID')
      .assign(label=-1, split='test')
)

df = pd.concat([train_df, test_df], ignore_index=True)
df.to_csv('transcript_paths.csv', index=False)
print("Samples by split & label:\n", df.groupby(['split','label']).size())

Samples by split & label:
 split  label
test   -1       47
train   0       54
        1       54
dtype: int64


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class TranscriptDataset(Dataset):
    def __init__(self, paths, labels, tokenizer_name='bert-base-uncased', max_length=512):
        self.paths      = paths
        self.labels     = labels
        self.tokenizer  = BertTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        text = Path(self.paths[idx]).read_text(encoding='utf-8').replace('\n',' ')
        enc  = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids':      enc.input_ids.squeeze(0),
            'attention_mask': enc.attention_mask.squeeze(0),
            'labels':         torch.tensor(self.labels[idx], dtype=torch.long)
        }

def make_dataloaders(df, batch_size=4, max_length=256):
    train = df[df.split=='train']
    test  = df[df.split=='test']

    train_ds = TranscriptDataset(
        train.path.tolist(), train.label.tolist(),
        max_length=max_length
    )
    test_ds  = TranscriptDataset(
        test.path.tolist(), test.label.tolist(),
        max_length=max_length
    )

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [3]:
import torch
from transformers import BertModel

class ADClassifier(torch.nn.Module):
    def __init__(self, pretrained_model='bert-base-uncased', n_classes=2):
        super().__init__()
        self.bert       = BertModel.from_pretrained(pretrained_model)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        out    = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.pooler_output
        return self.classifier(pooled)

In [9]:
import torch
from transformers import BertModel

class ADClassifier(torch.nn.Module):
    def __init__(self, n_classes=2):
        super().__init__()
        self.bert       = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        out    = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.pooler_output
        return self.classifier(pooled)


In [10]:
from transformers import pipeline
from nltk.corpus import wordnet
import numpy as np

en2de = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de")
de2en = pipeline("translation_de_to_en", model="Helsinki-NLP/opus-mt-de-en")

def back_translate(text):
    de = en2de(text, max_length=512)[0]['translation_text']
    return de2en(de, max_length=512)[0]['translation_text']

def synonym_replace(text, pct=0.1):
    words = text.split()
    n     = max(1, int(len(words)*pct))
    for idx in np.random.choice(len(words), n, replace=False):
        syns = wordnet.synsets(words[idx])
        if syns:
            words[idx] = syns[0].lemmas()[0].name().replace('_',' ')
    return ' '.join(words)

def remove_pause(text):
    return ' '.join(w for w in text.split() if w.lower() not in {'uh','um'})

def apply_perturbation(text, method='none', pct=0.1):
    if method=='bt':    return back_translate(text)
    if method=='syn':   return synonym_replace(text, pct)
    if method=='pause': return remove_pause(text)
    return text

Device set to use cpu
Device set to use cpu


In [None]:
import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

def train_epoch(model, loader, opt, sched, device, epoch):
    model.train()
    total_loss = 0.0
    print(f"Epoch {epoch} training...")
    for step, batch in enumerate(loader, 1):
        ids   = batch['input_ids'].to(device)
        mask  = batch['attention_mask'].to(device)
        lbls  = batch['labels'].to(device)

        logits = model(ids, mask)
        loss   = torch.nn.functional.cross_entropy(logits, lbls)
        loss.backward()

        opt.step(); sched.step(); opt.zero_grad()
        total_loss += loss.item()

        if step % 20 == 0:
            print(f"    [Step {step}/{len(loader)}] avg loss {(total_loss/step):.4f}")
    avg = total_loss / len(loader)
    print(f"Epoch {epoch} training complete. Avg loss {avg:.4f}\n")
    return avg

@torch.no_grad()
def evaluate(model, loader, device, epoch):
    model.eval()
    preds, trues = [], []
    print(f"Evaluating after Epoch {epoch}...")
    for batch in loader:
        # Pull labels and mask out any test (label==-1)
        labels = batch['labels'].cpu().tolist()
        valid_mask = [l>=0 for l in labels]
        if not any(valid_mask):
            continue

        # Select only valid examples
        ids         = batch['input_ids'][valid_mask].to(device)
        mask_tensor = batch['attention_mask'][valid_mask].to(device)
        lbls_tensor = batch['labels'][valid_mask].to(device)

        logits = model(ids, mask_tensor)
        batch_preds = logits.argmax(-1).cpu().tolist()

        preds.extend(batch_preds)
        trues.extend(lbls_tensor.cpu().tolist())

    acc = accuracy_score(trues, preds)
    f1  = f1_score(trues, preds)
    print(f"Eval Epoch {epoch}: Acc={acc:.4f}, F1={f1:.4f}\n")
    return acc, f1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("🖥  Using device:", device)

df = pd.read_csv('transcript_paths.csv')
train_loader, test_loader = make_dataloaders(df, batch_size=4, max_length=256)
print(f"Train samples: {len(train_loader.dataset)}, Eval samples (with labels): {sum(1 for _,l in zip(test_loader.dataset.paths, test_loader.dataset.labels) if l>=0)}\n")

model = ADClassifier().to(device)
epochs = 10
total_steps = len(train_loader) * epochs
opt   = AdamW(model.parameters(), lr=2e-5)
sched = get_linear_schedule_with_warmup(opt, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)

best_f1 = 0.0
for epoch in range(1, epochs+1):
    train_loss = train_epoch(model, train_loader, opt, sched, device, epoch)
    acc, f1    = evaluate(model, test_loader, device, epoch)

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"New best F1={f1:.4f} saved to best_model.pt\n")

print("Training complete.")

🖥  Using device: cpu
📊  Train samples: 108, Eval samples (with labels): 0

🔄  Epoch 1 training...
    [Step 20/27] avg loss 0.6999
✅  Epoch 1 training complete. Avg loss 0.7012

🔍  Evaluating after Epoch 1...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 1: Acc=nan, F1=0.0000

🔄  Epoch 2 training...
    [Step 20/27] avg loss 0.6944
✅  Epoch 2 training complete. Avg loss 0.6939

🔍  Evaluating after Epoch 2...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 2: Acc=nan, F1=0.0000

🔄  Epoch 3 training...
    [Step 20/27] avg loss 0.4717
✅  Epoch 3 training complete. Avg loss 0.3925

🔍  Evaluating after Epoch 3...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 3: Acc=nan, F1=0.0000

🔄  Epoch 4 training...
    [Step 20/27] avg loss 0.0778
✅  Epoch 4 training complete. Avg loss 0.0659

🔍  Evaluating after Epoch 4...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 4: Acc=nan, F1=0.0000

🔄  Epoch 5 training...
    [Step 20/27] avg loss 0.0197
✅  Epoch 5 training complete. Avg loss 0.0181

🔍  Evaluating after Epoch 5...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 5: Acc=nan, F1=0.0000

🔄  Epoch 6 training...
    [Step 20/27] avg loss 0.0109
✅  Epoch 6 training complete. Avg loss 0.0103

🔍  Evaluating after Epoch 6...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 6: Acc=nan, F1=0.0000

🔄  Epoch 7 training...
    [Step 20/27] avg loss 0.0078
✅  Epoch 7 training complete. Avg loss 0.0075

🔍  Evaluating after Epoch 7...


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎯  Eval Epoch 7: Acc=nan, F1=0.0000

🔄  Epoch 8 training...


In [None]:
import torch, pandas as pd
from pathlib import Path
from transformers import BertTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ADClassifier().to(device)
model.load_state_dict(torch.load('best_model.pt', map_location=device))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv('transcript_paths.csv')
test = df[df.split=='test'].reset_index(drop=True)

records = []
for method in ['none','bt','syn','pause']:
    for pct in [0.1, 0.3, 0.5]:
        correct = 0
        for _, row in test.iterrows():
            text = Path(row.path).read_text(encoding='utf-8')
            pert = apply_perturbation(text, method, pct)
            enc  = tokenizer(pert, truncation=True, padding='max_length',
                             max_length=256, return_tensors='pt').to(device)
            logits = model(enc.input_ids, enc.attention_mask)
            pred   = logits.argmax(-1).item()
            # If you have true label in row.label, compare; otherwise skip
            if row.label >= 0 and pred == row.label:
                correct += 1
        acc = correct / len(test) if row.label>=0 else None
        records.append({'method': method, 'pct': pct, 'accuracy': acc})

print(pd.DataFrame(records).pivot('pct','method','accuracy'))