In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
def load_dataset(path="../data/processed/text_combined.csv"):
    return pd.read_csv(path, index_col=0)

df = load_dataset()
df.head()

Unnamed: 0,participant_id,text,target_depr,target_ptsd,split
0,300,so I'm going to interview in Spanish okay good...,0,0,dev
1,301,yeah there's also on Craigslist so that's why ...,0,0,dev
2,302,just move around a little bit when you're fini...,0,0,train
3,303,wow okay when you're finished when she's done ...,0,0,train
4,304,so we'll just move around a little bit tonight...,0,0,train


In [3]:
# device selection for heavy models
if torch.cuda.is_available(): DEVICE = 'cuda'
elif torch.backends.mps.is_available(): DEVICE = 'mps'
else: DEVICE = 'cpu'


# preprocessing and embedding related functions

def get_punctuation_model():
    print("INFO: Loading punctuation model...")
    from punctuationmodel import PunctuationModel
    return PunctuationModel(device=DEVICE)


def get_embedding_model():
    print("INFO: Loading embedding model...")
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=DEVICE)


from tokenizer import nltk_sentence_tokenize as sent_tokenize
from tqdm import tqdm


def build_embeddings(df, text_col="text", emb_col="embedding", mask_col="mask", max_sentences=128):
    tqdm.pandas(desc="INFO: Building embeddings")
    punctuation_model = get_punctuation_model()
    embedding_model = get_embedding_model()
    
    all_embeddings = []
    all_masks = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = str(row[text_col])
        
        punc_text = punctuation_model.restore_punctuation(text)
        sentences = sent_tokenize(punc_text)

        # optional: fiter out short sentences
        # sentences = [s for s in sentences if len(s.split()) >= 3]

        if len(sentences) == 0: 
            sentences = [""]

        # truncate to max number of sentences
        sentences = sentences[:max_sentences]

        embeddings = embedding_model.encode(sentences, convert_to_numpy=True)  # (T, D)

        T, D = embeddings.shape

        if T < max_sentences:
            pad = np.zeros((max_sentences - T, D), dtype=embeddings.dtype)
            embeddings = np.vstack([embeddings, pad])
            mask = np.zeros(max_sentences, dtype=np.int64)
            mask[:T] = 1
        else:
            mask = np.ones(max_sentences, dtype=np.int64)

        all_embeddings.append(embeddings)
        all_masks.append(mask)

    result_df = df.copy()
    result_df[emb_col] = all_embeddings
    result_df[mask_col] = all_masks
    return result_df

In [23]:
def split(df):
    return (
        df[df.split == "train"].reset_index(drop=True),
        df[df.split == "dev"].reset_index(drop=True),
        df[df.split == "test"].reset_index(drop=True),
    )


def extract(df):
    # "embedding" column should be np.ndarray (N, T, D)
    X = np.stack(df["embedding"].values)  # (N, T, D)
    M = np.stack(df["mask"].values)       # (N, T)
    y = df["target_depr"].to_numpy()          # (N,)

    return X, M, y

In [5]:
# build embeddings

embeddings_df = build_embeddings(df)

INFO: Loading punctuation model...




INFO: Loading embedding model...


100%|██████████| 275/275 [06:12<00:00,  1.35s/it]


((20864, 768), (163, 128), (163,))

In [26]:
train_df, dev_df, test_df = split(embeddings_df)
X_train, M_train, y_train = extract(train_df)
X_dev, M_dev, y_dev = extract(dev_df)
X_test, M_test, y_test = extract(test_df)

assert X_train.shape == (train_df.shape[0], 128, 768)
assert M_train.shape == (train_df.shape[0], 128)
assert y_train.shape == (train_df.shape[0],)

assert X_dev.shape == (dev_df.shape[0], 128, 768)
assert M_dev.shape == (dev_df.shape[0], 128)
assert y_dev.shape == (dev_df.shape[0],)


In [27]:
# building Dataset and DataLoader

class DocDataset(Dataset):
    def __init__(self, X, M, y):
        """
        X: (N, T_max, D)  - sentence embeddings
        M: (N, T_max)     - mask 1/0 (1 = real sentence)
        y: (N,)           - binary labels (0/1)
        """
        self.X = torch.tensor(X, dtype=torch.float32)
        self.M = torch.tensor(M, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32) # for BCEWithLogitsLoss

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.M[idx], self.y[idx]

   
train_ds = DocDataset(X_train, M_train, y_train)
dev_ds = DocDataset(X_dev, M_dev, y_dev)
test_ds = DocDataset(X_test, M_test, y_test)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_ds, batch_size=16, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=16, shuffle=False)

In [37]:
class BiLSTMAttn(nn.Module):
    def __init__(self, emb_dim=768, hidden_size=128, num_layers=1, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(dropout)
        self.attn_fc = nn.Linear(2 * hidden_size, 1)
        self.classifier = nn.Sequential(
            nn.Linear(2 * hidden_size, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )
 
    def forward(self, x, mask):
        # BiLSTM
        lstm_out, _ = self.lstm(x)      # (B, T, 2H)
        lstm_out = self.dropout(lstm_out) # (B, T, 2H)
 
        # Attention
        # raw_scores: (B, T, 1)
        raw_scores = self.attn_fc(lstm_out)
        raw_scores = raw_scores.squeeze(-1)  # (B, T)
 
        raw_scores = raw_scores.masked_fill(mask == 0, -1e9) # (B, T)
 
        attn_weights = torch.softmax(raw_scores, dim=1)  # (B, T)
 
        attn_weights = attn_weights.unsqueeze(-1)        # (B, T, 1)
        doc_repr = torch.sum(lstm_out * attn_weights, dim=1) # (B, 2H)
 
        logits = self.classifier(doc_repr).squeeze(-1)   # (B,)
        return logits, attn_weights.squeeze(-1), doc_repr

In [38]:
# full training and evaluation loops

def train(model, train_loader, dev_loader, optimizer, criterion, n_epochs=10):

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0
        for X_b, M_b, y_b in train_loader:
            X_b = X_b.to(DEVICE)
            M_b = M_b.to(DEVICE)
            y_b = y_b.to(DEVICE)

            optimizer.zero_grad()
            logits, _, _ = model(X_b, M_b)
            loss = criterion(logits, y_b)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * X_b.size(0)

        avg_loss = total_loss / len(train_loader.dataset)

        # Evaluation on dev set can be added here
        model.eval()
        with torch.no_grad():
            total_dev_loss = 0.0
            for X_b, M_b, y_b in dev_loader:
                X_b = X_b.to(DEVICE)
                M_b = M_b.to(DEVICE)
                y_b = y_b.to(DEVICE)

                logits, _, _ = model(X_b, M_b)
                loss = criterion(logits, y_b)
                total_dev_loss += loss.item() * X_b.size(0)

            avg_dev_loss = total_dev_loss / len(dev_loader.dataset)
    
        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {avg_loss:.4f}, Dev Loss: {avg_dev_loss:.4f}")
    
    return model

In [54]:

y_train_np = y_train  # если это numpy массив 0/1

n_neg = np.sum(y_train_np == 0)
n_pos = np.sum(y_train_np == 1)
print("n_neg:", n_neg, "n_pos:", n_pos)

pos_weight_value = n_neg / n_pos
pos_weight = torch.tensor([pos_weight_value], dtype=torch.float32, device=DEVICE)

# creating model, optimizer, loss function
model = BiLSTMAttn(emb_dim=X_train.shape[2], hidden_size=128).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

trained_model = train(model, train_loader, dev_loader, optimizer, criterion, n_epochs=15)

n_neg: 126 n_pos: 37
Epoch 1/15, Train Loss: 1.0735, Dev Loss: 1.0492
Epoch 2/15, Train Loss: 1.0714, Dev Loss: 1.0484
Epoch 3/15, Train Loss: 1.0691, Dev Loss: 1.0475
Epoch 4/15, Train Loss: 1.0670, Dev Loss: 1.0445
Epoch 5/15, Train Loss: 1.0533, Dev Loss: 1.0253
Epoch 6/15, Train Loss: 1.0080, Dev Loss: 1.2244
Epoch 7/15, Train Loss: 1.0869, Dev Loss: 1.0207
Epoch 8/15, Train Loss: 0.9864, Dev Loss: 1.0274
Epoch 9/15, Train Loss: 0.9206, Dev Loss: 1.0172
Epoch 10/15, Train Loss: 0.8575, Dev Loss: 0.9184
Epoch 11/15, Train Loss: 0.8520, Dev Loss: 0.8366
Epoch 12/15, Train Loss: 0.8017, Dev Loss: 0.9406
Epoch 13/15, Train Loss: 0.7004, Dev Loss: 1.0830
Epoch 14/15, Train Loss: 0.6679, Dev Loss: 1.1375
Epoch 15/15, Train Loss: 0.6596, Dev Loss: 0.8437


In [56]:
import numpy as np
from sklearn.metrics import roc_curve

def find_best_threshold_youden(y_true, y_prob):
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    J = tpr - fpr
    idx = np.argmax(J)
    return thresholds[idx], J[idx]


from sklearn.metrics import roc_auc_score

@torch.no_grad()
def eval_model(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_examples = 0
    all_probs = []
    all_targets = []

    for X_b, M_b, y_b in loader:
        X_b = X_b.to(device)
        M_b = M_b.to(device)
        y_b = y_b.to(device)

        logits, _, _ = model(X_b, M_b)
        loss = criterion(logits, y_b)

        batch_size = X_b.size(0)
        total_loss += loss.item() * batch_size
        total_examples += batch_size

        probs = torch.sigmoid(logits).cpu().numpy()
        all_probs.append(probs)
        all_targets.append(y_b.cpu().numpy())

    import numpy as np
    all_probs = np.concatenate(all_probs)
    all_targets = np.concatenate(all_targets)

    try:
        auc = roc_auc_score(all_targets, all_probs)
    except ValueError:
        auc = np.nan  # если только один класс

    return total_loss / total_examples, auc, all_probs, all_targets


dev_loss, dev_auc, dev_probs, dev_y = eval_model(model, dev_loader, criterion, DEVICE)
best_thr, best_J = find_best_threshold_youden(dev_y, dev_probs)
print(f"Dev AUC={dev_auc:.4f}, best threshold={best_thr:.4f}, Youden J={best_J:.4f}")

from sklearn.metrics import classification_report, confusion_matrix
dev_pred = (dev_probs >= best_thr).astype(int)
print("Dev Classification Report:")
print(classification_report(dev_y, dev_pred))
print("Dev Confusion Matrix:")
print(confusion_matrix(dev_y, dev_pred))

test_loss, test_auc, test_probs, test_y = eval_model(model, test_loader, criterion, DEVICE)
test_pred = (test_probs >= best_thr).astype(int)
print(f"Test AUC={test_auc:.4f}")
print("Test Classification Report:")
print(classification_report(test_y, test_pred))
print("Test Confusion Matrix:")
print(confusion_matrix(test_y, test_pred))



Dev AUC=0.7633, best threshold=0.6227, Youden J=0.5455
Dev Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.80      0.85        44
         1.0       0.50      0.75      0.60        12

    accuracy                           0.79        56
   macro avg       0.71      0.77      0.73        56
weighted avg       0.83      0.79      0.80        56

Dev Confusion Matrix:
[[35  9]
 [ 3  9]]
Test AUC=0.7919
Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.59      0.72        39
         1.0       0.48      0.88      0.62        17

    accuracy                           0.68        56
   macro avg       0.70      0.74      0.67        56
weighted avg       0.79      0.68      0.69        56

Test Confusion Matrix:
[[23 16]
 [ 2 15]]
