In [60]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_from_disk, load_dataset
from joblib import load
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    confusion_matrix,
    f1_score,
    brier_score_loss,
)
import pandas as pd
import torch
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [15]:
# Downloaded mediation search results: one CSV per split, sharing a path prefix
base_path = '../data/mediation_search_results'
data_files = {split: f"{base_path}-{split}.csv" for split in ('train', 'test')}
mediations = load_dataset('csv', data_files=data_files)

# Test-set text (same CSV as the 'test' split above) and the dispute search
# strings, both as plain DataFrames
test_text = pd.read_csv(data_files['test'])
test_dispute = pd.read_csv('../data/dispute_string-test.csv')

Using custom data configuration default-31792a7577eeeaac
Reusing dataset csv (/home/evan/.cache/huggingface/datasets/csv/default-31792a7577eeeaac/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
# Place the text and dispute frames side by side (column-wise concat), rename
# the unnamed CSV index column to 'index', and keep only the columns of interest
validation_data = (
    pd.concat([test_text, test_dispute], axis=1)
    .rename(columns={'Unnamed: 0': 'index'})
    [['index', 'dispute', 'text', 'label']]
)

Unnamed: 0,index,dispute,text,label
0,28092,(Russia) AND [Buryat*] AND [mediat* OR envoy O...,DIGEST,0
1,15793,"(Colombia) AND [""Indigenous People"" OR ""Indige...",PUBLISHED BY THE PRS GROUP\n\n \n\nCOLOMBIA\n\...,0
2,25802,(Myanmar) AND [Kachin*] AND [mediat* OR envoy ...,"By Htet Aung Kyaw, senior journalist of the Os...",0
3,99890,(Sri Lanka) w/seg [Muslim*] w/seg [mediation O...,"Sri Lanka, Jan. 26 -- Sheikh Afeefuddin Al Jai...",0
4,39765,(South Africa) AND [Afrikaner*] AND [mediat* O...,"Four more bodies found\n\nMONTCOAL, West Virgi...",0
...,...,...,...,...
31176,45303,(Sri Lanka) AND [Muslim*] AND [mediat* OR envo...,Text of report by Sri Lankan Ministry of Defen...,0
31177,67361,"(Sudan) AND [""Southerner"" OR ""Southerners""] AN...",Sudan's fragile peace overcame a major hurdle ...,0
31178,39824,(South Africa) AND [Afrikaner*] AND [mediat* O...,The transport revolution which began with the ...,0
31179,63421,"(Sudan) AND [""Southerner"" OR ""Southerners""] AN...",Excerpt from report by Sudanese electronic ne...,0


In [58]:
def create_train_test_splits(dataset):
    """Convert the "train" and "test" splits of ``dataset`` to numpy arrays.

    Args:
        dataset: mapping with "train" and "test" entries, each exposing
            "hidden_state" and "label" columns.

    Returns:
        Tuple ``(X_train, y_train, X_valid, y_valid)`` where the ``X_*`` arrays
        come from "hidden_state" and the ``y_*`` arrays from "label"; the
        "test" split supplies the ``*_valid`` arrays.
    """
    train_split, valid_split = dataset["train"], dataset["test"]
    X_train, X_valid = (
        np.array(split["hidden_state"]) for split in (train_split, valid_split)
    )
    y_train, y_valid = (
        np.array(split["label"]) for split in (train_split, valid_split)
    )
    return X_train, y_train, X_valid, y_valid

def generate_preds(model, X):
    """Return ``(model.predict(X), model.predict_proba(X))`` for the given model.

    Args:
        model: object exposing ``predict`` and ``predict_proba``.
        X: feature input passed unchanged to both methods.

    Returns:
        Tuple of the predictions and the predicted probabilities, in that order.
    """
    # Tuple elements are evaluated left to right, so predict runs before predict_proba
    return model.predict(X), model.predict_proba(X)

def score_model(model, X, y):
    """Score a fitted classifier on ``(X, y)``.

    Args:
        model: object exposing ``predict``, ``predict_proba`` and ``score``.
        X: feature input passed to the model.
        y: true labels for ``X``.

    Returns:
        Tuple ``(model_scores, y_preds)`` where ``model_scores`` is a dict with
        keys "score" (the value of ``model.score(X, y)``), "brier" and "f1",
        and ``y_preds`` is the output of ``model.predict(X)``.
    """
    y_preds = model.predict(X)
    y_proba = model.predict_proba(X)
    model_scores = {
        "score": model.score(X, y),
        # Uses the second predict_proba column; assumes a binary classifier
        # whose second class is the positive one -- TODO confirm for new models.
        "brier": brier_score_loss(y, y_proba[:, 1]),
        # scikit-learn metrics take (y_true, y_pred); the original call had the
        # two swapped.
        "f1": f1_score(y, y_preds),
    }
    return model_scores, y_preds

def plot_confusion_matrix(y_preds, y_true, labels, subtitle=None):
    """Plot a confusion matrix normalized over the true labels.

    Args:
        y_preds: predicted labels.
        y_true: true labels.
        labels: display labels handed to ``ConfusionMatrixDisplay``.
        subtitle: optional second line of the title. When ``None`` (the
            default) the title is the single heading line.
    """
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    _, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)

    # Only append the subtitle when one was supplied; formatting the default
    # None into the f-string would print a literal "None" under the heading.
    title = "Normalized confusion matrix"
    if subtitle is not None:
        title = f"{title}\n{subtitle}"
    ax.set_title(title)
    plt.show()

In [40]:
# Load embeddings and prediction heads.
# Names of the saved embedding datasets under ../models
model_dbert = "distilbert-base-uncased-feature-embeddings-quantized"
model_lformer = "allenai/longformer-base-4096-feature-embeddings"

# Names of the saved logistic-classifier files under ../models
lr_dbert = "distilbert-base-uncased-quantized-mediations-logistic-classifier.joblib"
lr_lformer = "longformer-base-4096-mediations-logistic-classifier.joblib"

# Parallel lists: position i of `models` pairs with position i of `lrs`
models = [model_dbert, model_lformer]
lrs = [lr_dbert, lr_lformer]

embeddings_datasets, prediction_heads = [], []
for model, lr in zip(models, lrs):
    model_outfile, lr_outfile = f"../models/{model}", f"../models/{lr}"
    embeddings_datasets.append(load_from_disk(model_outfile))
    # NOTE: joblib.load unpickles the file; only load classifier files you trust.
    prediction_heads.append(load(lr_outfile))


In [72]:
# Write one Excel sheet of validation predictions per model.
# The sheet names are paired positionally with embeddings_datasets and
# prediction_heads.
with pd.ExcelWriter('../data/model_validation_output.xlsx') as writer:
    for model, embeddings, head in zip(
        ('distilbert-quantized', 'longformer'),
        embeddings_datasets,
        prediction_heads,
    ):
        # Only the "test" split is scored here, so build just those features
        # rather than converting the training split to arrays and discarding it.
        X_valid = np.array(embeddings["test"]["hidden_state"])
        y_pred, y_proba = generate_preds(head, X_valid)

        # NOTE: this overwrites the two columns on the shared validation_data
        # frame each iteration; after the loop it holds the last model's values.
        validation_data['prediction'] = y_pred
        # Second predict_proba column; assumed to be the positive class -- TODO confirm
        validation_data['probability'] = y_proba[:, 1]

        validation_data.to_excel(writer, sheet_name=model)