**Een Identifier die gebruik maakt van active learning. 500 vragen zijn zelf gelabeled. Vervolgens fine tunen we een bert model hierop. We laten deze getunede bert dan zelf 1000 ongeziene vragen labelen. We kijken naar de 20% waar het model het minst zeker van is. Die kijken we handmatig na. De gecorrigeerde voegen we dan toe aan de training data en zo herhalen we dit proces tot het bert model naar behoren werkt.** 

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load labeled data
df = pd.read_excel("Trainig_data.xlsx")
df.columns = ['question', 'label']

# Remove duplicate questions
df = df.drop_duplicates(subset='question')

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['question'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)


In [3]:
'''
Een random lange zin om te zien of we met 128 tokens per zin goed zitten. Deze zin bevat 44 dus 128 zou ok moeten zijn 
'''


tokenizer = BertTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')
tokenizer2 = BertTokenizer.from_pretrained("bert-base-multilingual-cased" )
tokenizer3 = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

text = "Zal de minister initiatieven nemen om ervoor te zorgen dat personen in een palliatief zorgtraject in de toekomst wel aanspraak kunnen maken op een tegemoetkoming van hulpmiddelen? Zo ja, welke en binnen welke termijn?"

tokens = tokenizer.tokenize(text)
tokens2 = tokenizer2.tokenize(text)
tokens3 = tokenizer3.tokenize(text)

print("Number of tokens:", len(tokens))
print("Tokens:", tokens)

print("Number of tokens:", len(tokens2))
print("Tokens:", tokens2)

print("Number of tokens:", len(tokens3))
print("Tokens:", tokens3)

Number of tokens: 44
Tokens: ['Zal', 'de', 'minister', 'initiatie', '##ven', 'nemen', 'om', 'ervoor', 'te', 'zorgen', 'dat', 'personen', 'in', 'een', 'pal', '##lia', '##tief', 'zorgt', '##ra', '##ject', 'in', 'de', 'toekomst', 'wel', 'aan', '##spraak', 'kunnen', 'maken', 'op', 'een', 'tegemoetkoming', 'van', 'hulpmiddel', '##en', '?', 'Zo', 'ja', ',', 'welke', 'en', 'binnen', 'welke', 'termijn', '?']
Number of tokens: 55
Tokens: ['Za', '##l', 'de', 'minister', 'init', '##iati', '##even', 'nemen', 'om', 'ervoor', 'te', 'zo', '##rgen', 'dat', 'personen', 'in', 'een', 'pal', '##lia', '##tief', 'zo', '##rgt', '##raj', '##ect', 'in', 'de', 'toe', '##komst', 'wel', 'aan', '##spraak', 'kunnen', 'maken', 'op', 'een', 'te', '##gem', '##oe', '##tkom', '##ing', 'van', 'hulp', '##mid', '##delen', '?', 'Zo', 'ja', ',', 'welke', 'en', 'binnen', 'welke', 'term', '##ijn', '?']
Number of tokens: 40
Tokens: ['Zal', 'Ġde', 'Ġminister', 'Ġinitiatieven', 'Ġnemen', 'Ġom', 'Ġervoor', 'Ġte', 'Ġzorgen', 'Ġdat'

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')  # or 'GroNLP/bert-base-dutch-cased'

def tokenize(texts):
    return tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt", max_length=128
    )

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)


In [5]:
# from transformers import BertTokenizer
# def remove_hoeveel(text):
#     return ' '.join([w for w in text.split() if w.lower() != 'hoeveel'])

# train_texts = [remove_hoeveel(t) for t in train_texts]
# val_texts = [remove_hoeveel(t) for t in val_texts]


# tokenizer = BertTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')  # or 'GroNLP/bert-base-dutch-cased'

# def tokenize(texts):
#     return tokenizer(
#         texts, padding=True, truncation=True, return_tensors="pt", max_length=128
#     )

# train_encodings = tokenize(train_texts)
# val_encodings = tokenize(val_texts)

In [6]:

class QuestionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = QuestionDataset(train_encodings, train_labels)
val_dataset = QuestionDataset(val_encodings, val_labels)


In [8]:
model = BertForSequenceClassification.from_pretrained('GroNLP/bert-base-dutch-cased', num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                 # 👈 Keep only the last checkpoint
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # 🔥 Log only once per epoch
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# ✅ 10. Define Metrics for Evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4551,0.285572,0.888608,0.884785,0.888608,0.885546
2,0.3019,0.32788,0.870886,0.876406,0.870886,0.873104
3,0.1686,0.46482,0.898734,0.898734,0.898734,0.898734
4,0.1068,0.551833,0.891139,0.892582,0.891139,0.891795
5,0.0625,0.613993,0.891139,0.893648,0.891139,0.892214
6,0.0399,0.625638,0.898734,0.899614,0.898734,0.899145
7,0.0264,0.678512,0.891139,0.892582,0.891139,0.891795
8,0.0163,0.700469,0.893671,0.895597,0.893671,0.894518


TrainOutput(global_step=1584, training_loss=0.14717642225400365, metrics={'train_runtime': 280.7652, 'train_samples_per_second': 44.963, 'train_steps_per_second': 5.642, 'total_flos': 830378490716160.0, 'train_loss': 0.14717642225400365, 'epoch': 8.0})

In [9]:
print("hoeveel" in ' '.join(train_texts).lower())  # Should print False


True


In [10]:
# import torch
# import torch.nn.functional as F
# import pandas as pd

# # Device setup
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Load data
# unlabeled_df = pd.read_excel("unlabeled_questions.xlsx")
# questions = unlabeled_df['question'].tolist()
# encodings = tokenize(questions)
# encodings = {k: v.to(device) for k, v in encodings.items()}

# # Run model on the data
# with torch.no_grad():
#     outputs = model(**encodings)
#     probs = F.softmax(outputs.logits, dim=1)
    
#     predicted_probs, predicted_labels = torch.max(probs, dim=1)
#     uncertainty = 1 - predicted_probs  # lower confidence = more uncertain

# # Add results to DataFrame
# unlabeled_df['predicted_label'] = predicted_labels.cpu().numpy()
# unlabeled_df['confidence'] = predicted_probs.cpu().numpy()
# unlabeled_df['uncertainty'] = uncertainty.cpu().numpy()

# # Sort by uncertainty and get top 20%
# top_uncertain = unlabeled_df.sort_values(by='uncertainty', ascending=False).head(int(0.2 * len(unlabeled_df)))

# # Save to Excel for manual labeling
# top_uncertain.to_excel("to_label_eng.xlsx", index=False)



In [11]:
# Load test data
df_test = pd.read_excel("Testset.xlsx")
df_test.columns = ['question', 'label']

# Clean and convert to string
df_test = df_test.dropna(subset=['question', 'label'])
df_test['question'] = df_test['question'].astype(str)

# Extract questions and labels
test_texts = df_test['question'].tolist()
test_labels = df_test['label'].tolist()

# Tokenize
test_encodings = tokenizer(
    test_texts, padding=True, truncation=True, return_tensors="pt", max_length=128
)


In [12]:
from transformers import Trainer
import numpy as np
import pandas as pd

# Create the test dataset
test_dataset = QuestionDataset(test_encodings, test_labels)

# Run predictions
predictions_output = trainer.predict(test_dataset)

# Get raw logits
logits = predictions_output.predictions
predicted_labels = np.argmax(logits, axis=1)

# Load test questions (in same order)
df_test = pd.read_excel("Testset.xlsx")
df_test = df_test.dropna(subset=['question']).reset_index(drop=True)
df_test.columns = ['question', 'true_label']

# Create final DataFrame with question + true + predicted
df = pd.DataFrame({
    "question": df_test["question"],
    "true_label": df_test["true_label"],
    "predicted_label": predicted_labels
})

# Save
df.to_csv("bert_predictions.csv", index=False)




In [13]:
import numpy as np
from sklearn.metrics import classification_report

# Get raw predictions
outputs = trainer.predict(test_dataset)
preds = np.argmax(outputs.predictions, axis=1)

# Print evaluation report
print(classification_report(test_labels, preds))


              precision    recall  f1-score   support

           0       0.94      0.94      0.94       362
           1       0.83      0.83      0.83       137

    accuracy                           0.91       499
   macro avg       0.88      0.88      0.88       499
weighted avg       0.91      0.91      0.91       499



In [14]:
import pandas as pd
import re
from sklearn.metrics import classification_report

# Load test data
df_test = pd.read_excel("Testset.xlsx")
df_test.columns = ['question', 'label']
df_test = df_test.dropna(subset=['question'])
df_test['question'] = df_test['question'].astype(str)

# Your regex sets
statistical_sets = {
    "Set 1": [
        r"\b(hoeveel|aantal|percentage van|percentage|cijfer over|data over|statistieken van)\b",
        r"\b(trend in|evolutie van|groei van|toename van|afname van|ontwikkeling van)\b",
        r"\b(?:verschaffen|geven|tonen|lijst|overzicht van)?\s*(de|een)?\s*(gegevens|statistieken|cijfers)\b"
    ],
    # "Set 1.1": [
    #     r"\b(aantal|percentage van|percentage|cijfer over|data over|statistieken van)\b",
    #     r"\b(trend in|evolutie van|groei van|toename van|afname van|ontwikkeling van)\b",
    #     r"\b(?:verschaffen|geven|tonen|lijst|overzicht van)?\s*(de|een)?\s*(gegevens|statistieken|cijfers)\b"
    # ],
    # "Set 2": [
    #     r"\b(hoe vaak|hoe groot|gemiddelde van|mediaan van|ratio van|procent van)\b",
    #     r"\b(stijging van|daling van|verandering in|ontwikkeling in|schommeling van|impact op)\b",
    #     r"\b(?:verschaffen|geven|tonen|lijst|overzicht van|analyse van)?\s*(de|een)?\s*(cijfers|gegevens|statistieken|tellingen)\b"
    # ],
    # "Set 3": [
    #     r"\b(bruto|netto|inkomen|uitgaven|begroting|subsidies|belasting|tarief|BBP|inflatie|schulden|werkloosheid|bestedingen|consumptie)\b",
    #     r"\b(bevolking|leeftijdsgroep|demografie|migratie|emigratie|immigratie|huishoudens|gezinnen|verhouding tussen|dichtheid)\b",
    #     r"\b(aantal|hoeveelheid|grootte van|gemiddelde|mediaan|percentage|spreiding|percentiel|kwartiel|standaarddeviatie)\b"
    # ],
    # "Set 4": [
    #     r"\b(zorgkosten|patiënten|ziekenhuisopnames|sterftecijfers|levensverwachting|gezondheidsuitgaven|vaccinaties|epidemieën|medicatiegebruik)\b",
    #     r"\b(reistijd|filedruk|kilometers afgelegd|verkeersongevallen|CO2-uitstoot|luchtvervuiling|hernieuwbare energie|klimaatverandering|waterkwaliteit)\b",
    #     r"\b(grondprijzen|woningmarkt|huurprijzen|hypotheken|verkoopcijfers|bouwvergunningen|energieverbruik)\b"
    # ],
    # "Set 5": [
    #     r"\b(vergeleken met|ten opzichte van|in vergelijking met|in het verleden|sinds \d{4}|tussen \d{4} en \d{4})\b",
    #     r"\b(ontwikkeling sinds|historische gegevens|trendanalyse|jaarverslagen|statistische rapporten)\b"
    # ],
    # "Set 6": [
    #     r"\b(hoeveel|aantal|percentage|statistieken|cijfers|gegevens|data)\b",
    #     r"\b(trend in|evolutie van|groei van|toename van|afname van|ontwikkeling van)\b",
    #     r"\b(per\s+\w+|per\s+\d+|in\s+\d{4}|tussen\s+\d{4}\s+en\s+\d{4})\b",  
    #     r"\b(wat is het aantal|hoe groot is|hoe vaak|gemiddelde van|ratio van)\b"
    # ],
    # "Set 7": [
    #     r"^(Wat is het aantal|Hoeveel)\b",
    #     r"\b(totale budget|welk budget|eenzelfde hoogte|gemiddeld aantal|geëvolueerd in de periode|jaarlijkse kostprijs |het aantal)\b",
    #     r"\b(bedroeg|welk|Kan de minister|wat was)\s*(de|een)?\s*(factuur|budget|overzicht|kostprijs)\b",
    #     r"\b(?:verschaffen|geven|tonen|lijst|overzicht van|bedroeg)?\s*(de|een)?\s*(gegevens|statistieken|cijfers|factuur|overzicht)\b"
    # ],
    # "Set 8": [
    #     r"^(Wat is het aantal|Hoeveel)\b",
    #     r"\b(totale budget|welk budget|cijfer over|eenzelfde hoogte|gemiddeld aantal|geëvolueerd in de periode|jaarlijkse kostprijs|het aantal)\b",
    #     r"\b(bedroeg|welk|Kan de minister|wat was)\s*(de|een)?\s*(factuur|budget|overzicht|kostprijs)\b",
    #     r"\b(?:verschaffen|geven|tonen|lijst|overzicht van|bedroeg)?\s*(de|een)?\s*(gegevens|statistieken|cijfers|factuur|overzicht)\b"
    # ],
    "Set 2": [
        r"^(Hoeveel)\b"
    ]
}

# Function to apply regex rules to each question
def matches_any(question, patterns):
    return any(re.search(pat, question, flags=re.IGNORECASE) for pat in patterns)

# Evaluate each regex set
for set_name, patterns in statistical_sets.items():
    df_test[f"{set_name}_match"] = df_test['question'].apply(lambda q: int(matches_any(q, patterns)))

    if 'label' in df_test.columns:
        print(f"\n🧪 {set_name} — Evaluation against true labels:")
        print(classification_report(df_test['label'], df_test[f"{set_name}_match"], digits=3))



🧪 Set 1 — Evaluation against true labels:
              precision    recall  f1-score   support

           0      0.914     0.914     0.914       362
           1      0.774     0.774     0.774       137

    accuracy                          0.876       499
   macro avg      0.844     0.844     0.844       499
weighted avg      0.876     0.876     0.876       499


🧪 Set 2 — Evaluation against true labels:
              precision    recall  f1-score   support

           0      0.859     0.994     0.922       362
           1      0.975     0.569     0.719       137

    accuracy                          0.878       499
   macro avg      0.917     0.782     0.820       499
weighted avg      0.891     0.878     0.866       499



In [15]:
import pandas as pd
import re
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Load test data
df_test = pd.read_excel("Testset.xlsx")
df_test.columns = ['question', 'label']
df_test = df_test.dropna(subset=['question'])
df_test['question'] = df_test['question'].astype(str)

# Define regex sets
statistical_sets = {
    "Set_1": [
        r"\b(hoeveel|aantal|percentage van|percentage|cijfer over|data over|statistieken van)\b",
        r"\b(trend in|evolutie van|groei van|toename van|afname van|ontwikkeling van)\b",
        r"\b(?:verschaffen|geven|tonen|lijst|overzicht van)?\s*(de|een)?\s*(gegevens|statistieken|cijfers)\b"
    ],
    "Set_2": [
        r"^(Hoeveel)\b"
    ]
}

# Function to apply regex rules
def matches_any(question, patterns):
    return any(re.search(pat, question, flags=re.IGNORECASE) for pat in patterns)

# Save classification reports
metrics_list = []

# Apply and evaluate each regex set
for set_name, patterns in statistical_sets.items():
    col_name = f"{set_name}_match"
    df_test[col_name] = df_test['question'].apply(lambda q: int(matches_any(q, patterns)))

    if 'label' in df_test.columns:
        print(f"\n🧪 {set_name} — Evaluation against true labels:")
        report = classification_report(df_test['label'], df_test[col_name], digits=3, output_dict=True)
        print(classification_report(df_test['label'], df_test[col_name], digits=3))
        
        # Save key metrics
        prf = precision_recall_fscore_support(df_test['label'], df_test[col_name], average='binary')
        metrics_list.append({
            'Regex_Set': set_name,
            'Precision': prf[0],
            'Recall': prf[1],
            'F1-score': prf[2],
            'Support': prf[3]
        })

# Save predictions and metrics
df_test.to_csv("regex_predictions.csv", index=False)
pd.DataFrame(metrics_list).to_csv("regex_evaluation_metrics.csv", index=False)



🧪 Set_1 — Evaluation against true labels:
              precision    recall  f1-score   support

           0      0.914     0.914     0.914       362
           1      0.774     0.774     0.774       137

    accuracy                          0.876       499
   macro avg      0.844     0.844     0.844       499
weighted avg      0.876     0.876     0.876       499


🧪 Set_2 — Evaluation against true labels:
              precision    recall  f1-score   support

           0      0.859     0.994     0.922       362
           1      0.975     0.569     0.719       137

    accuracy                          0.878       499
   macro avg      0.917     0.782     0.820       499
weighted avg      0.891     0.878     0.866       499



In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

# Make sure stopwords are downloaded
# import nltk; nltk.download('stopwords')
dutch_stopwords = stopwords.words('dutch')

# Load data
df_train = pd.read_excel("Trainig_data.xlsx")
df_test = pd.read_excel("Testset.xlsx")
df_train.columns = df_test.columns = ['question', 'label']
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

# Vectorize
vectorizer = TfidfVectorizer(max_features=5000, stop_words=dutch_stopwords)
X_train = vectorizer.fit_transform(df_train['question'])
y_train = df_train['label']
X_test = vectorizer.transform(df_test['question'])
y_test = df_test['label']

# Store original test set
results_df = df_test.copy()

# Models
models = {
    "Naive_Bayes": MultinomialNB(alpha=0.01),
    "Logistic_Regression": LogisticRegression(
        C=10, class_weight='balanced', penalty='l2', solver='saga', max_iter=1000
    ),
    "SVM": SVC(C=1, class_weight='balanced'),
    "Random_Forest": RandomForestClassifier(
        class_weight='balanced', max_depth=None, n_estimators=250
    ),
    "XGBoost": XGBClassifier(
        max_depth=4, learning_rate=0.3, n_estimators=250,
        use_label_encoder=False, eval_metric='logloss'
    )
}

# Train, evaluate, and store predictions
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results_df[name + '_pred'] = preds
    print(f"\n🧪 {name}:\n")
    print(classification_report(y_test, preds, digits=3))

# Save predictions to CSV
results_df.to_csv("all_model_predictions.csv", index=False)




🧪 Naive_Bayes:

              precision    recall  f1-score   support

           0      0.795     0.934     0.859       362
           1      0.676     0.365     0.474       137

    accuracy                          0.778       499
   macro avg      0.735     0.649     0.666       499
weighted avg      0.762     0.778     0.753       499


🧪 Logistic_Regression:

              precision    recall  f1-score   support

           0      0.931     0.936     0.934       362
           1      0.830     0.818     0.824       137

    accuracy                          0.904       499
   macro avg      0.880     0.877     0.879       499
weighted avg      0.903     0.904     0.904       499


🧪 SVM:

              precision    recall  f1-score   support

           0      0.911     0.964     0.937       362
           1      0.888     0.752     0.814       137

    accuracy                          0.906       499
   macro avg      0.900     0.858     0.876       499
weighted avg      0.905

In [17]:
import pandas as pd

# Load the files
bert_df = pd.read_csv("bert_predictions.csv")
models_df = pd.read_csv("all_model_predictions.csv")
regex_df = pd.read_csv("regex_predictions.csv")

# Add a temporary row index to preserve order
bert_df["row_index"] = range(len(bert_df))
models_df["row_index"] = range(len(models_df))
regex_df["row_index"] = range(len(regex_df))

# Standardize column names
bert_df = bert_df.rename(columns={"true_label": "label"})
models_df = models_df.rename(columns={"label": "label"})
regex_df = regex_df.rename(columns={"label": "label"})

# Merge on row_index (not question/label) to preserve order
combined = pd.merge(bert_df, models_df.drop(columns=["question", "label"]), on="row_index")
combined = pd.merge(combined, regex_df.drop(columns=["question", "label"]), on="row_index")

# Drop row_index and reorder columns
combined = combined.drop(columns=["row_index"])
cols = ['question', 'label'] + [col for col in combined.columns if col not in ['question', 'label']]
combined = combined[cols]

# Save to file
combined.to_csv("combined_predictions.csv", index=False)


In [18]:
from scipy.stats import friedmanchisquare
import numpy as np
import pandas as pd

df = pd.read_csv("combined_predictions.csv")
models = [col for col in df.columns if col.endswith('_pred') or col.endswith('_match')]

# Binary correct predictions
correct_matrix = np.array([(df[model] == df['label']).astype(int) for model in models]).T

# Friedman test
stat, p = friedmanchisquare(*correct_matrix.T)
print(f"Friedman test statistic = {stat:.3f}, p-value = {p:.6f}")


Friedman test statistic = 116.042, p-value = 0.000000


In [19]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar
from itertools import combinations

# Load predictions
df = pd.read_csv("combined_predictions.csv")
models = [col for col in df.columns if col.endswith('_label') or col.endswith('_pred') or col.endswith('_match')]

# Ground truth
y_true = df['label'].values

# Prepare p-value result matrix
p_matrix = pd.DataFrame(index=models, columns=models, dtype=float)

# McNemar test function
def mcnemar_p(y_true, pred1, pred2):
    table = [[0, 0], [0, 0]]
    for y, a, b in zip(y_true, pred1, pred2):
        correct_a = int(a == y)
        correct_b = int(b == y)
        table[correct_b][correct_a] += 1
    result = mcnemar(table, exact=True)
    return result.pvalue

# Compare all model pairs
for model1, model2 in combinations(models, 2):
    p = mcnemar_p(y_true, df[model1], df[model2])
    p_matrix.loc[model1, model2] = p
    p_matrix.loc[model2, model1] = p  # symmetric

# Fill diagonal with 1s
np.fill_diagonal(p_matrix.values, 1.0)

# Print results
print("\n📊 McNemar's test p-values (lower = significant difference):")
print(p_matrix.round(4))




📊 McNemar's test p-values (lower = significant difference):
                          predicted_label  Naive_Bayes_pred  \
predicted_label                    1.0000               0.0   
Naive_Bayes_pred                   0.0000               1.0   
Logistic_Regression_pred           0.8746               0.0   
SVM_pred                           1.0000               0.0   
Random_Forest_pred                 1.0000               0.0   
XGBoost_pred                       0.6718               0.0   
Set_1_match                        0.0440               0.0   
Set_2_match                        0.0627               0.0   

                          Logistic_Regression_pred  SVM_pred  \
predicted_label                             0.8746    1.0000   
Naive_Bayes_pred                            0.0000    0.0000   
Logistic_Regression_pred                    1.0000    1.0000   
SVM_pred                                    1.0000    1.0000   
Random_Forest_pred                          1.0000 

In [20]:
def print_contingency(y_true, pred1, pred2, name1, name2):
    table = [[0, 0], [0, 0]]
    for y, a, b in zip(y_true, pred1, pred2):
        correct_a = int(a == y)
        correct_b = int(b == y)
        table[correct_b][correct_a] += 1
    print(f"\n{name1} vs {name2} contingency table (McNemar):")
    print(pd.DataFrame(table, columns=[f"{name2} correct", f"{name2} wrong"], 
                             index=[f"{name1} correct", f"{name1} wrong"]))


In [21]:
print_contingency(df['label'], df['predicted_label'], df['Logistic_Regression_pred'], 'BERT', 'LogReg')



BERT vs LogReg contingency table (McNemar):
              LogReg correct  LogReg wrong
BERT correct              27            21
BERT wrong                19           432


In [22]:
import pandas as pd

# Load predictions
df = pd.read_csv("combined_predictions.csv")

# Count correct predictions by BERT
bert_correct = (df['predicted_label'] == df['label']).sum()
total = len(df)

print(f"BERT predicted correctly on {bert_correct} out of {total} samples.")
print(f"Accuracy: {bert_correct / total:.3f}")


BERT predicted correctly on 453 out of 499 samples.
Accuracy: 0.908


In [23]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar

# Load your combined predictions
df = pd.read_csv("combined_predictions.csv")

# Extract relevant predictions
y_true = df['label']
bert = df['predicted_label']
logreg = df['Logistic_Regression_pred']

# Count how many predictions each model got correct
bert_correct = (bert == y_true)
logreg_correct = (logreg == y_true)

print(f"BERT correct: {bert_correct.sum()} / {len(df)}")
print(f"LogReg correct: {logreg_correct.sum()} / {len(df)}")

# Build McNemar contingency table from scratch
# Rows: BERT (correct vs wrong)
# Cols: LogReg (correct vs wrong)
table = pd.crosstab(bert_correct, logreg_correct)
table.index = ['BERT wrong', 'BERT correct']
table.columns = ['LogReg wrong', 'LogReg correct']

print("\n✅ Contingency Table:")
print(table)

# Optional: McNemar p-value
print("\nMcNemar's test p-value:", mcnemar(table.values, exact=True).pvalue)


BERT correct: 453 / 499
LogReg correct: 451 / 499

✅ Contingency Table:
              LogReg wrong  LogReg correct
BERT wrong              27              19
BERT correct            21             432

McNemar's test p-value: 0.8746293123804207


In [24]:
from sklearn.metrics import f1_score
import numpy as np

def bootstrap_f1_diff(y_true, y_pred1, y_pred2, n=1000):
    diffs = []
    for _ in range(n):
        idx = np.random.choice(len(y_true), len(y_true), replace=True)
        f1_1 = f1_score(y_true[idx], y_pred1[idx])
        f1_2 = f1_score(y_true[idx], y_pred2[idx])
        diffs.append(f1_1 - f1_2)
    return np.mean(diffs), np.percentile(diffs, [2.5, 97.5])

# Example use
import pandas as pd
df = pd.read_csv("combined_predictions.csv")

y_true = df['label'].values
bert = df['predicted_label'].values
logreg = df['Logistic_Regression_pred'].values

mean_diff, ci = bootstrap_f1_diff(y_true, bert, logreg)

print(f"🔍 BERT vs Logistic Regression — F1 difference:")
print(f"Mean diff: {mean_diff:.4f}")
print(f"95% CI:    [{ci[0]:.4f}, {ci[1]:.4f}]")



🔍 BERT vs Logistic Regression — F1 difference:
Mean diff: 0.0090
95% CI:    [-0.0365, 0.0569]
