In [2]:
import nltk
# Fetch the NLTK 'punkt' tokenizer package into the local NLTK data directory.
# NOTE(review): presumably required for the sentence-level granularity used by
# the SummaC models below -- confirm against the summac package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import numpy as np
import pandas as pd
from summac.model_summac import SummaCZS, SummaCConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score,f1_score
from tqdm import tqdm

In [4]:
# SummaCZS scorer working at sentence granularity on the "vitc" model, placed on the GPU.
model_zs = SummaCZS(
    granularity="sentence",
    model_name="vitc",
    device="cuda",
)

# SummaCConv scorer over the same "vitc" model, also on the GPU;
# start_file="default" selects the packaged starting weights.
model_conv = SummaCConv(
    models=["vitc"],
    bins='percentile',
    granularity="sentence",
    nli_labels="e",
    device="cuda",
    start_file="default",
    agg="mean",
)

<All keys matched successfully>


  print(self.load_state_dict(torch.load(start_file)))


In [5]:
# CPU fallback: use these two lines instead of the GPU versions above when no GPU is available.
# model_zs = SummaCZS(granularity="sentence", model_name="vitc", device="cpu")
# model_conv = SummaCConv(models=["vitc"], bins='percentile', granularity="sentence", nli_labels="e", device="cpu", start_file="default", agg="mean")

In [5]:
# VALIDATION SET -> Helping find best Threshold for Binary Classification
# Takes validation set and a model as input. Model can be model_zs or model_conv
def get_best_threshold(validation_df, model, thresholds=None, verbose=True):
    """Pick the score threshold that maximises accuracy on a validation set.

    Each row's 'Scraped Content' (document) is scored against its 'Headline'
    (claim) with ``model.score``. Every candidate threshold then converts the
    scores into 0/1 predictions (1 when score >= threshold), which are compared
    with the 'Actual Decision' column.

    Args:
        validation_df: DataFrame with 'Scraped Content', 'Headline' and
            'Actual Decision' columns. Labels are assumed to be 0/1 because
            predictions are produced as 0/1 -- TODO confirm against the data.
        model: Object whose ``score(documents, claims)`` returns a dict with a
            "scores" list (e.g. model_zs or model_conv).
        thresholds: Optional iterable of candidate thresholds. Defaults to the
            grid -1.00, -0.99, ..., 1.00.
        verbose: When True (default), print accuracy / precision / recall / F1
            for every candidate threshold. The final summary is always printed.

    Returns:
        The candidate threshold with the highest accuracy. On ties the
        candidate that appears first wins.

    Raises:
        ValueError: If ``validation_df`` has no rows or ``thresholds`` is empty.
    """
    # Fail fast with a clear message instead of an opaque metric error later.
    if len(validation_df) == 0:
        raise ValueError("validation_df is empty; cannot select a threshold")

    if thresholds is None:
        thresholds = [i * 0.01 for i in range(-100, 101)]
    else:
        thresholds = list(thresholds)
    if not thresholds:
        raise ValueError("thresholds must contain at least one candidate")

    true_labels = []
    predicted_scores = []

    # Iterate the three needed columns directly rather than building a Series
    # per row with iterrows().
    rows = zip(
        validation_df['Scraped Content'],
        validation_df['Headline'],
        validation_df['Actual Decision'],
    )
    for document, claim, label in tqdm(rows, total=len(validation_df)):
        result = model.score([str(document)], [str(claim)])
        true_labels.append(label)
        predicted_scores.append(result["scores"][0])

    # Start from the first candidate so the returned value is always one of
    # the thresholds that was actually evaluated.
    best_threshold = thresholds[0]
    best_accuracy = -1.0

    for threshold in thresholds:
        predicted_labels = [1 if value >= threshold else 0 for value in predicted_scores]
        accuracy = accuracy_score(true_labels, predicted_labels)

        if verbose:
            # These metrics are only reported, so skip them entirely in quiet mode.
            precision = precision_score(true_labels, predicted_labels, zero_division=1)
            # zero_division=0 yields the same value as the default but without
            # emitting an UndefinedMetricWarning for degenerate thresholds.
            recall = recall_score(true_labels, predicted_labels, zero_division=0)
            f1 = f1_score(true_labels, predicted_labels, zero_division=0)

            print(f"Threshold: {threshold:.2f}")
            print(f"Accuracy: {accuracy:.3f}")
            print(f"Precision: {precision:.3f}")
            print(f"Recall: {recall:.3f}")
            print(f"F1 Score: {f1:.3f}")
            print('-' * 50)

        # Strict '>' keeps the earliest candidate when several tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = threshold

    print(f"Best Threshold: {best_threshold:.2f}")
    print(f"Best Accuracy: {best_accuracy:.3f}")

    try:
        roc_auc = roc_auc_score(true_labels, predicted_scores)
        print(f"ROC-AUC: {roc_auc:.3f}")
    except ValueError as exc:
        # roc_auc_score can raise ValueError (e.g. only one class present in
        # the labels); that must not discard the threshold already selected.
        print(f"ROC-AUC: undefined ({exc})")
    return best_threshold


In [6]:
# TEST SET
def test_accuracy(test_df,best_threshold,model):
    true_test_labels = []
    predicted_test_scores = []

    for index, row in tqdm(test_df.iterrows(),total=len(test_df)):
        document = row['Scraped Content']
        claim = row['Headline']
        score = model.score([str(document)], [str(claim)])

        true_test_labels.append(row['Actual Decision'])
        predicted_test_scores.append(score["scores"][0])

    # Convert predicted scores to binary labels using the best_threshold
    predicted_test_labels = [1 if score >= best_threshold else 0 for score in predicted_test_scores]

    test_accuracy = accuracy_score(true_test_labels, predicted_test_labels)
    test_precision = precision_score(true_test_labels, predicted_test_labels, zero_division=1)
    test_recall = recall_score(true_test_labels, predicted_test_labels)
    test_f1 = f1_score(true_test_labels, predicted_test_labels)
    test_roc_auc = roc_auc_score(true_test_labels, predicted_test_scores)

    print(f"Test Accuracy: {test_accuracy:.3f}")
    print(f"Test Precision: {test_precision:.3f}")
    print(f"Test Recall: {test_recall:.3f}")
    print(f"Test F1 Score: {test_f1:.3f}")
    print(f"Test ROC-AUC: {test_roc_auc:.3f}")

In [8]:
# Article pipeline, evaluated with model_zs.
df = pd.read_csv(
    '/content/Pipeline_Article.csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_zs)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_zs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/235M [00:00<?, ?B/s]

100%|██████████| 138/138 [01:19<00:00,  1.73it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.507
Precision: 0.504
Recall: 1.000
F1 Score: 0.670
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.507
Precision: 0.504
Recall: 1.000
F1 Score: 0.670
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.529
Precision: 0.515
Recall: 1.000
F1 Score: 0.680
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.536
Precision: 0.519
Recall: 1.000
F1 Score: 0.683
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.543
Precision: 0.523
Recall: 1.000
F1 Score: 0.687
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.543
Precision: 0.523
Recall: 1.000
F1 Score: 0.687
---

100%|██████████| 549/549 [02:50<00:00,  3.22it/s]

Test Accuracy: 0.843
Test Precision: 0.902
Test Recall: 0.771
Test F1 Score: 0.831
Test ROC-AUC: 0.915





In [9]:
# Article pipeline, evaluated with model_conv.
df = pd.read_csv(
    '/content/Pipeline_Article.csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_conv)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_conv)

  histograms = torch.FloatTensor(histograms).to(self.device)
100%|██████████| 138/138 [00:48<00:00,  2.83it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
---

100%|██████████| 549/549 [02:32<00:00,  3.60it/s]

Test Accuracy: 0.836
Test Precision: 0.960
Test Recall: 0.702
Test F1 Score: 0.811
Test ROC-AUC: 0.883





In [10]:
# QNA pipeline, evaluated with model_zs.
df = pd.read_csv(
    '/content/Pipeline_QNA.csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_zs)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_zs)

100%|██████████| 138/138 [00:12<00:00, 11.49it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.507
Precision: 0.504
Recall: 1.000
F1 Score: 0.670
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.536
Precision: 0.519
Recall: 1.000
F1 Score: 0.683
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.565
Precision: 0.535
Recall: 1.000
F1 Score: 0.697
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.580
Precision: 0.543
Recall: 1.000
F1 Score: 0.704
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.601
Precision: 0.556
Recall: 1.000
F1 Score: 0.715
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.601
Precision: 0.556
Recall: 1.000
F1 Score: 0.715
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.609
Precision: 0.561
Recall: 1.000
F1 Score: 0.719
---

100%|██████████| 550/550 [01:05<00:00,  8.40it/s]

Test Accuracy: 0.776
Test Precision: 0.792
Test Recall: 0.749
Test F1 Score: 0.770
Test ROC-AUC: 0.867





In [11]:
# QNA pipeline, evaluated with model_conv.
df = pd.read_csv(
    '/content/Pipeline_QNA.csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_conv)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_conv)

100%|██████████| 138/138 [00:11<00:00, 12.24it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
---

100%|██████████| 550/550 [00:59<00:00,  9.18it/s]

Test Accuracy: 0.625
Test Precision: 0.632
Test Recall: 0.600
Test F1 Score: 0.616
Test ROC-AUC: 0.656





In [14]:
# Mistral pipeline, evaluated with model_zs.
df = pd.read_csv(
    '/content/Pipeline_SLM(Mistral).csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_zs)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_zs)

100%|██████████| 138/138 [00:13<00:00, 10.07it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.522
Precision: 0.511
Recall: 1.000
F1 Score: 0.676
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.522
Precision: 0.511
Recall: 0.986
F1 Score: 0.673
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.536
Precision: 0.519
Recall: 0.986
F1 Score: 0.680
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.551
Precision: 0.527
Recall: 0.986
F1 Score: 0.687
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.572
Precision: 0.540
Recall: 0.986
F1 Score: 0.697
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.572
Precision: 0.540
Recall: 0.986
F1 Score: 0.697
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.580
Precision: 0.544
Recall: 0.986
F1 Score: 0.701
---

100%|██████████| 550/550 [00:44<00:00, 12.47it/s]

Test Accuracy: 0.758
Test Precision: 0.767
Test Recall: 0.742
Test F1 Score: 0.754
Test ROC-AUC: 0.840





In [15]:
# Mistral pipeline, evaluated with model_conv.
df = pd.read_csv(
    '/content/Pipeline_SLM(Mistral).csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_conv)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_conv)

100%|██████████| 138/138 [00:11<00:00, 12.45it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
---

100%|██████████| 550/550 [00:40<00:00, 13.42it/s]

Test Accuracy: 0.658
Test Precision: 0.692
Test Recall: 0.571
Test F1 Score: 0.625
Test ROC-AUC: 0.677





In [12]:
# Phi3 pipeline, evaluated with model_zs.
df = pd.read_csv(
    '/content/Pipeline_SLM(Phi).csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_zs)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_zs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/235M [00:00<?, ?B/s]

100%|██████████| 138/138 [00:23<00:00,  5.99it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.507
Precision: 0.504
Recall: 1.000
F1 Score: 0.670
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.536
Precision: 0.519
Recall: 1.000
F1 Score: 0.683
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.551
Precision: 0.527
Recall: 1.000
F1 Score: 0.690
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.558
Precision: 0.531
Recall: 1.000
F1 Score: 0.693
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.565
Precision: 0.535
Recall: 1.000
F1 Score: 0.697
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.565
Precision: 0.535
Recall: 1.000
F1 Score: 0.697
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.565
Precision: 0.535
Recall: 1.000
F1 Score: 0.697
---

100%|██████████| 549/549 [01:16<00:00,  7.17it/s]

Test Accuracy: 0.709
Test Precision: 0.770
Test Recall: 0.596
Test F1 Score: 0.672
Test ROC-AUC: 0.804





In [13]:
# Phi3 pipeline, evaluated with model_conv.
df = pd.read_csv(
    '/content/Pipeline_SLM(Phi).csv',
    delimiter='|',
)

# Stratified 80:20 split: the 80% part is the test set, the 20% part is the
# validation set used for threshold tuning.
test_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Actual Decision'],
)

# Tune the binary-classification threshold on the validation rows.
best_threshold = get_best_threshold(validation_df, model_conv)

# Report metrics on the test rows at the tuned threshold.
test_accuracy(test_df, best_threshold, model_conv)

  histograms = torch.FloatTensor(histograms).to(self.device)
100%|██████████| 138/138 [00:15<00:00,  9.17it/s]


Threshold: -1.00
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.99
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.98
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.97
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.96
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.95
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.94
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
--------------------------------------------------
Threshold: -0.93
Accuracy: 0.500
Precision: 0.500
Recall: 1.000
F1 Score: 0.667
---

100%|██████████| 549/549 [01:08<00:00,  8.03it/s]

Test Accuracy: 0.654
Test Precision: 0.694
Test Recall: 0.553
Test F1 Score: 0.615
Test ROC-AUC: 0.662





Note: This notebook was run on Google Colab using a T4 GPU.