In [None]:
import numpy as np
import json
from scipy.spatial.distance import cdist

# Load the necessary data files
with open('../../model/documents_dict.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

with open('../../model/encoded_segments.json', 'r', encoding='utf-8') as f:
    encoded_segments = json.load(f)

with open('../../model/segment_encodings.json', 'r', encoding='utf-8') as f:
    segment_encodings = json.load(f)  

with open('../../model/sat_segments_dict.json', 'r', encoding='utf-8') as f:
    sat_segments_dict = json.load(f)

In [6]:
# Define the SAT topic
sat_topic = 'referen'

# Creating list of SAT topics.
topics = list(sat_segments_dict.keys())

In [7]:
total_TP = 0
total_FN = 0
total_FP = 0
total_TN = 0

for sat_topic in topics:    
    # Print selected SAT topic
    print('Selected SAT:', sat_topic)

    # Step 1: Get the encoding vectors for the SAT
    sat_segment_ids = sat_segments_dict[sat_topic]
    sat_segment_indices = [encoded_segments.index(segment_id) for segment_id in sat_segment_ids]
    sat_encodings = [segment_encodings[index] for index in sat_segment_indices]

    print('Number of segments in SAT:', len(sat_segment_ids))
    print()

    # Step 2: Build a similarity matrix with SAT segments in rows and corpus segments in columns
    print('Building matrix…')
    sim_matrix = cdist(sat_encodings, segment_encodings, metric='cosine')  # Cosine similarity
    print('Similarity matrix dimensions:', sim_matrix.shape)
    print()

    # Step 3: Set a similarity threshold for classification
    threshold = 0.8

    # Step 4: Initialize counters for True Positives (TP), False Negatives (FN), False Positives (FP), and True Negatives (TN)
    TP = 0
    FN = 0
    FP = 0
    TN = 0

    # Step 5: For each segment, classify based on similarity threshold
    for idx, sentence_id in enumerate(encoded_segments):
        # Get the similarity values for each SAT segment to the current sentence
        similarities = sim_matrix[:, idx]

        # Segments predicted to be similar (above threshold)
        predicted_similar = similarities >= threshold

        # Segments that are actually part of the SAT topic (actual similar)
        actual_similar = sentence_id in sat_segments_dict[sat_topic]

        # True Positives (TP): Segments that were correctly predicted as similar to the SAT topic
        if predicted_similar.any() and actual_similar:
            TP += 1

        # False Negatives (FN): Segments that are part of the SAT topic but predicted below the threshold
        if not predicted_similar.any() and actual_similar:
            FN += 1

        # False Positives (FP): Segments that were incorrectly predicted as similar to the SAT topic
        if predicted_similar.any() and not actual_similar:
            FP += 1

        # True Negatives (TN): Segments that were correctly predicted as not similar to the SAT topic
        if not predicted_similar.any() and not actual_similar:
            TN += 1

    # Output the results
    print(f"True Positives (TP): {TP}")
    print(f"False Negatives (FN): {FN}")
    print(f"False Positives (FP): {FP}")
    print(f"True Negatives (TN): {TN}")


Selected SAT: referen
Number of segments in SAT: 348

Building matrix…
Similarity matrix dimensions: (348, 163596)

True Positives (TP): 348
False Negatives (FN): 0
False Positives (FP): 163248
True Negatives (TN): 0
Selected SAT: flag
Number of segments in SAT: 74

Building matrix…
Similarity matrix dimensions: (74, 163596)

True Positives (TP): 58
False Negatives (FN): 16
False Positives (FP): 163482
True Negatives (TN): 40
Selected SAT: anthem
Number of segments in SAT: 79

Building matrix…
Similarity matrix dimensions: (79, 163596)

True Positives (TP): 63
False Negatives (FN): 16
False Positives (FP): 163474
True Negatives (TN): 43
Selected SAT: seprel
Number of segments in SAT: 64

Building matrix…
Similarity matrix dimensions: (64, 163596)

True Positives (TP): 56
False Negatives (FN): 8
False Positives (FP): 163514
True Negatives (TN): 18
Selected SAT: freerel
Number of segments in SAT: 78

Building matrix…
Similarity matrix dimensions: (78, 163596)

True Positives (TP): 64
Fal

Similarity matrix dimensions: (16, 163596)

True Positives (TP): 16
False Negatives (FN): 0
False Positives (FP): 163576
True Negatives (TN): 4
Selected SAT: transfer
Number of segments in SAT: 82

Building matrix…
Similarity matrix dimensions: (82, 163596)

True Positives (TP): 79
False Negatives (FN): 3
False Positives (FP): 163514
True Negatives (TN): 0
Selected SAT: water
Number of segments in SAT: 45

Building matrix…
Similarity matrix dimensions: (45, 163596)

True Positives (TP): 45
False Negatives (FN): 0
False Positives (FP): 163551
True Negatives (TN): 0
Selected SAT: opgroup
Number of segments in SAT: 28

Building matrix…
Similarity matrix dimensions: (28, 163596)

True Positives (TP): 28
False Negatives (FN): 0
False Positives (FP): 163568
True Negatives (TN): 0
Selected SAT: lhseats
Number of segments in SAT: 118

Building matrix…
Similarity matrix dimensions: (118, 163596)

True Positives (TP): 118
False Negatives (FN): 0
False Positives (FP): 163478
True Negatives (TN): 

True Positives (TP): 40
False Negatives (FN): 0
False Positives (FP): 163555
True Negatives (TN): 1
Selected SAT: conres
Number of segments in SAT: 77

Building matrix…
Similarity matrix dimensions: (77, 163596)

True Positives (TP): 70
False Negatives (FN): 7
False Positives (FP): 163494
True Negatives (TN): 25
Selected SAT: conlim
Number of segments in SAT: 40

Building matrix…
Similarity matrix dimensions: (40, 163596)

True Positives (TP): 32
False Negatives (FN): 8
False Positives (FP): 163437
True Negatives (TN): 119
Selected SAT: conterm
Number of segments in SAT: 66

Building matrix…
Similarity matrix dimensions: (66, 163596)

True Positives (TP): 31
False Negatives (FN): 35
False Positives (FP): 163381
True Negatives (TN): 149
Selected SAT: dignity
Number of segments in SAT: 275

Building matrix…
Similarity matrix dimensions: (275, 163596)

True Positives (TP): 275
False Negatives (FN): 0
False Positives (FP): 163321
True Negatives (TN): 0
Selected SAT: region
Number of segmen

Similarity matrix dimensions: (139, 163596)

True Positives (TP): 139
False Negatives (FN): 0
False Positives (FP): 163457
True Negatives (TN): 0
Selected SAT: uncon
Number of segments in SAT: 243

Building matrix…
Similarity matrix dimensions: (243, 163596)

True Positives (TP): 243
False Negatives (FN): 0
False Positives (FP): 163353
True Negatives (TN): 0
Selected SAT: solid
Number of segments in SAT: 179

Building matrix…
Similarity matrix dimensions: (179, 163596)

True Positives (TP): 179
False Negatives (FN): 0
False Positives (FP): 163417
True Negatives (TN): 0
Selected SAT: ethincl
Number of segments in SAT: 53

Building matrix…
Similarity matrix dimensions: (53, 163596)

True Positives (TP): 53
False Negatives (FN): 0
False Positives (FP): 163543
True Negatives (TN): 0
Selected SAT: intorgs
Number of segments in SAT: 290

Building matrix…
Similarity matrix dimensions: (290, 163596)

True Positives (TP): 290
False Negatives (FN): 0
False Positives (FP): 163306
True Negatives (

True Positives (TP): 40
False Negatives (FN): 11
False Positives (FP): 163495
True Negatives (TN): 50
Selected SAT: prerel
Number of segments in SAT: 62

Building matrix…
Similarity matrix dimensions: (62, 163596)

True Positives (TP): 55
False Negatives (FN): 7
False Positives (FP): 163508
True Negatives (TN): 26
Selected SAT: speedtri
Number of segments in SAT: 118

Building matrix…
Similarity matrix dimensions: (118, 163596)

True Positives (TP): 118
False Negatives (FN): 0
False Positives (FP): 163478
True Negatives (TN): 0
Selected SAT: falseimp
Number of segments in SAT: 76

Building matrix…
Similarity matrix dimensions: (76, 163596)

True Positives (TP): 71
False Negatives (FN): 5
False Positives (FP): 163504
True Negatives (TN): 16
Selected SAT: examwit
Number of segments in SAT: 66

Building matrix…
Similarity matrix dimensions: (66, 163596)

True Positives (TP): 63
False Negatives (FN): 3
False Positives (FP): 163521
True Negatives (TN): 9
Selected SAT: juvenile
Number of seg

Similarity matrix dimensions: (264, 163596)

True Positives (TP): 264
False Negatives (FN): 0
False Positives (FP): 163332
True Negatives (TN): 0
Selected SAT: ombuds
Number of segments in SAT: 77

Building matrix…
Similarity matrix dimensions: (77, 163596)

True Positives (TP): 77
False Negatives (FN): 0
False Positives (FP): 163519
True Negatives (TN): 0
Selected SAT: hogdec
Number of segments in SAT: 47

Building matrix…
Similarity matrix dimensions: (47, 163596)

True Positives (TP): 47
False Negatives (FN): 0
False Positives (FP): 163549
True Negatives (TN): 0
Selected SAT: unamend
Number of segments in SAT: 34

Building matrix…
Similarity matrix dimensions: (34, 163596)

True Positives (TP): 34
False Negatives (FN): 0
False Positives (FP): 163559
True Negatives (TN): 3
Selected SAT: concop
Number of segments in SAT: 6

Building matrix…
Similarity matrix dimensions: (6, 163596)

True Positives (TP): 4
False Negatives (FN): 2
False Positives (FP): 163020
True Negatives (TN): 570
Se

True Positives (TP): 98
False Negatives (FN): 0
False Positives (FP): 163498
True Negatives (TN): 0
Selected SAT: finsup2
Number of segments in SAT: 51

Building matrix…
Similarity matrix dimensions: (51, 163596)

True Positives (TP): 49
False Negatives (FN): 2
False Positives (FP): 163545
True Negatives (TN): 0
Selected SAT: freecomp
Number of segments in SAT: 63

Building matrix…
Similarity matrix dimensions: (63, 163596)

True Positives (TP): 62
False Negatives (FN): 1
False Positives (FP): 163533
True Negatives (TN): 0
Selected SAT: jury
Number of segments in SAT: 39

Building matrix…
Similarity matrix dimensions: (39, 163596)

True Positives (TP): 37
False Negatives (FN): 2
False Positives (FP): 163541
True Negatives (TN): 16
Selected SAT: hocterml
Number of segments in SAT: 26

Building matrix…
Similarity matrix dimensions: (26, 163596)

True Positives (TP): 17
False Negatives (FN): 9
False Positives (FP): 162991
True Negatives (TN): 579
Selected SAT: execindp
Number of segments 

Similarity matrix dimensions: (56, 163596)

True Positives (TP): 56
False Negatives (FN): 0
False Positives (FP): 163540
True Negatives (TN): 0
Selected SAT: dueproc
Number of segments in SAT: 38

Building matrix…
Similarity matrix dimensions: (38, 163596)

True Positives (TP): 36
False Negatives (FN): 2
False Positives (FP): 163543
True Negatives (TN): 15
Selected SAT: housenum
Number of segments in SAT: 67

Building matrix…
Similarity matrix dimensions: (67, 163596)

True Positives (TP): 65
False Negatives (FN): 2
False Positives (FP): 163520
True Negatives (TN): 9
Selected SAT: uhlead
Number of segments in SAT: 44

Building matrix…
Similarity matrix dimensions: (44, 163596)

True Positives (TP): 42
False Negatives (FN): 2
False Positives (FP): 163522
True Negatives (TN): 30
Selected SAT: recvote
Number of segments in SAT: 11

Building matrix…
Similarity matrix dimensions: (11, 163596)

True Positives (TP): 2
False Negatives (FN): 9
False Positives (FP): 158078
True Negatives (TN): 5

Similarity matrix dimensions: (40, 163596)

True Positives (TP): 32
False Negatives (FN): 8
False Positives (FP): 163030
True Negatives (TN): 526
Selected SAT: ordterml
Number of segments in SAT: 9

Building matrix…
Similarity matrix dimensions: (9, 163596)

True Positives (TP): 8
False Negatives (FN): 1
False Positives (FP): 161642
True Negatives (TN): 1945
Selected SAT: truthcom
Number of segments in SAT: 4

Building matrix…
Similarity matrix dimensions: (4, 163596)

True Positives (TP): 4
False Negatives (FN): 0
False Positives (FP): 163489
True Negatives (TN): 103
Selected SAT: civilmil
Number of segments in SAT: 4

Building matrix…
Similarity matrix dimensions: (4, 163596)

True Positives (TP): 2
False Negatives (FN): 2
False Positives (FP): 162965
True Negatives (TN): 627
Selected SAT: ecrdet
Number of segments in SAT: 5

Building matrix…
Similarity matrix dimensions: (5, 163596)

True Positives (TP): 0
False Negatives (FN): 5
False Positives (FP): 143417
True Negatives (TN): 201