# Discrepancies

In [5]:
# import libraries

import json

import numpy as np

from sentence_transformers import SentenceTransformer, util

In [6]:
# Instantiate the pre-trained SentenceTransformer checkpoint used by the discrepancy check

DISCREPANCY_MODEL_NAME = 'all-MiniLM-L6-v2'
discrepancy_model = SentenceTransformer(DISCREPANCY_MODEL_NAME)

In [7]:
def answer_discrepancy(options : list[str], answer : int, prediction : int) -> bool:
    '''
    Determines if there is a considerable discrepancy between the answer and the prediction.

    Every option is embedded with the module-level ``discrepancy_model`` and ranked by
    cosine distance (1 - cosine similarity) from the option at index ``answer``. The
    prediction counts as a discrepancy when its index is among the
    ``(len(options) + 1) // 2`` options that are farthest from the answer.

    Args:
        options: texts of the selectable options.
        answer: index into ``options`` of the real answer.
        prediction: index into ``options`` of the predicted answer.

    Returns:
        True if ``prediction`` is one of the most distant options from ``answer``;
        False otherwise, and always False when ``prediction == answer``.
    '''
    # An identical choice can never be a discrepancy. Without this guard a
    # single-option question (or options with identical text) would rank the
    # answer itself among the "most distant" half and report True.
    if prediction == answer:
        return False

    # Encode all options in one batched call instead of one model call per option.
    embeddings = discrepancy_model.encode(options, convert_to_tensor=True)
    # cos_sim yields a 1 x len(options) similarity matrix; take its single row.
    similarities = util.cos_sim(embeddings[answer], embeddings)[0]
    distances = [1 - similarity for similarity in similarities.tolist()]
    # Flag the farther half of the options, rounded up (e.g. 3 out of 5).
    threshold_index = (len(options) + 1) // 2
    most_distant_indices = {int(index) for index in np.argsort(distances)[-threshold_index:]}
    return prediction in most_distant_indices

In [8]:
# Smoke-check the discrepancy logic on a hand-written question

options = [
    "More than 10 drinks",
    "6-10 drinks",
    "1-5 drinks",
    "Occasionally, but less than once a week",
    "Never"
]
answer, prediction = 0, 2
result = answer_discrepancy(options, answer, prediction)
# Report the verdict with a single print; the message depends on the result.
print(
    f"[{options[answer]}] is not similar to [{options[prediction]}]"
    if result
    else f"[{options[answer]}] is similar to [{options[prediction]}]"
)

[More than 10 drinks] is not similar to [1-5 drinks]


In [9]:
# load real answers

data_folder = "../../../data/applicants"

# Pass the encoding explicitly: without it open() decodes with the platform's
# locale default, which can fail on non-ASCII JSON content.
with open(f"{data_folder}/answers.json", 'r', encoding='utf-8') as file:
    answers = json.load(file)

In [41]:
# load records

data_folder = '../../../data/questionnaires'

# Pass the encoding explicitly: without it open() decodes with the platform's
# locale default, which can fail on non-ASCII JSON content.
with open(f"{data_folder}/records/dynamic_2.json", 'r', encoding='utf-8') as file:
    records = json.load(file)

In [42]:
# count answer discrepancies

total_predictions = 0
total_discrepancies = 0
min_confidence = 0.7
total_questionnaires = 85

# The key is bound to `applicant_id` rather than `id`, so the builtin id() is
# not shadowed for the rest of the notebook.
for applicant_id, record in records.items():
    for item in record:
        if 'prediction' in item:
            answer = answers[applicant_id][item['factor']]
            # Store the real answer next to the prediction on the record item.
            item['answer'] = answer
            total_predictions += 1
            # NOTE(review): 'health_diabetes' predictions are included in
            # total_predictions but can never count as discrepancies, because the
            # skip happens after the increment — confirm this is intended.
            if item['factor'] == 'health_diabetes':
                continue
            # Only predictions at or above the confidence threshold can be discrepancies.
            if item['confidence'] >= min_confidence and item['prediction'] != answer:
                total_discrepancies += 1

# Guard the ratios so that records without any prediction report 0 instead of
# raising ZeroDivisionError.
ratio = (total_discrepancies / total_predictions) if total_predictions else 0.0
ppq = (total_predictions / total_questionnaires)
dpq = (total_discrepancies / total_questionnaires)
# Algebraically the same value as `ratio`; kept so the printed report is unchanged.
discrepancy_rate = (dpq / ppq) if ppq else 0.0

print(f"Total predictions: {total_predictions}")
print(f"Total discrepancies: {total_discrepancies}")
print(f"Discrepancies (%): {ratio:.2%}")
print(f"Predictions per questionnaire: {ppq:.2f}")
print(f"Discrepancies per questionnaire: {dpq:.2f}")
print(f"Discrepancies per questionnaire (%): {discrepancy_rate:.2%}")

Total predictions: 722
Total discrepancies: 184
Discrepancies (%): 25.48%
Predictions per questionnaire: 8.49
Discrepancies per questionnaire: 2.16
Discrepancies per questionnaire (%): 25.48%
