In [2]:
#Import the required libraries, lots of these are required for the LLMs we utilize for three criteria. 
import Levenshtein
import spacy
import string
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2TokenizerFast, OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import torch
from sentence_transformers import SentenceTransformer, util
from lexicalrichness import LexicalRichness
import re
import math
import random
import numpy as np
nlp = spacy.load('en_core_web_lg')
import time
from datetime import datetime
from collections import Counter
import warnings
import json
warnings.filterwarnings('ignore')

#NLTK Imports
import nltk
nltk.download('stopwords') #Needed for query wellformedness
nltk.download('punkt') #Needed for query wellformedness
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger') #Needed for query wellformedness
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import trigrams
from nltk import ngrams
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize, sent_tokenize

#OpenAI, but could be replaced with Gemini, Claude, etc.
import os
from openai import OpenAI
model_engine = 'gpt-4o' #or the other models 'gpt-4o-mini','chatgpt-4o-latest', 'o1-preview' etc.
# Read the key from the OPENAI_API_KEY environment variable so a real secret never has to be
# pasted into (and saved with) the notebook. When the variable is unset this falls back to the
# original placeholder string, so behaviour is unchanged for anyone who edits the literal instead.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY"),
)

#Libraries for Perplexity, Diversity, Grammatical Error, Complexity, Answerability
from evaluate import load
import language_tool_python

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Steven\AppData\Roamin

## Helper Functions

In [3]:
#Our class used to represent multiple-choice questions
# Newline constant; lets __str__ join the options on a newline without a backslash inside an f-string expression
nl = '\n'
class MultipleChoiceQuestion:
    """Plain container for one multiple-choice question.

    Every constructor argument is stored unchanged on the instance:
      stem            -- the question text
      options         -- the answer choices (joined with newlines by __str__)
      correct_option  -- the correct answer
      qid             -- optional identifier, defaults to None
      quality         -- optional quality label, defaults to None
    """
    def __init__(self, stem, options, correct_option, qid = None, quality = None):
        self.stem, self.options, self.correct_option = stem, options, correct_option
        self.qid, self.quality = qid, quality

    def __str__(self):
        """Render the stem, one option per line, the correct option and the quality label."""
        option_block = nl.join(self.options)
        return f"Question: {self.stem}\n {option_block}\nCorrect option: {self.correct_option}\nQuality: {self.quality}"

def mode_or_average(lst):
    """Return the single most frequent value, or the mean of the two closest values when there is a tie.

    lst -- numbers, or strings convertible with float().

    If exactly one value has the highest frequency it is returned as a float.
    Otherwise the list is sorted and the mean (np.mean) of the adjacent pair
    with the smallest gap is returned; on equal gaps the lowest pair wins.

    Raises ValueError when an element cannot be parsed as a number.
    """
    try:
        values = list(map(float, lst))  # Handles strings of numbers
    except ValueError:
        raise ValueError("All elements in the list must be convertible to numbers.")

    frequencies = Counter(values)
    top_count = max(frequencies.values())
    modes = [value for value, seen in frequencies.items() if seen == top_count]

    # A unique mode wins outright
    if len(modes) == 1:
        return modes[0]

    # Tie for most frequent: fall back to the two values that sit closest together
    ordered = sorted(values)
    smallest_gap = float('inf')
    closest_pair = None
    for low, high in zip(ordered, ordered[1:]):
        gap = high - low
        if gap < smallest_gap:
            smallest_gap, closest_pair = gap, (low, high)

    return np.mean(closest_pair)

#Rating-based LLM Logic used for all verification steps where the LLM is needed
def llm_rating(sysrole, prompt, threshold=5):
    """Collect numeric ratings from the chat model and compare their consensus to ``threshold``.

    sysrole   -- system message sent to the model
    prompt    -- user message sent to the model
    threshold -- minimum consensus rating that counts as a pass (default 5)

    The model is queried repeatedly; only replies that consist solely of digits
    are kept. Sampling stops after three ratings, or after two when both are
    >= 7 or both are <= 4. The kept ratings are reduced with mode_or_average
    and the function returns ``consensus >= threshold``.

    API errors are printed and retried after a 10 second pause.
    NOTE(review): there is no cap on attempts, so a model that never answers
    with a bare number (or an API that keeps failing) loops forever.
    """
    ratings = []
    while True:
        try:
            response = client.chat.completions.create(
                model=model_engine,
                messages=[
                    {"role": "system", "content": sysrole},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=4096,
                temperature=0.7,
            )
            reply = response.choices[0].message.content.lower().strip()
            if reply.isdigit():
                ratings.append(reply)
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

        collected = len(ratings)
        if collected == 3:
            break
        if collected == 2:
            # Two ratings that agree on "clearly good" or "clearly bad" make a third call unnecessary
            both_high = float(ratings[0]) >= 7 and float(ratings[1]) >= 7
            both_low = float(ratings[0]) <= 4 and float(ratings[1]) <= 4
            if both_high or both_low:
                break

    return mode_or_average(ratings) >= threshold

## Implausible Distractors
Make all distractors plausible as good items depend on having effective distractors


In [4]:
# Loaded SentenceTransformer models keyed by model name, so the weights are read once per kernel
# instead of once per question.
_DISTRACTOR_MODEL_CACHE = {}

def _get_distractor_similarity_model(name='all-MiniLM-L6-v2'):
    """Return the sentence-embedding model ``name``, loading it on first use only."""
    if name not in _DISTRACTOR_MODEL_CACHE:
        _DISTRACTOR_MODEL_CACHE[name] = SentenceTransformer(name)
    return _DISTRACTOR_MODEL_CACHE[name]

#NOTE(review): an NER shortcut ("matching entity types => treat the distractor as plausible") was
#described here, but the entity labels were computed and never compared. The dead computation has
#been dropped; results are unchanged. Wire the comparison in below if that shortcut is wanted.
def implausible_distractors(question):
    """Check that no distractor is so unrelated to the correct answer that it is implausible.

    Each distractor (every option except the correct one(s), which are separated
    by '[SEP]' in question.correct_option) is embedded and compared with the
    correct option by cosine similarity. The first distractor scoring below 0.15
    decides the outcome:
      * no nouns found in either text                      -> True (cannot judge)
      * word overlap (Jaccard > .15) or small edit distance -> True
      * implausible_distractors_verify(question) passes     -> True
      * otherwise                                           -> False

    Questions containing an all/none-of-the-above option always return True.
    Returns True when the distractors are accepted, False when one is flagged.
    """
    correct = question.correct_option
    options = question.options.copy()

    for opt in question.correct_option.split('[SEP]'):
        try:
            options.remove(opt.strip())
        except ValueError:
            # list.remove raises ValueError when the text is not an option; keep going best-effort
            print('error trying to remove an option, there might be an incorrect space present: ', opt)

    # Nothing to compare against
    if not options:
        return True

    #If the option in this case is none/all of the above, it won't be similar, so ignore this criteria
    #(question-level check, so it is evaluated once up front rather than inside the loop)
    if not all_of_the_above(question) or not none_of_the_above(question):
        return True

    #MiniLM from: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
    model = _get_distractor_similarity_model()

    #Compute embeddings: the correct answer once, and every distractor (any number of them)
    correct_embedding = model.encode([correct], convert_to_tensor=True)
    distractor_embeddings = model.encode(options, convert_to_tensor=True)

    #Compute cosine-similarities; row 0 holds correct-vs-each-distractor
    cosine_scores = util.cos_sim(correct_embedding, distractor_embeddings)[0]

    for distractor, score in zip(options, cosine_scores):
        if not (score < 0.15):
            continue

        #Couldn't find a noun in either text? Unable to parse effectively to make a judgement.
        if len(get_lemma_nouns(correct)) == 0 and len(get_lemma_nouns(distractor)) == 0:
            return True

        #Low distance like this means it likely shares some words and should not be flagged
        if jaccard_similarity(correct, distractor) > .15 or Levenshtein.distance(correct, distractor) < (len(correct)*.7):
            return True

        #Before saying two distractors are plausible, let's have the LLM make a judgement call
        #If the LLM is too generous/strict on this call, we can try using updated word embeddings from openai which might be better for the domain jargon
        if implausible_distractors_verify(question):
            print('LLM says they are similar: ', correct, ' -and- ', distractor)
            return True

        print("Distractor not similar enough: {} \t\t {} \t\t Score: {:.4f}".format(correct, distractor, score))
        return False

    return True

#Statistic used for gauging the similarity and diversity of text
def jaccard_similarity(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are split on whitespace into sets of words and the result is
    |intersection| / |union|, a float between 0 and 1.

    Returns 0.0 when neither string contains a word (the union is empty), where
    the ratio is undefined and the plain division would raise ZeroDivisionError.
    """
    # Convert strings to sets of words
    set1 = set(str1.split())
    set2 = set(str2.split())

    union = set1 | set2
    if not union:
        return 0.0

    return len(set1 & set2) / len(union)

def implausible_distractors_verify(question):
    """Ask the LLM to rate how plausible the question's distractors are.

    Fills the rubric prompt with question.stem, question.options and
    question.correct_option, then returns whatever llm_rating(sysrole, prompt)
    returns (called with llm_rating's default threshold).
    """
    # System message: frames the model as a reviewer looking for overly implausible distractors
    sysrole = """You are a seasoned academic professional with extensive experience in designing and reviewing multiple-choice assessments. Your primary objective is to identify whether any distractors (incorrect answer choices) in a multiple-choice question are overly implausible—so unrelated or off-topic that students, even those with minimal subject knowledge, would not consider selecting them. Your assessment should focus on how well each distractor aligns with common misconceptions, partial knowledge, or thematic similarity to the correct answer."""
    # User message: 1-10 rubric (higher = more plausible distractors); the model is told to reply with the number only
    prompt = """
Instructions:
    Assign a single numeric score from 1 to 10 indicating the plausibility of the distractors:

    Scoring Guidelines:
        1–3: At least one distractor is clearly off-topic, illogical, or so unrelated to the question, such that no student, even with minimal subject knowledge, would choose it.
        4–6: Distractors have noticeable plausibility issues; there may be some relevance, but at least one distractor still seems very out of place or unconnected to the question or other options.
        7–9: Distractors are reasonably plausible and relevant, though there might be minor clues or inconsistencies that reduce their effectiveness.
        10: All distractors are reasonably related or plausible, so a student with limited knowledge might actually pick any of them due to confusion, misunderstanding, or similarity to the correct answer.
        
    Respond with only the numeric score (1–10) and nothing else. Do not provide an explanation or any additional commentary.

Question: {}
Options: {}
Answer: {}""".format(question.stem, question.options, question.correct_option)
    
    return llm_rating(sysrole, prompt)

## None Of The Above
Avoid "none of the above", as it only really measures students' ability to detect incorrect answers

In [5]:
def none_of_the_above(question):
    """Return False when the question uses a "none of the above"-style option, True otherwise.

    An option is flagged when, ignoring case and surrounding whitespace, it
    contains both 'none' and 'above', starts with 'none of', or is exactly
    'neither'. The final option is also flagged when it merely contains 'none';
    that check is case-insensitive like the others, so a trailing "None" is
    caught (it used to be compared against the raw text and was missed).
    """
    options = question.options
    if not options:
        return True

    # Last-option check, evaluated once instead of on every loop iteration
    if 'none' in options[-1].lower():
        return False

    for opt in options:
        cleaned_opt = opt.strip().lower()
        # ('none' and 'above') also covers the literal phrase 'none of the above'
        if ('none' in cleaned_opt and 'above' in cleaned_opt) or cleaned_opt.startswith('none of') or cleaned_opt == 'neither':
            return False
    return True

## All Of The Above
Avoid all of the above options as students can guess correct responses based on partial information

In [6]:
def all_of_the_above(question):
    """Return False when any option looks like an "all of the above" choice, True otherwise.

    Matching is by substring on the stripped, lower-cased option text: the
    option is flagged if it contains both 'all' and 'above', or any of the
    phrases 'all of the above', 'all if the', 'all of the'.
    """
    flagged_phrases = ('all of the above', 'all if the', 'all of the')

    def _looks_like_all_of_the_above(option):
        text = option.strip().lower()
        if 'all' in text and 'above' in text:
            return True
        return any(phrase in text for phrase in flagged_phrases)

    return not any(_looks_like_all_of_the_above(opt) for opt in question.options)

## Fill-In-The-Blank
Avoid omitting words in the middle of the stem that students must insert from the options provided 

In [7]:
#Programming questions might contain a single underscore, so check for multiple
def fill_in_the_blank(question):
    """Return False when the stem is a fill-in-the-blank item, True otherwise.

    A stem is flagged if it contains a double underscore, or the phrase
    'fill in the blank' in any letter case.
    """
    stem = question.stem
    has_blank_marker = "__" in stem
    return not (has_blank_marker or 'fill in the blank' in stem.lower())

## True/False
The options should not be a series of true/false statements

In [8]:
def true_or_false(question):
    """Return False when the question is a true/false (or yes/no) item, True otherwise.

    Flagged when a single '.'-delimited sentence of the stem mentions both
    'true' and 'false' (case-insensitive), or when any option, stripped and
    lower-cased, is exactly 'true', 'false', 'yes' or 'no'.
    """
    #Check for true & false mentioned together in one sentence of the stem
    for sentence in question.stem.lower().split('.'):
        if 'true' in sentence and 'false' in sentence:
            return False

    binary_answers = ('true', 'false', 'yes', 'no')
    return not any(opt.strip().lower() in binary_answers for opt in question.options)

## Absolute Terms
Avoid the use of absolute terms (e.g. never, always, all) in both the question stem, where they can be confusing, and the options, where students are aware that they are almost always false

In [9]:
#The list of absolute terms can be different for the stem and options, but we need to be careful here, as sometimes these can be used in proper ways
absolutes = ["always", "never", "none", "all", "completely", "absolutely", "totally", "definitely", "incapable", "inevitable"]

def _absolute_words_in(text, terms):
    """Return the subset of ``terms`` that occur in ``text`` as whole words (case-insensitive)."""
    words = {w.strip("'") for w in re.findall(r"[a-z']+", text.lower())}
    return words & set(terms)

def absolute_terms(question):
    """Check the stem and options for absolute terms (always, never, all, ...).

    Returns True when the question passes, False when it is flagged.

    Terms are matched as whole words, so "small" or "usually" no longer count
    as "all" and a term followed by punctuation ("always,") is still found.

    Stem: if an absolute term is present, true/false questions pass
    automatically; otherwise the LLM decides via absolute_terms_verify.
    Options: an absolute term only counts when the question is not an
    all/none-of-the-above or true/false item. "all" is sent to the LLM for a
    judgement call; any other absolute term flags the question outright.
    """
    #Check for terms in the question stem, if we find any, have the LLM help us verify the use of it.
    if _absolute_words_in(question.stem, absolutes):
        if not true_or_false(question):
            return True
        return absolute_terms_verify(question)

    #Check for terms in the options, if we find any, have the LLM help us verify the use of it.
    absolutes_options = ["always", "never", "none", "completely", "absolutely", "totally", "definitely", "incapable", "inevitable", "all"]
    for opt in question.options:
        found = _absolute_words_in(opt, absolutes_options)
        if not found:
            continue

        #Count all, which is a special case, but not in the case of "all of the above"
        if none_of_the_above(question) and all_of_the_above(question) and true_or_false(question):
            if "all" in found:
                return absolute_terms_verify(question)
            return False

    return True

def absolute_terms_verify(prompt):
    """Ask the LLM whether absolute terms in the question give the answer away.

    prompt -- the question under review (an object with stem, options and
              correct_option). A plain string is still accepted for backward
              compatibility and is used as the question text with no options
              and an empty answer.

    The body used to read an undefined name ``question`` instead of its
    argument, so it raised NameError or silently used a stale notebook global.
    Returns the result of llm_rating(sysrole, <filled rubric>).
    """
    if hasattr(prompt, 'stem'):
        stem, options, answer = prompt.stem, prompt.options, prompt.correct_option
    else:
        stem, options, answer = str(prompt), [], ''

    sysrole = """You are an expert educator evaluating multiple-choice questions for the presence of absolute terms. Your task is to examine the question’s text and answer options for any words or phrases that might give away the correct answer or help a student eliminate an incorrect answer due to their overly certain or extreme language."""
    prompt_text = """
Instructions:
    Provide a single numeric score from 1 to 10, reflecting how the presence of absolute terms (e.g., “always”, “never”, “all”, “must”, "none", “only”) might help a student guess or eliminate incorrect options or if the question's stem contains absolute terms that are used in a way which constitutes a blanket generalization or hyperbole.

    Scoring Guidelines:
        1–3: The question or answer choices contain multiple or very strong absolute terms that greatly simplify finding the correct answer.
        4–6: The question or choices contain some absolute terms that might help clue the student toward the answer.
        7–9: The question or choices contain minimal absolute terms with limited impact on revealing the correct answer.
        10: The question and all answer options are free of absolute terms that could reveal the answer.
    
    Your response should only be the number from 1 to 10. Do not include any explanation or additional text.

Question: {}
Options: {}
Answer: {}""".format(stem, options, answer)

    return llm_rating(sysrole, prompt_text)

## Longest Answer Correct
Often the correct option is longer and includes more detailed information, which clues students to this option

In [10]:
#If the correct answer is noticeably longer (more than 25%) than the second longest answer, flag it.
def longest_answer_correct(question):
    """Return False when the correct answer is conspicuously longer than every distractor.

    True/false questions and multi-answer questions ('[SEP]' in
    question.correct_option) always pass. Otherwise the question passes when
    the longest distractor is at least 75% of the correct answer's character
    length, or when the correct answer has fewer than four words.

    Raises ValueError (from list.remove) if the correct option is not among
    question.options.
    """
    #Ignore this criteria for True/False questions
    if not true_or_false(question) or '[SEP]' in question.correct_option:
        return True

    correct = question.correct_option
    distractors = question.options.copy()
    for part in correct.split('[SEP]'):
        distractors.remove(part.strip())

    longest_distractor = max((len(choice) for choice in distractors), default=0)

    #If the correct answer is longer by at most 25%, or it is three words or less, then this passes
    is_short_answer = len(correct.split()) < 4
    return longest_distractor >= len(correct) *.75 or is_short_answer

## Negative worded
Negatively worded stems are less likely to measure important learning outcomes and can confuse students

In [11]:
#The list of negative words can potentially cause this to be too restrictive, particularly for words such as can't and won't
def negative_worded_stem(question):
    """Return False when the stem is negatively worded, True otherwise.

    The stem is flagged when it contains one of the negative words below, or
    when a '.'-delimited sentence containing 'which' or 'what' also contains
    'false', 'not', 'incorrect' or 'except'.

    Matching is on whole words, case-insensitive. Substring matching used to
    flag harmless stems such as "What is another name ..." or "Which notation
    ..." (both contain the letters "not"), while whitespace splitting missed
    negatives followed by punctuation ("not," / "never?").
    """
    negatives = ["none", "never", "without", "exclude", "deny", "refuse", "oppose", "dispute", "can't", "won't", "not"]
    negation_markers = {'false', 'not', 'incorrect', 'except'}

    def _words(text):
        # Normalise typographic apostrophes so "can’t" matches "can't"; strip quote marks around words
        normalised = text.lower().replace("’", "'")
        return {w.strip("'") for w in re.findall(r"[a-z']+", normalised)}

    if _words(question.stem) & set(negatives):
        return False

    for sent in question.stem.split('.'):
        words = _words(sent)
        if ('which' in words or 'what' in words) and (words & negation_markers):
            return False

    return True

## Word Repeats
Avoid similarly worded stems and correct responses or words repeated in the stem and correct response

In [12]:
#Find the nouns in question.correct_option and question.stem --> stem them --> compare cosine similarity (using sentence transformer)
#Also check for the synonyms, compare them. However, if the word(s) are used in the other options, then it's fine.
#Nouns: NN noun, singular ‘- desk’, NNS noun plural – ‘desks’, NNP proper noun, singular – ‘Harrison’, NNPS proper noun, plural – ‘Americans’ 
lemmatizer = WordNetLemmatizer()
nouns = ['NN', 'NNS', 'NNP', 'NNPS']

def word_repeats_in_stem_and_correct_answer(question):
    """Return False when a noun/verb from the stem is repeated only in the correct answer.

    Words from the punctuation-stripped stem that are not stop words and occur
    (as substrings) in question.correct_option are collected; any that also
    occur in a distractor are dropped. If a remaining word has at least four
    characters and at least one remaining word is POS-tagged as a noun or verb:
      * multi-answer question ('[SEP]'): flagged (False) when any remaining
        word occurs in both of the first two correct answers, else True;
      * single-answer question: flagged (False) unless every option's text
        appears in the stem.
    In all other cases returns True.

    Raises ValueError (from list.remove) if a correct option is not among
    question.options.
    """
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    #This code checks for matching words, specifically nouns and verbs, between the correct answer and stem
    word_types = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    stem = strip_punctuation(question.stem)

    # De-duplicate while keeping stem order. The words used to pass through a set, whose
    # iteration order for strings can vary between runs, and the [SEP] branch below
    # depended on whichever word happened to come first.
    matching_words = list(dict.fromkeys(
        wrd for wrd in stem.split()
        if wrd not in stop_words and wrd in question.correct_option
    ))

    # Words that also appear in a distractor do not single out the correct answer
    matching_words_copy = [wrd for wrd in matching_words if not any(wrd in opt for opt in options)]

    #Require a word of 4+ characters, because non-matching verbs/nouns of smaller characters typically are not cues
    if not any(len(wrd) >= 4 for wrd in matching_words_copy):
        return True

    tagged = nltk.pos_tag(matching_words_copy)
    if not any(tag in word_types for _, tag in tagged):
        return True

    if '[SEP]' in question.correct_option:
        correct_parts = question.correct_option.split('[SEP]')
        # Examine every candidate word; the loop used to return after looking at the first one only
        if any(mwc in correct_parts[0] and mwc in correct_parts[1] for mwc in matching_words_copy):
            print('*** SEP')
            return False
        return True

    #There's the potential false positive where all answer choices are repeated in the question's stem
    stem_lower = question.stem.lower()
    if all(opt.lower() in stem_lower for opt in question.options):
        return True

    return False

def strip_punctuation(text):
    """Return ``text`` with every character listed in string.punctuation removed."""
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(drop_punctuation)

#This is now used for Logical Cue
def get_lemma_nouns(text):
    """Return the lower-cased, lemmatized nouns found in ``text``, in order of appearance.

    The text is split into sentences, each sentence is word-tokenized, stop
    words are removed, and the remaining tokens are POS-tagged. Tokens whose
    tag is in the module-level ``nouns`` list are kept, lower-cased and
    lemmatized as nouns with the module-level ``lemmatizer``.
    """
    noun_tokens = []
    for sentence in sent_tokenize(text):
        # Tokenize, then drop stop words before tagging
        kept_tokens = [tok for tok in nltk.word_tokenize(sentence) if tok not in stop_words]

        # Keep only the tokens the POS tagger labels as nouns
        noun_tokens.extend(tok.lower() for tok, tag in nltk.pos_tag(kept_tokens) if tag in nouns)

    return [lemmatizer.lemmatize(noun, pos="n").lower() for noun in noun_tokens]

#Currently not used, was getting poor performance
def word_repeats_verify(question):
    """Ask the LLM whether a key term is repeated only in the stem and the correct answer.

    The correct option(s), separated by '[SEP]' in question.correct_option, are
    removed from a copy of question.options, so the "Options" shown to the
    model are the distractors only. Returns the result of
    llm_rating(sysrole, prompt), called with llm_rating's default threshold.

    list.remove raises ValueError if a correct option is not among the options.
    """
    options = question.options.copy()
    
    # Strip the correct answer(s) out of the copied list
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())
        
    # System message: frames the model as a reviewer looking for stem/answer word repetition
    sysrole = """You are an expert educator evaluating multiple-choice questions to identify whether a unique key word, phrase, or term is shared exclusively between the stem and the correct answer. Such repetition can provide an unintended clue for testwise students. If no exclusive repetition of a unique key word or phrase exists, the question should be considered free of this issue."""
    # User message: 1-10 rubric (10 = no exclusive repetition); the model is told to reply with the number only
    prompt = """
Instructions:
    Respond with a score from 1–10 based on how prominently a unique key word, phrase, or term from the stem is repeated only in the correct answer (not appearing in any other options). Use the following Scoring Guidelines:

    Scoring Guidelines:
        1–3: A clearly noticeable or direct match of a unique key term between the stem and the correct answer makes the correct option stand out strongly.
        4–6: Some repetition of a key term provides a potential clue but is not blatantly revealing.
        7–9: Only minimal or subtle overlap in wording, unlikely to give a significant advantage to a testwise student.
        10: No unique repeated key words, phrases, or terms appear exclusively in the stem and the correct answer.

    Do not lower the score for common or generic terms (e.g., articles, conjunctions, basic verbs) unless they serve as an unmistakable clue.
    
    Only penalize repeated, uniquely identifying words or phrases that are exclusive to the stem and correct answer.
    
    If no exclusive repetition is found, assign a score of 10.

    Your response should only be the number from 1 to 10. Do not include any explanation or additional text.

Question Stem: {}
Options: {}
Answer: {}""".format(question.stem, options, question.correct_option)
    
    return llm_rating(sysrole, prompt)

## Logical Cue - This one is challenging, requires domain knowledge.

In [13]:
#An example of a logical cue is asking students to select the most appropriate pharmaceutical intervention for a problem and only having one or two options which are actually pharmaceutical interventions.
#Using NER, if the question asks for a <certain type of noun, like a <person> then the options should all be <people> too.
def avoid_logical_cues(question):
    """Return False when the options contain a logical cue that singles out an answer.

    Returns True immediately when fewer than two distractors remain after the
    correct option(s) (separated by '[SEP]') are removed.

    Entity check: each lemmatized noun of the distractors is run through the
    spaCy pipeline and the first entity label, if any, is recorded. If a noun
    of the correct answer has an entity label that no distractor noun has, the
    verdict is delegated to logical_cue_verify(question).

    Number check: when the stem contains a number and exactly one distractor
    contains a number, the question is flagged (False).

    Works for any number of distractors; ``lemma_nouns_options`` used to be
    built only for 2, 3 or 4 and was undefined beyond that.

    Raises ValueError (from list.remove) if a correct option is not among
    question.options.
    """
    options = question.options.copy()

    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    if len(options) < 2:
        return True

    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    lemma_nouns_options = [get_lemma_nouns(opt) for opt in options]

    # First entity label of every distractor noun that spaCy can tag
    entities_in_options = []
    for opt_nouns in lemma_nouns_options:
        for val in opt_nouns:
            doc = nlp(val)
            if doc.ents:
                entities_in_options.append(doc.ents[0].label_)

    for val in lemma_nouns_answ:
        doc = nlp(val)

        #If the noun(s) in the answer choice can be tagged with an entity
        if doc.ents:
            answer_entity = doc.ents[0].label_
            if answer_entity not in entities_in_options:
                return logical_cue_verify(question)

    #If the stem has a number and only one distractor has a number
    if len(extract_all_numerical_values(question.stem)) > 0:
        numbers_in_options = sum(1 for opt in options if len(extract_all_numerical_values(opt)) > 0)
        if numbers_in_options == 1:
            return False

    return True

def logical_cue_verify(question):
    """Ask the LLM how likely a testwise student could guess the answer from logical cues.

    Fills the rubric prompt with question.stem, question.options and
    question.correct_option, then returns whatever llm_rating(sysrole, prompt)
    returns (called with llm_rating's default threshold).
    """
    # System message: frames the model as a reviewer looking for unintended hints in stem and options
    sysrole = """You are an expert educator evaluating multiple-choice questions for any unintended hints or logical cues in the stem and answer choices that might allow a testwise student to identify the correct option without true mastery of the content. Your primary objective is to spot clues, patterns, or giveaways that compromise the fairness or validity of the question."""
    # User message: 1-10 rubric (10 = no noticeable cues); the model is told to reply with a single number
    prompt = """
Instructions:
    Respond with a score from 1–10 based on how likely it is that a testwise student could identify the correct answer through logical cues, based on the following Scoring Guidelines:

    Scoring Guidelines:
        1–3: The question or options contain significant or obvious clues, making it easy for a savvy testwise student to guess the correct answer.
        4–6: The question or options have some identifiable cues, but they are less obvious and require closer attention to exploit.
        7–9: The question is generally well-constructed, with only minor or subtle cues that might help a testwise student.
        10: The question is free of any noticeable cues, providing no advantages to a student guessing logically.

    Please do not lower the score for common question-design elements that are unavoidable or do not genuinely compromise fairness.  

    Your response should only be a single number from 1 to 10, with no additional explanation.

Question: {}
Options: {}
Answer: {}""".format(question.stem, question.options, question.correct_option)

    return llm_rating(sysrole, prompt)

## Lost Sequence
If options are numerical, they should go lowest to highest or vice-versa, not a random arrangement

In [14]:
#If answer choices are numeric, sort them, compare to current order
#If all but one are numerical, make sure they are in order and the "word" option is last.

def lost_sequence(question):
    """Return False when numeric options are listed in neither ascending nor descending order.

    Each option is read as a fraction ("3/4") if it contains one, otherwise as
    a number if it contains exactly one numeric value; anything else counts as
    non-numeric. With more than one non-numeric option the check does not
    apply and the question passes. Otherwise the numeric values, in the order
    given, must be sorted ascending or descending.

    A fraction equal to zero ("0/4") is kept as the value 0.0; it used to be
    mistaken for "no fraction found" and dropped from the sequence.
    """
    options = question.options.copy()
    opts = []
    non_numerical_option = 0
    for opt in options:
        #First check for fractions
        fraction = extract_fraction_to_float(opt)
        if fraction is not None:
            opts.append(fraction)
            continue

        val = extract_all_numerical_values(opt)
        if len(val) == 1:
            opts.append(float(val[0].replace(',', '')))
        else:
            non_numerical_option = non_numerical_option + 1

    # A single non-numeric option (e.g. a trailing word option) is tolerated; more than one means
    # this is not a numeric sequence at all
    if non_numerical_option > 1:
        return True

    float_options = [float(x) for x in opts]

    #Numeric options are sorted, or sorted in reverse order, which might make sense for the question
    return float_options == sorted(float_options) or float_options == sorted(float_options, reverse=True)

def extract_all_numerical_values(s):
    """Return every numeric substring of ``s`` as text: optional sign, thousands commas, decimals."""
    pattern = r'-?\d*(?:,\d{3})*\.\d+|-?\d+(?:,\d{3})*'
    return re.findall(pattern, s)

# Regex pattern to match fractions with optional decimal numerator and/or denominator
def extract_fraction_to_float(s):
    """Return the value of the first "a/b" fraction in ``s`` as a float, or None.

    None is returned when ``s`` contains no fraction, and also when the
    denominator is zero (previously a ZeroDivisionError).
    """
    pattern = r'-?\b\d+(\.\d+)?/\d+(\.\d+)?\b'
    match = re.search(pattern, s)
    if not match:
        return None

    numerator, denominator = match.group().split("/")
    if float(denominator) == 0:
        return None
    return float(numerator) / float(denominator)

## More Than One Correct
This updated approach simplifies the task by directing GPT-4o to focus solely on identifying the correct answers. This reduction in complexity likely minimizes cognitive strain on the model, allowing it to perform more effectively and with greater accuracy. By removing the secondary task of evaluating the number of correct answers, the model can concentrate fully on its core strength: understanding and answering the question based on its underlying knowledge and reasoning. - Gilles Chen

In [15]:
format_mtoc ={
    "Stem": "The Question",
    "correct_answers": [
        {
            "A": "The choice"
        },
        {
            "C": "The choice"
        }
    ]
}

def more_than_one_correct(question):
    """Return False when the LLM identifies more than one correct option, True otherwise.

    The stem and lettered options (A, B, C, ...) are sent to the chat model,
    which must answer in the JSON shape shown by ``format_mtoc``. The question
    passes when the returned "correct_answers" list has at most one entry.

    Options are padded with empty strings to at least four entries on a local
    copy, so the caller's question.options is no longer modified. Every option
    is sent, not just the first four.

    API errors are printed and retried after 10 seconds, without a cap.
    Raises json.JSONDecodeError / KeyError if the reply is not in the expected shape.
    """
    # Ensure at least 4 options are shown (A-D), without touching the caller's list
    options = list(question.options)
    options += [""] * (4 - len(options))

    sysrole = """You are an expert and an astute instructor. 
    Given a multiple-choice question and possible answers, determine the correct answers. 
    Your reply must be in the following JSON format: {}""".format(format_mtoc)

    # One "        <letter>: <option>" line per option, matching the prompt layout
    answer_lines = "\n".join(
        "        {}: {}".format(chr(ord('A') + index), option)
        for index, option in enumerate(options)
    )
    prompt = "\n        question: {}\n        \n        answers: \n{}\n    ".format(question.stem, answer_lines)

    # Generate a response, retrying on API errors
    while True:
        try:
            completion = client.chat.completions.create(
              response_format={"type": "json_object"},
              model=model_engine,
              messages=[
                 {"role": "system",
                  "content": sysrole},
                {"role": "user",
                 "content": prompt},
              ],
              max_tokens = 4096,
              temperature = 0.7
             )
            break
        except Exception as error:
            print('errored in LLM API call: ', error)
            time.sleep(10)

    expert_reasoning = json.loads(completion.choices[0].message.content)

    return len(expert_reasoning["correct_answers"]) <= 1

## Complex or K-type
Avoid questions that have a range of correct responses, that ask students to select from a number of possible combinations of the responses

In [16]:
#If the answer options share the same words between one another and there are commas present then it's k type
def complex_k_type(question):
    """Heuristically detect complex (K-type) options, deferring unclear cases to the LLM.

    Args:
        question: object exposing ``stem``, ``options`` (list of str) and
            ``correct_option`` (str; several answers are joined with '[SEP]').

    Returns:
        True when the question passes (not treated as K-type), False when the
        heuristics flag it as K-type, otherwise the result of
        ``complex_k_type_verify(question)``.

    Raises:
        ValueError: if a correct answer is not present in ``question.options``.
    """
    # Distractors only: drop every correct answer from a copy of the options.
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    # A question already failing one of these criteria is not flagged again here.
    if not all_of_the_above(question) or not none_of_the_above(question) or not true_or_false(question):
        return True

    if len(options) < 3:
        return True

    # "Yes, because ..." / "No, because ..." distractors are fine even if they repeat nouns.
    # Match whole words: a substring test would also count "not", "know" or "economy" as "no".
    if all(re.search(r'\b(yes|no)\b', opt.lower()) for opt in options):
        return True

    # Every distractor contains a comma and at least one distractor shares a noun with
    # the answer: likely a list-of-combinations (K-type) question.
    # All distractors are inspected, not only the first three.
    if all(',' in opt for opt in options):
        answer_nouns = set(get_lemma_nouns(question.correct_option))
        if any(answer_nouns.intersection(get_lemma_nouns(opt)) for opt in options):
            return False

    # After removing list notation, distractors made of exactly the same words are
    # reorderings/combinations of the same items.
    word_sets = [set(clean_string(opt).split()) for opt in options]
    if all(words == word_sets[0] for words in word_sets[1:]):
        return False

    return complex_k_type_verify(question)

def clean_string(string):
    """Normalise an option for word comparison.

    Strips surrounding whitespace, then deletes punctuation, standalone lower-case
    roman numerals (i to xx) and standalone capital letters A-F, in that order.
    Whitespace left between the removed pieces is kept.
    """
    substitutions = (
        # punctuation
        r'[^\w\s]',
        # list notation: roman numerals, then option letters
        r'\b(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv|xvi|xvii|xviii|xix|xx)\b',
        r'\b(A|B|C|D|E|F)\b',
    )
    result = string.strip()
    for pattern in substitutions:
        result = re.sub(pattern, '', result)
    return result


def complex_k_type_verify(question):
    """Ask the LLM to rate (1-10) how far the question is from being K-type.

    Builds the system and user messages from the question's stem, options and
    correct option, and returns whatever ``llm_rating`` returns for them.
    """
    system_message = """You are an expert educator and assessment designer specializing in evaluating complex multiple-choice questions. Your primary task is to identify K-Type (Complex) multiple-choice questions. These questions require students to select from a range of combinations of responses, often using phrases like "and," "or," "only," or presenting multiple items and pairs separated by commas, semicolons, periods, or numbers.
    
    K-Type questions are characterized by options that represent different combinations of correct or partially correct answers. Unlike standard multiple-choice questions that have a single correct answer, K-Type questions involve evaluating multiple statements to determine which combination is valid.
    
    Your goal is to identify questions that fit this K-Type structure and distinguish them from traditional single-answer multiple-choice questions."""

    template = """
Instructions:
    Evaluate the multiple-choice question below to determine if it qualifies as a K-Type (Complex) question, where students are required to choose from a two or more combinations of responses, rather than a single correct answer, often using phrases like "and," "or," "only," or presenting multiple items and pairs separated by commas, semicolons, periods, or numbers.
    
    A K-Type question should have options that share different combinations of the same answer, just because an option has a few repeated keywords or several commas, does not mean it is K-Type.
    
    Provide a score from 1 to 10 based on the following Scoring Guidelines:
    
    Scoring Guidelines:
        1-3: The question is clearly a K-Type question, where options represent different combinations of possible responses. These options often use phrases like "and", "or", "only", or lists and pairings separated by commas or semicolons. The question requires students to assess multiple statements and determine which combination is valid.
        4-6: The question shows strong characteristics of a K-Type question, but the combinations may be less explicit or not as complex.
        7-9: The question has some elements of a K-Type structure, but it is closer to a traditional multiple-choice question with a single correct answer.
        10: The question is not a K-Type question. It is a standard multiple-choice question that requires selecting a single correct answer without evaluating combinations of responses.
    
    Your response should be a single number between 1 and 10, without any additional explanation or rationale.

Question: {}
Options: {}
Answer: {}"""
    user_message = template.format(question.stem, question.options, question.correct_option)

    return llm_rating(system_message, user_message)

## Ambiguous or Unclear Information
Questions and all options should be written in clear, unambiguous language

In [17]:
#No longer using the previous models, just a pure LLM-based approach.
def ambiguous_unclear_information(question):
    """Ask the LLM to rate (1-10) how clear and unambiguous the question is.

    Builds the system and user messages from the question's stem, correct option
    and options, and returns whatever ``llm_rating`` returns for them.
    """
    system_message = """You are an expert educator evaluating the clarity of multiple-choice questions (MCQs) for ambiguity or comprehension issues. Your primary goal is to assess whether a well-prepared student would easily grasp the intended meaning of the question and confidently select the correct answer."""

    template = """
Instructions:
    Respond with a score of 1 - 10, where a 1 indicates a high presence of ambiguous or unclear information and 10 indicates an absence of ambiguity and a clear question.

    Scoring Guidelines:
        1–3: The question is severely ambiguous, unclear, or confusing, even for a knowledgeable student.
        4–6: The question has noticeable clarity issues or structural problems that may impede understanding, even when looking at the options.
        7–9: The question is mostly clear and understandable, with only minor or occasional issues that do not significantly hinder comprehension.
        10:  The question is fully clear and understandable, despite any minor imperfections or complexities.

    Focus on major clarity issues that could seriously hinder comprehension.
       - Do not penalize for minor typos, grammatical issues, or stylistic preferences unless they cause confusion.
       - Standard phrasing like "Which of the following..." is acceptable if the options provide enough context.
       - Technical terms or challenging vocabulary are acceptable if they would be understandable to a knowledgeable student.

    If the question depends on the answer options for clarity, this is acceptable as long as the overall meaning is still discernible given the options. In other words, the combination of the stem and options should clearly convey what the question is asking.

    Your response should only be a value in the range of 1 - 10, do not include an explanation or rationale.

Question: {}
Answer: {}
Options: {}"""
    user_message = template.format(question.stem, question.correct_option, question.options)

    return llm_rating(system_message, user_message)

## Gratuitous Information
Avoid unnecessary information in the stem that is not required to answer the question

In [18]:
grat_scores_list = []

def gratuitous_information_in_stem(question):
    """Flag stems with very rich vocabulary; otherwise defer to the LLM check.

    Lexical richness is measured with the corrected type-token ratio (CTTR), see:
    "How effective are lexical richness measures for differentiations of vocabulary
    proficiency? A comprehensive examination with clustering analysis"
    and https://github.com/LSYS/LexicalRichness

    Returns False when the stem's CTTR exceeds 4.5, otherwise the result of
    ``gratuitous_information_in_stem_verify(question)``.
    """
    richness = LexicalRichness(question.stem)
    return False if richness.cttr > 4.5 else gratuitous_information_in_stem_verify(question)


def gratuitous_information_in_stem_verify(question):
    """Ask the LLM to rate (1-10) how free the question is of gratuitous information.

    Builds the system and user messages from the question's stem, options and
    correct option, and returns whatever ``llm_rating`` returns for them.
    """
    system_message = """You are an expert educator evaluating multiple-choice questions for the presence of gratuitous or unnecessary information in the question's text that could confuse students. Your focus is on identifying significant issues where extraneous details might distract or mislead a student, not on minor additional information that doesn't impact understanding."""
    template = """
Instructions:
    Respond with a score of 1 - 10, where 1 means the question contains significant gratuitous information that could confuse students, and 10 means the question is concise and contains only relevant information necessary for understanding and answering the question correctly.

    Scoring Guidelines:
        1–3: The question contains excessive or irrelevant details that significantly distract or confuse students.
        4–6: The question contains noticeable extraneous information that may hamper clarity but does not severely mislead.
        7–9: The question is mostly concise, with minor additional details that do not distract from the main point.
        10: The question is entirely focused on essential information needed to answer correctly, with no unnecessary details.

    Do not assign a low score (1–3 or 4–6) for minor details that are typical in multiple-choice questions and do not affect comprehension.  
    
    Do not assign a low score if the question is scenario-based and the details meaningfully contribute to the scenario.

    Your response should be **only** a single integer from 1–10. Do not include any explanation or rationale.

Question: {}
Options: {}
Answer: {}"""
    user_message = template.format(question.stem, question.options, question.correct_option)

    return llm_rating(system_message, user_message)

## Convergence Cues
Avoid convergence cues in options where there are different combinations of multiple components to the answer

In [19]:
#Check for synonyms, because they'll know it's the word they've most recently come across in the text
#The correct option is likely to be used more (when in pairs, etc.) --> k-type (super similar by description)
conv_scores_list = []

def avoid_convergence_cues(question):
    """Look for answer nouns (or their WordNet synonyms) reused by only some distractors.

    Args:
        question: object exposing ``options`` (list of str) and ``correct_option``
            (str; several answers are joined with '[SEP]').

    Returns:
        True when no partial reuse is found or there are fewer than three
        distractors; otherwise the result of ``avoid_convergence_cues_verify``.

    Raises:
        ValueError: if a correct answer is not present in ``question.options``.
    """
    # Distractors only: drop every correct answer from a copy of the options.
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    if len(options) < 3:
        return True

    # Every WordNet lemma name of every noun found in the correct answer.
    synonyms = set()
    for noun in get_lemma_nouns(question.correct_option):
        for syn in wn.synsets(noun):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().lower().replace('_', ' '))

    # Consider all distractors; indexing options[0..2] ignored any fourth distractor.
    distractor_nouns = [set(get_lemma_nouns(opt)) for opt in options]
    used_somewhere = synonyms & set().union(*distractor_nouns)
    used_everywhere = set.intersection(*distractor_nouns)

    # A synonym that appears in some distractors but not in all of them is a possible
    # convergence cue, so let the LLM take a closer look.
    if used_somewhere - used_everywhere:
        return avoid_convergence_cues_verify(question)

    return True

def avoid_convergence_cues_verify(question):
    """Ask the LLM to rate (1-10) how free the options are of convergence cues.

    Builds the system and user messages from the question's stem, options and
    correct option, and returns whatever ``llm_rating`` returns for them.
    """
    system_message = """You are an expert in educational assessment specializing in detecting convergence cues in multiple-choice questions. Convergence cues are present when the options share overlapping elements, refer to each other, or include combinations of other options (e.g., "A and B," "B or C"). Your goal is to identify problematic convergence cues that are based on logical overlap between options (e.g., subsets, combinations, or dependencies). However, you should not flag distinct, independent numerical values or factual answers (like scientific measurements) as convergence cues unless they are structured to confuse students by implying a relationship between the options."""
    template = """
Instructions:
    Respond with a score of 1 - 10, where a 1 indicates a high presence of convergence cues and 10 indicates an absence of convergence cues.

    Scoring Guidelines:  
        1-3: The options are heavily interrelated, sharing combinations or subsets that could confuse students (e.g., “A and B”, “B and C”, “A or C”).  
        4-6: There is moderate convergence, with some overlap in the options but not enough to significantly confuse students.  
        7-9: Minimal convergence cues; the options are distinct and clearly differentiated.  
        10: No convergence cues detected; each option is unique and stands on its own without reference to other options.

    Your response should only be a value in the range of 1 - 10, do not include an explanation or rationale.
    
Question: {}
Options: {}
Answer: {}"""
    user_message = template.format(question.stem, question.options, question.correct_option)

    return llm_rating(system_message, user_message)

## Grammatical Cues
All options should be grammatically consistent with the stem and should be parallel in style and form

In [20]:
#If verb exists in answer choice, ensure it's the same tense as verb in other options
#We want the stem to be the same, but as long as all the answers are the same, then it's fine, to avoid false positive.
#https://huggingface.co/Unbabel/gec-t5_small
def grammatical_cues_in_stem(question):
    """Flag questions whose distractors use a different verb tense than the answer.

    Args:
        question: object exposing ``options`` (list of str) and ``correct_option``
            (str; several answers are joined with '[SEP]').

    Returns:
        False when a distractor has a definite tense ('past'/'present') that differs
        from the answer's definite tense; otherwise the result of
        ``grammatical_cues_verify(question)``.

    Raises:
        ValueError: if a correct answer is not present in ``question.options``.
    """
    answer_tense = get_verb_tense(question.correct_option)

    # The simplest reliable check is that the answer and the other options share a tense;
    # compare against the distractors only.
    options = question.options.copy()
    for opt in question.correct_option.split('[SEP]'):
        options.remove(opt.strip())

    # Without a definite tense for the answer there is nothing to compare against.
    if answer_tense != 'none':
        for opt in options:
            opt_tense = get_verb_tense(opt)
            # Compare by value: `is not` tests object identity, which only happens to
            # work while both strings are the same interned literal.
            if opt_tense != 'none' and opt_tense != answer_tense:
                return False

    return grammatical_cues_verify(question)

#Longer options might contain verbs of different tenses.
#We want options that specifically have a single tense (past or present) and for it to be consistent with all other options.
def get_verb_tense(text):
    """Return 'past' or 'present' when every verb in ``text`` has that tense, else 'none'.

    Only tokens tagged as VERB are considered. VBP/VBZ count as present, VBD/VBN as
    past, and any other verb tag as 'none'. Mixed tenses, or no verbs at all, give 'none'.
    """
    tense_by_tag = {'VBP': 'present', 'VBZ': 'present', 'VBD': 'past', 'VBN': 'past'}
    tenses_found = {
        tense_by_tag.get(token.tag_, 'none')
        for token in nlp(text)
        if token.pos_ == 'VERB'
    }

    for tense in ('past', 'present'):
        if tenses_found == {tense}:
            return tense
    return 'none'

def grammatical_cues_verify(question):
    """Ask the LLM to rate (1-10) the grammatical consistency of stem and options.

    Builds the system and user messages from the question's stem, options and
    correct option, and returns whatever ``llm_rating`` returns for them.
    """
    system_message = """You are an expert educator evaluating multiple-choice questions for any grammatical cues in the stem and answer choices that could inadvertently reveal the correct answer or mislead students. This includes mismatched tenses, pronoun inconsistencies, lack of parallelism in structure, or other errors that a testwise student might exploit to identify or eliminate certain options without relying on content knowledge."""
    template = """
Instructions:
    Respond with a score from 1–10 based on the overall grammatical consistency and parallelism of the question and its options. Use the following Scoring Guidelines:

    Scoring Guidelines:
        1–3: The question and/or options contain noticeable grammatical or structural inconsistencies that strongly hint at the correct answer.
        4–6: Some grammatical mismatches or parallelism issues exist, but these are not glaring enough to immediately reveal the correct option.
        7–9: The question and options are mostly consistent, with only minor discrepancies that are unlikely to provide a significant clue.
        10: The question and options are grammatically consistent, parallel in style, and contain no identifiable cues that would give an advantage to a testwise student.

    Do not lower the score for trivial wording differences that are typical and do not compromise fairness.
    
    Only penalize clear grammatical or structural errors that could be used to identify the correct option.

    Your response should be only a single integer (1–10). Do not provide any explanations.

Question: {}
Options: {}
Answer: {}"""
    user_message = template.format(question.stem, question.options, question.correct_option)

    # The LLM is very strict on this criterion, so 4 is passed as the threshold instead of 5.
    # Author's note: simply returning True gave better results than verifying, but the
    # LLM could be sided with in these cases.
    return llm_rating(system_message, user_message, 4)

## Vague Terms
Avoid the use of vague terms (e.g. frequently, occasionally) in the options as there is seldom agreement on their actual meaning

In [21]:
#Like the other criteria that use a list of terms, this list can be modified
def vague_terms(question):
    """Check the options and the stem for vague frequency/degree terms.

    Args:
        question: object exposing ``options`` (list of str) and ``stem`` (str).

    Returns:
        bool: True when no vague term occurs (criterion passed), False otherwise.
    """
    vagues = ["frequently", "occasionally", "rarely", "seldom", "sometimes", "usually", "regularly", "periodically", "infrequently", "generally", "nearly", "more or less", "somewhat", "partly"]

    # Match whole words/phrases only: a plain substring test would flag "linearly"
    # (contains "nearly"), "unusually" or "irregularly".
    vague_pattern = re.compile(r'\b(?:' + '|'.join(re.escape(term) for term in vagues) + r')\b')

    # Check the options, then the stem. In the stem these words are occasionally
    # legitimate, but more often than not they indicate a flaw.
    texts = list(question.options) + [question.stem]
    return not any(vague_pattern.search(text.lower()) for text in texts)

## Unfocused Stem
The stem should present a clear and focused question that can be understood and answered without looking at the options

In [22]:
def unfocused_stem(question):
    """Decide whether the stem asks a focused question on its own.

    Returns True when the question already fails the true/false, all-of-the-above,
    none-of-the-above or fill-in-the-blank criterion. Stems containing '?' or ':'
    are passed to ``unfocused_stem_verify``. Any other stem must end in
    '.', ':', '?' or ';', start with a root verb, and contain at least one sentence
    that ``is_question`` accepts; otherwise False is returned.
    """
    passes_related_criteria = (
        true_or_false(question)
        and all_of_the_above(question)
        and none_of_the_above(question)
        and fill_in_the_blank(question)
    )
    if not passes_related_criteria:
        return True

    stem = question.stem

    # Author's note: returning True here instead of asking the LLM technically agreed
    # better with human evaluation.
    if '?' in stem or ":" in stem:
        return unfocused_stem_verify(question)

    # Traits of an unfocused stem (not being a question, etc.)
    if not stem.endswith(('.', ':', '?', ';')) or not check_if_first_word_is_a_verb(stem):
        return False

    return any(is_question(sentence.text.strip()) for sentence in nlp(stem).sents)

def check_if_first_word_is_a_verb(sent):
    """Return True when the first token of ``sent`` is a verb that is also the parse root."""
    first_token = nlp(sent)[0]
    return first_token.pos_ == "VERB" and first_token.dep_ == "ROOT"


#From https://stackoverflow.com/questions/4083060/determine-if-a-sentence-is-an-inquiry
def is_question(sent):
    """Return True when ``sent`` looks like an inquiry.

    A sentence qualifies when its first token is a root verb, or when any token
    carries a WH tag (WDT, WP, WP$, WRB) or is a literal '?'.
    """
    tokens = nlp(sent)
    opener = tokens[0]
    if opener.pos_ == "VERB" and opener.dep_ == "ROOT":
        return True

    wh_tags = ("WDT", "WP", "WP$", "WRB")
    return any(tok.tag_ in wh_tags or tok.text == '?' for tok in tokens)


#Teachers should avoid using MCQs with unfocused stems which do not ask a clear question or state a clear problem in the sentence completion format
#The stem should present a clear and focused question that can be understood and answered without looking at the options
def unfocused_stem_verify(question):
    """Ask the LLM to rate (1-10) how focused and self-contained the stem is.

    Builds the system and user messages from the question's stem, options and
    correct option, and returns whatever ``llm_rating`` returns for them.
    """
    system_message = """You are an expert, but lenient, educator evaluating multiple-choice questions for the presence of an unfocused stem. A question's stem is considered unfocused if it is not a clear query that can be understood and answered on its own. Your task is to assess how clear and self-contained the stem is, without relying on the answer choices for clarity."""
    template = """
Instructions:
    Provide a single numeric score from 1 to 10, reflecting how focused and self-contained the question’s stem is.
    It should ask a clear question or state a clear problem in a sentence completion format.
    Ignore questions with options that are similar to "yes"/"no" or "true"/"false" followed by an explanation.

    Scoring Guidelines:
        1–3: The stem is extremely unfocused or unclear, making it difficult for students to understand what is being asked, even after re-reading.
        4–6: The stem has noticeable clarity issues; students might need to look at the options for guidance or context.
        7–9: The stem is generally clear, with only minor ambiguities; students can understand the question but could benefit from the options for further context.
        10: The stem is entirely clear and focused; students can fully understand and answer the question without looking at the options.
    
    Your response should only be a single number from 1 to 10, with no additional explanation.

Question: {}
Options: {}
Answer: {}"""
    user_message = template.format(question.stem, question.options, question.correct_option)

    # The LLM is very strict on this criterion, so 4 is passed as the threshold instead of 5.
    return llm_rating(system_message, user_message, 4)

# Other Metrics (Perplexity, Diversity, Grammatical Error, Cognitive Complexity)
In addition to the IWF criteria, we calculate these other commonly used metrics to see how they evaluate the questions. Answerability is a fifth metric that is currently left out. Just like BLEU, METEOR, ROUGE, etc., these metrics often do not correlate with human judgements and are not indicators of flawed/bad educational multiple-choice questions the way the IWF criteria are. You can still compute them because it is easy enough, but you should put little faith in them.

### Note, these metrics aren't great indicators, you can read about it here: https://arxiv.org/pdf/2405.20529

## Perplexity 
This assesses a language model's ability to predict question and answer text based on its training data. Lower scores suggest more coherent questions and answers with predictable language patterns, whereas higher scores indicate complexity or atypical text, suggesting the questions could be unclear or poorly structured. <br/>
<b>NOTE</b>: This will be very slow for a large amount of questions, anything greater than 30 questions will take quite some time.

In [23]:
def load_model(model_id):
    """Load the GPT-2 tokenizer and language-model head for ``model_id``.

    Returns:
        tuple: ``(tokenizer, model)``, loaded in that order via ``from_pretrained``.
    """
    return (
        GPT2TokenizerFast.from_pretrained(model_id),
        GPT2LMHeadModel.from_pretrained(model_id),
    )

def compute_perplexity(text, tokenizer, model):
    """Return the perplexity of ``text``: the exponential of the model's loss on it.

    The text is tokenized to PyTorch tensors and scored with its own input ids as
    labels, with gradient tracking disabled.
    """
    encoded = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        mean_loss = model(**encoded, labels=encoded['input_ids']).loss
    return torch.exp(mean_loss).item()

def perplexity(questions, model_id='gpt2-large'):
    """Compute one perplexity value per question row.

    Each row is scored on its stripped stem followed by a space and its non-empty
    options (columns a-e) joined with ', '.

    Returns:
        list: the ``compute_perplexity`` result for every row, in row order.
    """
    tokenizer, model = load_model(model_id)
    option_columns = ['a', 'b', 'c', 'd', 'e']
    scores = []

    for _, record in questions.iterrows():
        stem = record['text'].strip()
        choices = [record[col].strip() for col in option_columns if record[col].strip()]
        passage = stem + ' ' + ', '.join(choices)
        scores.append(compute_perplexity(passage, tokenizer, model))

    return scores

## Diversity
Using Distinct-3, this evaluates the range in vocabulary, structure, and content across generated texts, ensuring a variety of questions and answers and reducing repetition. A higher diversity score indicates greater uniqueness among MCQs, avoiding repetitive phrases and templated patterns. 

In [24]:
def diversity(questions):
    """Compute Distinct-3 per question and print corpus-level diversity figures.

    Args:
        questions: DataFrame with string columns 'text' and 'a'..'d'; an 'e' column
            is used as well when present.

    Returns:
        list: the ``calculate_distinct_3`` score of every row, in row order.
    """
    # Include the optional fifth option, as the other metrics do; sheets without an
    # 'e' column keep working.
    option_columns = ['a', 'b', 'c', 'd'] + (['e'] if 'e' in questions.columns else [])

    predictions = []
    for index, row in questions.iterrows():
        stem = row['text'].strip()
        non_empty_values = [row[col].strip() for col in option_columns if row[col].strip()]
        predictions.append(stem + ' ' + ', '.join(non_empty_values))

    per_question = [calculate_distinct_3(text) for text in predictions]

    # Guard the aggregates so an empty sheet reports 0 instead of raising ZeroDivisionError.
    mean_distinct_3 = sum(per_question) / len(predictions) if predictions else 0
    corpus_diversity = ngram_diversity(predictions) if predictions else 0

    # Note: despite its label, the first figure is the mean Distinct-3 over all questions.
    print('distinct_3_total: ', mean_distinct_3)
    print('ngram_diversity_total: ', corpus_diversity)
    print('length: ', len(predictions))

    return per_question

def calculate_distinct_3(text):
    """Return Distinct-3 for ``text``: unique trigrams divided by total trigrams.

    The text is split with ``word_tokenize``; texts with no trigram score 0.
    """
    grams = list(trigrams(word_tokenize(text)))
    if len(grams) > 0:
        return len(set(grams)) / len(grams)
    return 0

def ngram_diversity(options, n=3):
    """Return the share of unique n-grams among all n-grams of the given texts.

    Args:
        options: iterable of strings; each is split with ``word_tokenize``.
        n: n-gram size (default 3).

    Returns:
        Unique n-grams divided by total n-grams, or 0 when the texts yield no
        n-gram at all (empty input or only texts shorter than ``n`` tokens).
    """
    all_ngrams = [gram for option in options for gram in ngrams(word_tokenize(option), n)]
    # Mirror calculate_distinct_3: no n-grams means a score of 0, not a ZeroDivisionError.
    if not all_ngrams:
        return 0
    return len(set(all_ngrams)) / len(all_ngrams)

## Grammatical Error
This uses a Python wrapper for https://languagetool.org/. We currently use the free public API endpoint, so excessive use might get our IP blocked. Grammatical errors pinpoint grammar violations, such as incorrect verb tense or spelling, and are quantified for each MCQ.

In [25]:
def check_grammar(text):
    """Check ``text`` with the public LanguageTool API (en-US).

    Returns:
        tuple: ``(number_of_matches, matches)`` as reported by the tool.
    """
    issues = language_tool_python.LanguageToolPublicAPI('en-US').check(text)
    return len(issues), issues

predictions = []
ques = {}

def grammatical_error(questions):
    """Count the grammar issues reported for each question stem.

    Args:
        questions: DataFrame with a string column 'text' holding the stems.

    Returns:
        list: number of issues ``check_grammar`` reports per row, in row order.
    """
    errorsList = [check_grammar(row['text'].strip())[0] for index, row in questions.iterrows()]

    # Despite its label, the first figure is the mean number of errors per question.
    # An empty sheet reports 0 instead of raising ZeroDivisionError.
    mean_errors = sum(errorsList) / len(questions) if len(questions) else 0
    print('total_errors: ', mean_errors)
    print('length: ', len(questions))
    print('length of errors: ', len(errorsList))

    return errorsList

## Cognitive Complexity
This is measured by Bloom's Taxonomy. Some research has instead measured it by the "difficulty" of the question, which an LLM can assess, but Bloom's is a better fit. Additionally, this might be redundant since the Bloom's label is included in the question's construction.

### Note: this is only set up for MCQs with 4 options at the moment

In [26]:
def cognitive_complexity(questions):
    """Ask the LLM for the Bloom's Revised Taxonomy level of every question.

    Args:
        questions: DataFrame with string columns 'text' and 'a'..'e'.

    Returns:
        list[str]: the lower-cased LLM reply for each row, in row order. A row whose
        reply cannot be read gets '' so the list stays aligned with the rows.
    """
    sysrole = "You are an expert in pedagogy and an astute instructor here to classify a multiple-choice questions provided to you with one of the six level's of Bloom's Revised Taxonomy"
    prompt = """Given the multiple-choice question below, please respond with that level of Bloom's Revised Taxonomy it falls into and nothing else.
        {}
        """

    bloom_labels = []
    for index, row in questions.iterrows():
        stem = row['text'].strip()
        non_empty_values = [row[col].strip() for col in ['a', 'b', 'c', 'd', 'e'] if row[col].strip()]
        p = prompt.format(stem + '\n' + '\n'.join(non_empty_values))

        # Keep retrying until the API call succeeds. The call goes through the shared
        # `client`; the bare name `openai` is not defined in this notebook, so using it
        # raised NameError inside this loop and retried forever.
        while True:
            try:
                completion = client.chat.completions.create(
                    model=model_engine,
                    messages=[
                        {"role": "system", "content": sysrole},
                        {"role": "user", "content": p},
                    ],
                    max_tokens=4096,
                    temperature=0.7,
                )
                break
            except Exception as error:
                print('errored in LLM API call: ', error)
                time.sleep(10)

        try:
            label = completion.choices[0].message.content.lower()
        except (AttributeError, IndexError, TypeError):
            # Record a placeholder rather than dropping the row, otherwise every
            # later label would be attributed to the wrong question.
            print('error with LLM: ', completion)
            label = ''
        bloom_labels.append(label)

    return bloom_labels

# Formatting Your CSV of MCQs (it's fine if e is blank)

| id | text | answer | a | b | c | d | e |
|----|------|--------|---|---|---|---|---|
| Data 1  | Data 2  | Data 3  | Data 4  | Data 5  | Data 6  | Data 7  | Data 8  |
| Data 9  | Data 10 | Data 11 | Data 12 | Data 13 | Data 14 | Data 15 | Data 16 |
 |


### id: A unique number
### text: The question's stem
### answer: The text of the correct response, this should match the text in one of the a/b/c/d columns
### a-e: The text for the corresponding option

# 19 Item-Writing Flaws Criteria - Running the code for MCQs with 5 options

In [None]:
# Names of the item-writing-flaw criteria to run; each is looked up in globals()
# and called with one question.
all_criteria = [
    'ambiguous_unclear_information',
    'implausible_distractors',
    'none_of_the_above',
    'longest_answer_correct',
    'gratuitous_information_in_stem',
    'true_or_false',
    'avoid_convergence_cues',
    'avoid_logical_cues',
    'all_of_the_above',
    'fill_in_the_blank',
    'absolute_terms',
    'word_repeats_in_stem_and_correct_answer',
    'unfocused_stem',
    'complex_k_type',
    'grammatical_cues_in_stem',
    'lost_sequence',
    'vague_terms',
    'more_than_one_correct',
    'negative_worded_stem'
]

# List all your CSV file paths here
files = [
    'yourpath/YourFile.csv',
    # 'yourpath/MoreFilesIfNeeded.csv',
]

# Columns that are stripped as strings below. They are read as text so that
# numeric-looking cells (e.g. an option "42") do not arrive as numbers, which
# have no .strip().
text_columns = ['text', 'answer', 'a', 'b', 'c', 'd', 'e']


def pad_row(row, length=8, pad_value=''):
    """Right-pad ``row`` with ``pad_value`` so that it has at least ``length`` entries."""
    return row + [pad_value] * (length - len(row))


# Loop over each file individually
for file in files:
    print(f'Processing file: {file}')
    # Fresh data structures for each file
    all_data = {}
    all_questions = []
    qids = {}

    # Read the CSV file
    data = pd.read_csv(file, dtype={col: str for col in text_columns})
    data = data.fillna('')

    questions = []
    for index, row in data.iterrows():
        question = MultipleChoiceQuestion(
            stem=row['text'],
            options=[row[col].strip() for col in ['a', 'b', 'c', 'd', 'e'] if row[col].strip()],
            correct_option=row['answer'].strip(),
            qid=row['id'],
            quality=0
        )
        # Skip repeated ids completely. Evaluating them while leaving them out of the
        # question table made the result columns longer than the table, so the
        # side-by-side concat below paired results with the wrong questions.
        if question.qid in qids:
            continue
        qids[question.qid] = 1
        all_questions.append([question.qid, question.stem, question.correct_option] + question.options)
        questions.append(question)

    # Apply each criterion to the questions: 0 = criterion passed, 1 = flaw detected
    for criteria in all_criteria:
        print(f'Applying criteria: {criteria}')
        criterion = globals()[criteria]
        all_data[criteria] = [0 if criterion(q) else 1 for q in questions]

    # Pad every row to id + text + answer + five option columns
    padded_questions = [pad_row(row) for row in all_questions]

    qdf = pd.DataFrame(padded_questions, columns=['id', 'text', 'answer', 'a', 'b', 'c', 'd', 'e'])
    df = pd.DataFrame(all_data)
    combined_df = pd.concat([qdf, df], axis=1)

    # Write the results next to the input file. `file` already contains its directory,
    # so prefixing 'yourpath/' again produced 'yourpath/yourpath/...'.
    base_name = file[:-len('.csv')] if file.lower().endswith('.csv') else file
    output_filename = base_name + '_IWFresults.csv'
    combined_df.to_csv(output_filename, index=False)
    print(f'Saved output to: {output_filename}')

# Other Metrics (Perplexity, Diversity, Grammatical Error, Cognitive Complexity) - Running the code

In [None]:
# This cell is opt-in: the metrics are slow and call external services (the public
# LanguageTool API and the LLM API). Set the flag to True to run it. The flag replaces
# a deliberate syntax error that made the whole cell unrunnable.
RUN_OTHER_METRICS = False

## Calc the other metrics
other_metrics = ['perplexity',
                 'diversity',
                 'grammatical_error',
                 'cognitive_complexity']

files = ['CSV_OTHER.csv']

if RUN_OTHER_METRICS:
    # Read every file once and evaluate all rows together. Previously each file was
    # re-read for every metric and each file's result overwrote the previous one.
    # The text columns are read as strings because the metric functions call .strip().
    text_columns = ['text', 'answer', 'a', 'b', 'c', 'd', 'e']
    frames = [
        pd.read_csv(file, dtype={col: str for col in text_columns}).fillna('')
        for file in files
    ]
    combined_data = pd.concat(frames, ignore_index=True)

    metric_data = {}
    for metric in other_metrics:
        print('----- ', metric, ' ----- ', ', '.join(files))
        metric_data[metric] = globals()[metric](combined_data)

    df = pd.DataFrame(metric_data)
    df.to_csv('RESULTS_OTHER.csv', index=False)
else:
    print('Skipping the other metrics; set RUN_OTHER_METRICS = True to compute them.')