In [353]:
import spacy
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from lexicalrichness import LexicalRichness
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import math
import random

#synonym testing
import numpy as np
nlp = spacy.load('en_core_web_lg')
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize, sent_tokenize
import openai
openai.api_key = 'REPLACE_WITH_YOUR_KEY'
model_engine = "text-davinci-003"

In [300]:
nl = '\n'
class MultipleChoiceQuestion:
    def __init__(self, stem, options, correct_option, qid = None, courseid = None, quality = None):
        self.stem = stem
        self.options = options
        self.correct_option = correct_option
        self.qid = qid
        self.courseid = courseid
        self.quality = quality
        
    def __str__(self):
        return f"Question: {self.stem}\n {nl.join(self.options)}\nCorrect option: {self.correct_option}\nQuality: {self.quality}"

In [378]:
#https://huggingface.co/cointegrated/roberta-large-cola-krishna2020
#Ranking Distractors for Multiple Choice Questions Using Multichannel Semantically Informed CNN-LSTM Networks
cola = pipeline('text-classification', model='cointegrated/roberta-large-cola-krishna2020',truncation=True)

def ambiguous_unclear_information(question):
    output = cola(question.stem)
    score = output[0]['score']
    if score >= 0.7:
        return True
    else:
        print('--- Question stem is unclear')
        return False

In [336]:
#semantic specificity and semantic homogeneity 
#semantic relatedness of answer and other options
#model_gigaword = api.load("glove-wiki-gigaword-100")
#model_gigaword.most_similar(positive=['dirty','grimy'],topn=10)
#v_apple = model_gigaword['Paris']
#v_mango = model_gigaword['London']
#cosine_similarity([v_apple],[v_mango])

#https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
#Distractor Quality Evaluation in Multiple Choice Questions
#Ranking Distractors for Multiple Choice Questions Using Multichannel Semantically Informed CNN-LSTM Networks
model = SentenceTransformer('all-MiniLM-L6-v2')


#Uses NER, so if the score is too low, if they're matching entities (i.e. people) then we can ignore this case and say True
#For example, these two are similar, they're just both entities not recognized. 
#--- distractor not similar enough
#Millikan 		 Rutherford 		 Score: 0.1707
def implausible_distractors(question):
    correct = question.correct_option
    options = question.options.copy()
    options.remove(correct)

    # Two lists of sentences
    sentences1 = [correct, correct, correct]
    sentences2 = options

    #Compute embedding for both lists
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    #Output the pairs with their score
    for i in range(len(sentences1)):
        if cosine_scores[i][i] < 0.15: #Was .2
            
            #NER check here...
            opt_entity = nlp(sentences2[i])
            lemma_nouns_opt = get_lemma_nouns(sentences2[i])
            print('opt_entity: ', opt_entity)
            print('lemma_nouns_opt: ', lemma_nouns_opt)
            
            ans_entity = nlp(sentences1[i])
            lemma_nouns_ans = get_lemma_nouns(sentences1[i])
            print('ans_entity: ', ans_entity)
            print('lemma_nouns_ans: ', lemma_nouns_ans)
            

            #If the noun(s) in the answer choice can be tagged with an entity
            if ans_entity.ents:
                answer_entity = ans_entity.ents[0].label_
            else:
                answer_entity = None

            if opt_entity.ents:
                opt_entity = opt_entity.ents[0].label_
            else:
                opt_entity = None

            if answer_entity and opt_entity and answer_entity in opt_entity:
                print('*** low score, but they are the same entity', answer_entity, ' & ', opt_entity)
                return True
            
            if len(lemma_nouns_ans) == 0 and len(lemma_nouns_opt) == 0:
                #Couldn't find the noun nor the entity? Unable to parse effectively to make a judgement.
                print('*** no noun or entity, cant judge')
                return True
            
            #If the option in this case is none/all of the above, it won't be similar, so ignore this criteria
            if not all_of_the_above(question) or not none_of_the_above(question):
                return True
            
            print('--- distractor not similar enough')
            print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
            return False
        
    return True

In [303]:
def none_of_the_above(question):
    print(question)
    for opt in question.options:

        cleaned_opt = opt.strip().lower()
        if 'none of the above' in cleaned_opt or ('none' in cleaned_opt and 'above' in cleaned_opt) or cleaned_opt == 'neither' or cleaned_opt == "none" or 'none' in question.options[3]:
            print('--- none of the above')
            return False
    
    return True

In [304]:
#If the correct answer is noticably longer (20% or more) than the second longest answer, flag it.
def longest_answer_correct(question):
    correct = question.correct_option
    options = question.options.copy()
    options.remove(correct)
    
    longest_option = 0
    for opt in options:
        if len(opt) >= longest_option:
            longest_option = len(opt)
        
    #If the longest option is only by 20% or it's a single word/number, then this passes
    if longest_option >= len(correct) * 0.8 or len(correct.split()) == 1:
        return True
    
    print('--- longest option is correct')
    return False

In [305]:
def gratuitous_information_in_stem(question):  
    #How effective are lexical richness measures for differentiations of vocabulary proficiency? A comprehensive examination with clustering analysis
    #https://github.com/LSYS/LexicalRichness
    #If the lexical richness is too wild...it's gratutious 
    stem = LexicalRichness(question.stem)
    
    if stem.cttr > 4.5:
        print("--- CTTR above 4.5, text is too complex and extraneous: ", stem.cttr)
        return False
    
    return True

  

In [306]:
s = "This is a test. Here is some extraneous information. Which of these is not true?"
for sent in s.split('.'):
    sent = sent.lower()
    print('sent: ' , sent)
    if 'which' in sent and 'not' in sent:
        print('negative word')

sent:  this is a test
sent:   here is some extraneous information
sent:   which of these is not true?
negative word


In [307]:
#Question should not be a series of true/false statements, so we can look for "which" and "true" or "false" in the stem
def true_or_false(question):
    correct = question.correct_option
    options = question.options.copy()
    options.remove(correct)
    
    #Check for neatively worded stem too.
    for sent in question.stem.split('.'):
        sent = sent.lower()
        if 'which' in sent and ('false' in sent or 'true' in sent):
            return False    
    
    for opt in options:
        if opt.strip().lower() == 'true' or opt.strip().lower() == 'false' or opt.strip().lower() == 'yes'or opt.strip().lower() == 'no':
            print('--- There is a true/false or yes/no answer choice')
            return False

    return True

In [308]:
def avoid_convergence_cues(question):
    #Check for synonyms, because they'll know it's the word they've most recently come across in the text
    #The correct option is likely to be used more (when in pairs, etc.) --> k-type (super similar by description)
    #https://www.rcpch.ac.uk/sites/default/files/rcpch/HTWQ/convergence.html
    
    #So here we check for synonyms used in the words, in case they get lazy with distractors
    options = question.options.copy()
    options.remove(question.correct_option)
    if len(options) < 3:
        return True
    
    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    lemma_nouns_options = [get_lemma_nouns(options[0]), get_lemma_nouns(options[1]), get_lemma_nouns(options[2])]
    
    #so we want this code, but like, for synonyms
    synonyms = []
    for noun in lemma_nouns_answ:
        for syn in wn.synsets(noun):
            for l in syn.lemmas():
                synonyms.append(l.name().lower().replace('_', ' '))
    
    for opt in lemma_nouns_options:
        repeating_nouns_synonyms = list(set(synonyms).intersection(opt))
        print("repeating_nouns_synonyms: ", repeating_nouns_synonyms)
        if len(repeating_nouns_synonyms) > 0:           
            
            #if the repeat is not in every answer choice, flag it.
            for rns in repeating_nouns_synonyms:           
                
                flag = True
                for value in lemma_nouns_options:
                    if rns not in value:
                        print('--- we have a synonym of the answer being used in other answer choices, but not all of them: ', rns)
                        return False
    
    return True

In [309]:
#An example of a logical cue is asking students to select the most appropriate pharmaceutical intervention for a problem and
#only having one or two options which

def avoid_logical_cues(question):
    #NER? If the question asks for a <certain type of noun, like a person> then the options should all be <people> too.
    #so do NER on the answer and see if we get those in the other options
    
    options = question.options.copy()
    options.remove(question.correct_option)
    if len(options) < 3:
        return True
    
    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    lemma_nouns_options = [get_lemma_nouns(options[0]), get_lemma_nouns(options[1]), get_lemma_nouns(options[2])]
    
    entities_in_options = []
    for opt in lemma_nouns_options:
        for val in opt:
            doc = nlp(val)
            if doc.ents:
                entities_in_options.append(doc.ents[0].label_)
    
    entities_in_answer = []
  
    for val in lemma_nouns_answ:
        doc = nlp(val)
        
        #If the noun(s) in the answer choice can be tagged with an entity
        if doc.ents:
            answer_entity = doc.ents[0].label_
            if answer_entity not in entities_in_options:
                print('--- The answer entity is not found in any other options: ', answer_entity)
                return False 
    
    return True

In [310]:
def all_of_the_above(question):
    for opt in question.options:
        if 'all of the above' in opt or ('all' in opt and 'above' in opt) or ('all if the' in opt):
            print('--- all of the above')
            return False

    return True

In [311]:
#Might change to be more than a single underscore? For example, __
def fill_in_the_blank(question):
    if "_" in question.stem:
        print('--- fill in the blank')
        return False
    
    return True

In [312]:
#This one might need POS tagging too, because an absolute term isn't a deal breaker in some cases
# i.e. "Why does Hydrogen never bond with...?" 
def absolute_terms(question):
    absolutes = ["always", "never", "every", "none", "only"]
    for opt in question.options:
        #Count all, but not in the case of "all of the above"
        if any(word in opt for word in absolutes) or ("all" in opt and all_of_the_above(question)):
            print('--- absolute word in question stem')
            return False
    
    return True

In [313]:
#Find the nouns in question.correct_option and question.stem --> stem them --> compare cosine similiary (usin sentence transformer)
#Also check for the synonyms, compare them. However, if the word(s) are used in the other options, then it's fine.
#PoS tagging -> lemma -> Synonyms -> Cosine similarity 
#Technically I could check synonyms between stem and other answer choices, but I wont.

lemmatizer = WordNetLemmatizer()
#Nouns: NN noun, singular ‘- desk’, NNS noun plural – ‘desks’, NNP proper noun, singular – ‘Harrison’, NNPS proper noun, plural – ‘Americans’ 
nouns = ['NN', 'NNS', 'NNP', 'NNPS']

def word_repeats_in_stem_and_correct_answer(question):   
    lemma_nouns_stem = get_lemma_nouns(question.stem)        
    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    
    repeating_nouns = list(set(lemma_nouns_stem).intersection(lemma_nouns_answ))
    
    #check for synonms in question stem w/ answer choice
    synonyms = []
    for noun in lemma_nouns_stem:
        for syn in wn.synsets(noun):
            for l in syn.lemmas():
                synonyms.append(l.name().lower().replace('_', ' '))
                
    repeating_nouns_synonyms = list(set(synonyms).intersection(lemma_nouns_answ))
    
    #If we get a repeat, then it should also repeat in the other answer choices, not just the correct!
    if len(repeating_nouns) > 0 or len(repeating_nouns_synonyms) > 0:
        options = question.options.copy()
        options.remove(question.correct_option)
        options_that_share_noun = 0
        
        for opt in options:
            lemma_option = get_lemma_nouns(opt)
            repeating_nouns_ans_opt = list(set(lemma_option).intersection(lemma_nouns_answ))
            if len(repeating_nouns_ans_opt) > 0:
                options_that_share_noun += 1
        
        #If the word is shared between all options, then it is fine
        if options_that_share_noun == 3 or all_of_the_above(question) or none_of_the_above(question):
            return True
        else:
            print('--- The noun is only shared in certain words')
            return False
        
    #Check for word (adjective, noun, verb, adverb) that repeats just in stem and answer    
    word_types = ['NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'VB', 'VBG', 'VBN', 'VBP', 'VBZ']
    stem_token = sent_tokenize(question.stem)
    stem_words = []
    for i in stem_token:
        wordsList = nltk.word_tokenize(i)
        wordsList = [w for w in wordsList if not w in stop_words]
        tagged = nltk.pos_tag(wordsList)
        for t in tagged:
            if t[1] in word_types:
                stem_words.append(t[0].lower())
                
    ans_token = sent_tokenize(question.correct_option)
    ans_words = []
    for i in ans_token:
        wordsList = nltk.word_tokenize(i)
        wordsList = [w for w in wordsList if not w in stop_words]
        tagged = nltk.pos_tag(wordsList)
        for t in tagged:
            if t[1] in word_types:
                ans_words.append(t[0].lower())
    
    if any(x in stem_words for x in ans_words):
        return False

    
    return True

def get_lemma_nouns(text):
    all_nouns = []
    tokenized = sent_tokenize(text)
    
    for i in tokenized:

        # Word tokenizers is used to find the words and punctuation in a string
        wordsList = nltk.word_tokenize(i)

        # removing stop words from wordList
        wordsList = [w for w in wordsList if not w in stop_words]

        #  Using a Tagger. Which is part-of-speech tagger or POS-tagger.
        tagged = nltk.pos_tag(wordsList)
        
        # Add any nouns to this list
        for t in tagged:
            if t[1] in nouns:
                all_nouns.append(t[0].lower())
    
    lemmatized_nouns = []
    for n in all_nouns:
        lemmatized_word = lemmatizer.lemmatize(n, pos="n")
        lemmatized_nouns.append(lemmatized_word.lower())
    
    return lemmatized_nouns

In [314]:
#Readability score (perhaps BLEU and shit, maybe the BERT-based one?)
def unfocused_stem(question):
    contains_question = False
    doc = nlp(question.stem)
    for sent in doc.sents:
        if is_question(sent.text.strip()):
            contains_question = True
            
    if not contains_question:
        print("--- Question stem does not contain a question")
        
    return contains_question

#https://stackoverflow.com/questions/72548388/is-there-a-machine-learning-or-nlp-model-to-separate-questions-and-answers-in-ra
#https://stackoverflow.com/questions/4083060/determine-if-a-sentence-is-an-inquiry
def is_question(sent):
    d = nlp(sent)
    token = d[0] # gets the first token in a sentence
    if token.pos_ == "VERB" and token.dep_ == "ROOT": # checks if the first token is a verb and root or not
        return True
    for token in d: # loops through the sentence and checks for WH tokens
        if token.tag_ == "WDT" or token.tag_ == "WP" or token.tag_ == "WP$" or token.tag_ == "WRB" or token.text == '?':
            return True
    return  False

In [315]:
def clean_string(string):
    # remove whitespace
    cleaned_string = string.strip()
    
    # remove punctuation
    cleaned_string = re.sub(r'[^\w\s]', '', cleaned_string)
    
    # remove list notation
    cleaned_string = re.sub(r'\b(i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv|xvi|xvii|xviii|xix|xx)\b', '', cleaned_string)
    cleaned_string = re.sub(r'\b(A|B|C|D|E|F)\b', '', cleaned_string)
    print('cleaned_string: ', cleaned_string)
    return cleaned_string

#Check for commas in the answer choices? Repeated words between answer choices?
#Also similar to the convergence cue one
#Now it does a good job picking up on teh 1. or ii/iii/iv or (A) (B) (C) style questions, how can we do that?
def complex_k_type(question):
    #if the answer options share the same words between one another 
    #and there are commas present
    #then it's k type
    #also the stem says "Select all that apply" or "Select all" in general?
    
    options = question.options.copy()
    options.remove(question.correct_option)
    if len(options) < 3:
        return True 
    
    #check if the options contain a comma
    contain_a_comma = 0
    for opt in options:
        if ',' in opt:
            contain_a_comma += 1
    contain_a_comma = contain_a_comma == len(options)
    
    lemma_nouns_answ = get_lemma_nouns(question.correct_option)
    lemma_nouns_options = [get_lemma_nouns(options[0]), get_lemma_nouns(options[1]), get_lemma_nouns(options[2])]
    
    options_that_share_noun = 0
    for lno in lemma_nouns_options:   
        repeating_nouns = list(set(lno).intersection(lemma_nouns_answ))
        if (len(repeating_nouns) > 0) and (len(lno) > 0):
            options_that_share_noun += 1
    
    #Options share a key word, there are multiple nouns in the options, and they have a comma
    #suggesting it might be a k-type question
    if options_that_share_noun > 0 and contain_a_comma:
        print("--- This might be a K-type question")
        return False
    
    #After removing any list notation in the answer choices, see if they contain the same words
    cleaned_options = []
    for opt in options:
        cleaned_options.append(clean_string(opt))

    options_set_list = [set(i.split()) for i in cleaned_options]
    if options_set_list[0] == options_set_list[0] and options_set_list[0] == options_set_list[1] and options_set_list[0] == options_set_list[2]:
        return False
    
    return True

In [316]:
#If verb exists in answer choice, ensure it's the same tense as verb in other options
# we want the stem to be the same, but as long as all the answers are the same, then it's fine, to avoid false positive

# (Done in logical cue) NER, if the answer choice contains a specific type of noun, the other options should likely contain that
#VB  --  verb, base form
# VBD  --  verb, past tense
# VBG  --  verb, gerund or present participle
# VBN  --  verb, past participle
# VBP  --  verb, non-3rd person singular present
# VBZ  --  verb, 3rd person singular present

def grammatical_cues_in_stem(question):
    stem_tense = get_verb_tense(question.stem)
    answer_tense = get_verb_tense(question.correct_option)
    
    options = question.options.copy()
    options.remove(question.correct_option)
    for opt in options:
        opt_tense = get_verb_tense(opt)
        if opt_tense != 'none' and answer_tense is not opt_tense:
            print("--- verb tense doesn't align between answer other options")
            return False
        
    return True
    
def get_verb_tense(text):
    doc = nlp(text)
    for token in doc:
        if token.pos_ == 'VERB':
            if token.tag_ in ['VBP', 'VBZ']:
                return 'present'
            elif token.tag_ in ['VBD', 'VBN']:
                return 'past'
            else:
                return 'other'
    return 'none'

In [317]:
#If answer choices are numeric, sort them, compare to current order
#Otherwise how the hell do we know this one..? No need to alphabetize imo (unless it’s like one word)
def lost_sequence(question):
    options = question.options.copy()
    
    opts = []
    for opt in options:
        opt = re.sub(r'[$%°FC,]', '', opt)
        opts.append(opt)
        try:
            float(opt)
        except ValueError:
            return True
        
    float_options = [float(x) for x in opts]    
    sorted_options = sorted(float_options)
    if sorted_options == float_options:
        #Numeric options are sorted
        return True
    else:
        print('--- Options are numeric and not sorted')
        return False

In [318]:
#This one might need POS tagging too, because an absolute term isn't a deal breaker in some cases
# i.e. "Why does Hydrogen never bond with...?" 
def vague_terms(question):
    vagues = ["often", "sometimes", "rarely", "typically", "usually", "normally", "generally", "nearly", "approximately", "more or less", "somewhat"]
    for opt in question.options:
        if any(word in opt for word in vagues):
            print('--- vague word in question stem')
            return False
    
    return True

In [319]:
#QA model confirm correct answer is correct → (make the QA model do it’s best guess, 
#but maybe use like gpt-3 AND some QA model?)
#I think we can say “all answers are correct, the QA Model was great, but in the future something more robust 
#like ChatGPT should be utilized? As we input all the questions and they were right.
completion = None
def more_than_one_correct(question):
    if not question.options[2]:
        question.options[2] = 'None'
        
    if not question.options[3]:
        question.options[3] = 'None'
    
    # Define the prompt
    prompt = """
    Answer the multiple-choice question below by responding with A, B, C, or D.
    
    {}

    A) {}
    B) {}
    C) {}
    D) {}
    """.format(question.stem, question.options[0], question.options[1], question.options[2], question.options[3])

    # Generate a response
    completion = openai.Completion.create(engine=model_engine, prompt=prompt, max_tokens=1024, n=1,stop=None,temperature=0.5)

    # Print the response
    try:
        #cleaned_response = completion.choices[0].text.split(')')[1].strip()
        cleaned_response = completion.choices[0].text.split(')')[0].strip()
        if (cleaned_response == 'A'):
            cleaned_response = question.options[0]
        elif (cleaned_response == 'B'):
            cleaned_response = question.options[1]
        elif (cleaned_response == 'C'):
            cleaned_response = question.options[2]
        elif (cleaned_response == 'D'):
            cleaned_response = question.options[3]
    except: 
        print('error in GPT3: ', completion)
    
    if cleaned_response == question.correct_option:
        return True
    else:
        print('--- GPT-3 believes the answer is incorrect: ', cleaned_response, ' ', question.correct_option)
        return False

In [320]:
#This one might need POS tagging too, because an negative term isn't a deal breaker in some cases
# i.e. "If there is no oxygen present when the chemical reacts, then what does...?" 
#Sentiment analysis for this? nlp determine if something is negatively worded
#Now it checks for "not" and "incorrect" and "false" paired with "Which" in the same sentence? Perhaps split by period
def negative_worded_stem(question):
    negatives = ["no", "none", "never", "without", "exclude", "avoid", "deny", "refuse", "oppose", "dispute"]
    for opt in question.stem:
        if any(word in opt for word in negatives):
            print('--- absolute word in question stem')
            return False
    
    #Check for neatively worded stem too.
    for sent in question.stem.split('.'):
        sent = sent.lower()
        #What is common too, but might have too many falses?
        if 'which' in sent and ('false' in sent or 'not' in sent or 'incorrect' in sent or 'except' in sent) or \
        'what' in sent and ('false' in sent or 'not' in sent or 'incorrect' in sent or 'except' in sent):
            return False    
    
    
    return True

## Peerwise Biochemistry Data - 400 Questions

In [408]:
qs = pd.read_csv('peerwise_questions.csv')
# filtered_df = qs.loc[(qs['avg_rating'] > 3.49) & (qs['total_ratings'] > 9) & (qs['numAlts'] == 4)]
# filtered_df = filtered_df[~filtered_df['question'].str.casefold().str.contains("img")]
columns_to_clean = ['question', 'altA', 'altB', 'altC', 'altD']

def clean_peerwise_data(cdf):
    for cName in columns_to_clean:
        cdf[cName] = cdf[cName].str.replace(r'<[^<>]*>', ' ', regex=True)
        cdf[cName] = cdf[cName].apply(lambda x: re.sub("&nbsp;",  "", x))
        cdf[cName] = cdf[cName].str.replace(' +', ' ', regex=True)
        cdf[cName] = cdf[cName].str.strip()

    peerwise_student_questions = []
    for index, row in cdf.iterrows():
        question = MultipleChoiceQuestion(
            stem=row['question'],
            options=[row['altA'], row['altB'], row['altC'], row['altD']],
            correct_option= row['alt' + row['answer'].strip()],
            qid = row['total_response'],
            courseid = row['avg_difficulty'],
            quality = row['avg_rating']
        )
        if not question.stem or not question.options[0] or not question.options[1] or not question.options[2] or not question.options[3]:
            print('removing empty option question')
        else:
            peerwise_student_questions.append(question)
    
    selected_items = random.sample(peerwise_student_questions, 400)
    return selected_items


# # biochem_lowest_df = qs.loc[(qs['avg_rating'] < 2.0) & (qs['avg_rating'] > 1.49) & (qs['total_ratings'] > 9) & (qs['numAlts'] == 4)]
# # biochem_lowest = clean_peerwise_data(biochem_lowest_df)

# # biochem_low_df = qs.loc[(qs['avg_rating'] < 2.5) & (qs['avg_rating'] > 1.99) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# # biochem_low = clean_peerwise_data(biochem_low_df)

# # biochem_mid_df = qs.loc[(qs['avg_rating'] < 3.0) & (qs['avg_rating'] > 2.49) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# # biochem_mid = clean_peerwise_data(biochem_mid_df)

# # biochem_high_df = qs.loc[(qs['avg_rating'] < 3.5) & (qs['avg_rating'] > 2.99) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# # biochem_high = clean_peerwise_data(biochem_high_df)

# # biochem_highest_df = qs.loc[(qs['avg_rating'] > 3.49) & (qs['total_ratings'] > 6) & (qs['numAlts'] == 4)]
# # biochem_highest = clean_peerwise_data(biochem_highest_df)

# # peerwise_student_questions = biochem_lowest + biochem_low + biochem_mid + biochem_high + biochem_highest    
# # print('length: ', len(peerwise_student_questions))

take2 = qs.loc[(qs['total_response'] > 29) & (qs['numAlts'] == 4)]
biochem_take2 = clean_peerwise_data(take2)

peerwise_student_questions = biochem_take2

rows = []
i = 0
for q in peerwise_student_questions:
    r = [ambiguous_unclear_information(q),
        implausible_distractors(q),
        none_of_the_above(q),
        longest_answer_correct(q),
        gratuitous_information_in_stem(q),
        true_or_false(q),
        avoid_convergence_cues(q),
        avoid_logical_cues(q),
        all_of_the_above(q),
        fill_in_the_blank(q),
        absolute_terms(q),
        word_repeats_in_stem_and_correct_answer(q),
        unfocused_stem(q),
        complex_k_type(q),
        grammatical_cues_in_stem(q),
        lost_sequence(q),
        vague_terms(q),
        #more_than_one_correct(q),
        negative_worded_stem(q),
        q.qid,
        q.courseid,
        q.quality]
    rows.append(r)
    print('i: ', i)
    i += 1
    
columns = [
    'ambiguous_unclear_information',
    'implausible_distractors',
    'none_of_the_above',
    'longest_answer_correct',
    'gratuitous_information_in_stem',
    'true_or_false',
    'avoid_convergence_cues',
    'avoid_logical_cues',
    'all_of_the_above',
    'fill_in_the_blank',
    'absolute_terms',
    'word_repeats_in_stem_and_correct_answer',
    'unfocused_stem',
    'complex_k_type',
    'grammatical_cues_in_stem',
    'lost_sequence',
    'vague_terms',
    #'more_than_one_correct',
    'negative_worded_stem',
    'qid',
    'courseid',
    'quality'
]

peerwise_results = pd.DataFrame(rows, columns=columns)
peerwise_results.to_csv("peerwise_400diffv2_results_auto_iwf.csv")

#Print the number of "Falses", which correspond to IWF, found the table across each criteria
peerwise_400diffv2_results = peerwise_results.copy()
peerwise_400diffv2_results = peerwise_400diffv2_results.drop(columns=['qid', 'courseid', 'quality'])
for c in columns[:len(columns)-3]:
    print(c, ': ', (~peerwise_400diffv2_results[c]).values.sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdf[cName] = cdf[cName].str.replace(r'<[^<>]*>', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdf[cName] = cdf[cName].apply(lambda x: re.sub("&nbsp;",  "", x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdf[cName] = cdf[cName].str.replace(' +', ' ', regex=True)
A value is try

removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
removing empty option question
opt_entity:  ETC coupling
lemma_nouns_opt:  ['etc', 'coupling']
ans_entity:  ATP production in type I

Question: C1-17 What are the main characteristics of a rate limiting step, or enzyme, of metabolism?
 Slow, [S] &lt;&lt; Km and saturated with substrate
Fast, catalyses a reversible reaction and is working at Vmax
Slow, catalyses a irreversible reaction and is working at Vmax
Slow, operating under low [S] and catalyses an irreversible reaction
Correct option: Slow, catalyses a irreversible reaction and is working at Vmax
Quality: 2.82
repeating_nouns_synonyms:  ['slow']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  slow
--- This might be a K-type question
--- verb tense doesn't align between answer other options
i:  6
Question: In our daily nitrogen balance:
 The amount of output of nitrogen is less than the input
Approximately 16.5g of nitrogen is excreted through urea everyday
Approximately 16.5g of nitrogen is excreted through urea and faeces everyday
The amount of input of nitrogen is the same as the output
Correct option: The amount 

opt_entity:  DNA Polymerase 1
lemma_nouns_opt:  ['dna', 'polymerase']
ans_entity:  dNTPs
lemma_nouns_ans:  ['dntps']
Question: Which of the following PCR components is used in the Sanger method of DNA Sequencing?
 DNA Polymerase 1
Single-stranded DNA Molecule
A pair of DNA primers
dNTPs
Correct option: dNTPs
Quality: 2.8
--- distractor not similar enough
dNTPs 		 DNA Polymerase 1 		 Score: 0.0358
Question: Which of the following PCR components is used in the Sanger method of DNA Sequencing?
 DNA Polymerase 1
Single-stranded DNA Molecule
A pair of DNA primers
dNTPs
Correct option: dNTPs
Quality: 2.8
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  DNA Polymerase 1
cleaned_string:  Singlestranded DNA Molecule
cleaned_string:   pair of DNA primers
--- verb tense doesn't align between answer other options
i:  13
opt_entity:  Hydrophobic
lemma_nouns_opt:  ['hydrophobic']
ans_entity:  All of above
lemma_nouns_ans:  []
Question: The fo

--- verb tense doesn't align between answer other options
i:  22
Question: C5-21 Which of the following regarding cholesterol's role in a cell membrane is incorrect ?
 Adding cholesterol to a membrane with too many unsaturated fatty acids will decrease its fluidity.
Cholesterol acts as a membrane fluidity 'buffer'.
Adding cholesterol to a membrane with lots of saturated fatty acids will increase its solidity.
Cholesterol helps keep the membrane impermeable to some water soluble molecules.
Correct option: Adding cholesterol to a membrane with lots of saturated fatty acids will increase its solidity.
Quality: 2.86
repeating_nouns_synonyms:  ['acid', 'cholesterol']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  acid
cleaned_string:  Adding cholesterol to a membrane with too many unsaturated fatty acids will decrease its fluidity
cleaned_string:  Cholesterol acts as a membrane fluidity buffer
cleaned_string:  Cholesterol helps keep the membran

i:  30
Question: salmon and olive oil are both good sources of unsaturated fat, however deep-fried salmon with olive oil is actually unhealthy because:
 The high temperature harms the effectiveness of nutrients.
The high temperature breaks some of the double bounds in the polyunsaturated fats and turns them into monosaturated fats which is unhealthy.
The unsaturated fats is fully oxidized to saturated fats in the cooking process
Deep-frying may increases the trans fats contents of the food.
Correct option: Deep-frying may increases the trans fats contents of the food.
Quality: 2.59
repeating_nouns_synonyms:  ['nutrient']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  nutrient
--- Question stem does not contain a question
cleaned_string:  The high temperature harms the effectiveness of nutrients
cleaned_string:  The high temperature breaks some of the double bounds in the polyunsaturated fats and turns them into monosaturated fats which is 

Question: When blood glucose concentration drops below 5mM, glucagon is released by the pancreas to restore this euglycemic state. Which of the following best represents what happens when glucagon levels are high in terms of the regulatory molecule fructose 2,6-bisphosphate?
 glucagon increases - cAMP increases - PFK2 deactivates and FBPase2 activates - F26BP decreases - inhibition of glycolysis and stimulation of gluconeogenesis
glucagon increases - cAMP decreases - PFK2 inactivates and FBPase2 activates - F26BP decreases - inhibition of glycolysis and stimulation of gluconeogenesis
glucagon increases - cAMP increases - both PFK2 and FBPase2 parts deactivate - F26BP decreases - inhibition of glycolysis and stimulation of gluconeogenesis
glucagon increases - cAMP increases - both PFK2 and FBPase2 parts activate - F26BP decreases - inhibition of glycolysis and stimulation of gluconeogenesis
Correct option: glucagon increases - cAMP increases - PFK2 deactivates and FBPase2 activates - F2

Question: Which of the following is correct in describing the composition and role of the following lipoproteins?
 Chylomicrons have a high triglyceride and low protein composition so that it can be hydrolysed to release free fatty acids.
VLDL have a high protein and cholesterol composition so that it can perform reverse cholesterol transport.
LDL have a high triglyceride and low in cholesterol composition so that it can transport and remove exogenous triglycerides.
HDL have a low cholesterol and low protein composition so that it can transport endogenous triglyceride from the liver.
Correct option: Chylomicrons have a high triglyceride and low protein composition so that it can be hydrolysed to release free fatty acids.
Quality: 2.88
repeating_nouns_synonyms:  ['protein', 'composition']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  protein
cleaned_string:  VLDL have a high protein and cholesterol composition so that it can perform revers

Question: C1- 09: Imagine your friend asks for your opinion on taking a DNP supplement such as the one below, in an attempt to lose weight. Which of the following would be the best advice to give? You should -
 Consume DNP because it lowers ATP production in the cell of your choice &ndash; hence you have control over your weight loss
Consume DNP because it has no side effects and many people have had success in losing weight
Not consume DNP because it does not discriminate which cells have lowered ATP production and can cause large amounts of cell death
Not consume DNP because it does not cause you to stop ATP production and hence doesn&rsquo;t work in attempting lose weight
Correct option: Not consume DNP because it does not discriminate which cells have lowered ATP production and can cause large amounts of cell death
Quality: 2.38
repeating_nouns_synonyms:  ['atp', 'cell', 'production']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  atp


Question: Topic C03-03: If someone is affected by diabetes and does not eat for a prolonged period of time:
 Their blood glucose would fall below 4 mM, which is okay because the brain will use fatty acids instead
Their blood glucose would fall below 4 mM, which is not enough because the brain uses 120 g of glucose per day
Their blood glucose would increase, meaning that the excess glucose will get converted to fatty acids
Their blood glucose level would remain unaffected because the body will convert fatty acids to glucose
Correct option: Their blood glucose would fall below 4 mM, which is not enough because the brain uses 120 g of glucose per day
Quality: 2.27
repeating_nouns_synonyms:  ['glucose', 'blood', 'brain', 'mm']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  brain
--- absolute word in question stem
--- Question stem does not contain a question
cleaned_string:  Their blood glucose would fall below 4 mM which is okay because the b

--- Question stem is unclear
Question: If you consume a meal containing approximately 50g of glucose/starch, in relation to postprandial glucose disposal which of the following will occur:
 Blood glucose concentration will begin to rise 30 minutes after consumption and will completely return to normal after ~60 minutes
Blood glucose concentration will rise 5 minutes after consumption and will fall, returning to normal after ~20 minutes
Blood glucose concentration will rise immediately after consumption but quickly fall, returning to normal within ~90 minutes
Blood glucose concentration will rise 40 minutes after consumption and will remain at peak for ~10 minutes before returning to normal after ~90 minutes
Correct option: Blood glucose concentration will rise immediately after consumption but quickly fall, returning to normal within ~90 minutes
Quality: 2.82
repeating_nouns_synonyms:  ['minute', 'rise', 'concentration', 'blood', 'consumption']
repeating_nouns_synonyms:  ['minute', 'ri

Question: Which group of amino acids is both ketogenic and glucogenic and why?
 Group 1, because pyruvate acts as the substrate for both ketogenesis and gluconeogenesis
Group 2, because Acetyl-CoA not only produces acetoacetate but also regenerates oxaloacetate via the TCA cycle to be used in gluconeogenesis
Group 3, because acetoacetate can be converted into acetyl-CoA in a reversible reaction and participates in gluconeogenesis
Group 4, because intermediates in the TCA cycle could regenerate both pyruvate and acetyl-CoA to be used in gluconeogenesis and ketogenesis.
Correct option: Group 1, because pyruvate acts as the substrate for both ketogenesis and gluconeogenesis
Quality: 2.77
repeating_nouns_synonyms:  ['group']
repeating_nouns_synonyms:  ['group']
repeating_nouns_synonyms:  ['group']
--- absolute word in question stem
--- This might be a K-type question
--- verb tense doesn't align between answer other options
i:  71
Question: Which of the following statements is true about t

--- This might be a K-type question
--- verb tense doesn't align between answer other options
i:  78
Question: C4-49 Fat distribution after digestion. Rhonda is visiting the doctor after suffering abdominal pain and inflammation and deposits of fatty material in the skin. A blood sample is taken and centrifuged to reveal a creamy layer in the plasma. What could be the cause of Rhonda's condition?
 Abetalipoproteinemia, the inability of the body to make certain lipoproteins.
Cystic fibrosis causing a thick, mostly impenetrable mucus to line epithelial cells in both the pancreas and in the intestine.
Gall stones in her gallbladder preventing bile salts from being secreted in the intestine.
Chylomicronemia syndrome, a mutation of the gene coding for lipoprotein lipase resulting in an inability to produce the protein.
Correct option: Chylomicronemia syndrome, a mutation of the gene coding for lipoprotein lipase resulting in an inability to produce the protein.
Quality: 3.54
repeating_nouns

Question: Which of the following statements about thesubstrates and key enzyme control points for lipogenesis is TRUE?
 Acetyl CoA Carboxylase(ACC) is regulated by several factors but can only be stimulated by insulin
Fatty acyl Synthase (FAS) is the ultimate product of ACC and ACC is stimulated by FAS
The phosphorylation of ACC is catalysed by cAMP-activated protein kinase and causes the inactivation of ACC
The formation of malonyl-CoA from acetyl-CoA consumes ATP, then the decarboxylation of malonyl catalysed by FAS would cause malonyl to become active
Correct option: The phosphorylation of ACC is catalysed by cAMP-activated protein kinase and causes the inactivation of ACC
Quality: 2.67
repeating_nouns_synonyms:  ['acc']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  acc
--- absolute word in question stem
cleaned_string:  Acetyl CoA CarboxylaseACC is regulated by several factors but can only be stimulated by insulin
cleaned_string:  Fat

--- verb tense doesn't align between answer other options
i:  91
Question: If an individual consumes a meal high in carbohydrates, which of the following is likely to be observed?
 Hexokinase becomes increasingly saturated at higher glucose concentrations
Hexokinase is catalyzing at its maximum rate, however glucokinase can still respond to further increase in blood glucose levels
Both hexokinase and glucokinase will not be fully saturated
At the postprandial state, hexokinase dominates the phosphorylation process due to its high affinity for glucose
Correct option: Hexokinase is catalyzing at its maximum rate, however glucokinase can still respond to further increase in blood glucose levels
Quality: 3.07
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  Hexokinase becomes increasingly saturated at higher glucose concentrations
cleaned_string:  Both hexokinase and glucokinase will not be fully saturated
cleaned_string:  At the po

cleaned_string:  Everyday nitrogen turnover occurs in the body as proteins in the body are degraded and synthesised this turnover is higher in intestinal tissues than in muscle tissues
cleaned_string:  Under normal circumstances nitrogen intake is equal to nitrogen loss But during periods of growth pregnancy or tissue repair nitrogen intake will be higher than nitrogen loss
cleaned_string:  If Mr Arnold wants to consume plenty of nitrogen his diet should be rich in meat poultry legumes dairy products eggs and nuts
--- verb tense doesn't align between answer other options
--- vague word in question stem
i:  100
Question: An individual who has Type I Diabetes takes insulin injections daily. What would occur as a result of insulin increase?
 A
B
C
D
Correct option: A
Quality: 3.0
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  
cleaned_string:  
cleaned_string:  
i:  101
opt_entity:  Prevention of long-term hyperglycemia
lemma_nou

cleaned_string:  I Olestra is a vitamin supplements II Benefits It provides nutrition needs to the body diet on replacing any food intakes Problem Act as an artificial nutrition provider and it is highly addictive
cleaned_string:  I Olestra is a vitamin supplements II Benefits It provides nutrition needs to the body diet on replacing any food intakes Problem Does not provides essential needs of vitamin    and K
cleaned_string:  I Olestra is a fat substitute II Benefits Zero calories zero grams of cholesterol and zero grams of fat Problem Act as an artificial nutrition provider and it is highly addictive
i:  107
Question: Identify the CORRECT statement about Polymerase chain reaction (PCR).
 During the first stage, the reaction is heated to 95 degrees so the heat-resistant Taq polymerase can be activated and bind with the primers.
During the annealing stage, the two types of single-stranded primers will each bind to 3'-end and 5'-end of strands of the template and run in opposite direct

Question: Glycolysis produces 2 pyruvate molecules. During the aerobic metabolism of pyruvate, it is transported to the ------1--------- and oxidised to ------2------- by -------3--------.
 A
B
C
D
Correct option: B
Quality: 2.93
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
--- Question stem does not contain a question
cleaned_string:  
cleaned_string:  
cleaned_string:  
i:  115
Question: The liver is stimulated to release glucose from glycogen during the first few hours of fasting to prevent ________ ??
 blood glucose levels from increasing too high
hexokinase levels from increasing as this can be very dangerous
the production of pyruvate
blood glucose levels from falling too low
Correct option: blood glucose levels from falling too low
Quality: 2.5
repeating_nouns_synonyms:  ['level', 'glucose', 'blood']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  level
--- fill in the blank
--- absolute w

--- absolute word in question stem
--- This might be a K-type question
--- verb tense doesn't align between answer other options
i:  121
Question: When the terminal phosphates of ADP are hydrolysed; ___________________________.
 Energy is released and ATP is produced.
Energy is released and AMP is produced.
Energy is consumed and ATP is produced.
Energy is consumed and AMP is produced.
Correct option: Energy is released and AMP is produced.
Quality: 2.93
repeating_nouns_synonyms:  ['energy']
repeating_nouns_synonyms:  ['energy']
repeating_nouns_synonyms:  ['energy', 'amp']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  amp
--- fill in the blank
cleaned_string:  Energy is released and ATP is produced
cleaned_string:  Energy is consumed and ATP is produced
cleaned_string:  Energy is consumed and AMP is produced
i:  122
Question: Compared to RNA polymerase, DNA polymerase has a much lower error rate for nucleotide incorporation. What structur

--- absolute word in question stem
cleaned_string:  The preparatory phase production of glyceraldehyde3phosphate from glucose in glycolysis occurs in the cytoplasm but the conversion of glyceraldehyde3phosphate to pyruvate in the 2nd return phase occurs in the matrix of the mitochondria This is because pyruvate in the matrix is then able to undergo further anaerobic or aerobic metabolic pathways depending on conditions
cleaned_string:  Since glycolysis utilizes ATP in its reactions both the preparatory phase and return phase of glycolysis occur in the matrix of the of the mitochondria because the majority of the ATP produced during metabolism from is the ATPase pump is generated in here
cleaned_string:  The preparatory phase of glycolysis occurs in the matrix of the mitochondria because there is a high concentration of ATP here required for phosphofructokinase to function The substrate level phosphorylation in the return phase then occurs in the intermembrane space of the mitochondria


Question: Fill in the blanks: Lactose is the &mdash;&mdash;(1)&mdash;&mdash; in milk. Lactose is composed of &mdash;&mdash;(2)&mdash;&mdash; which is most commonly found in dairy products. The bond between the &mdash;&mdash;(3)&mdash;&mdash; carbons is known as a glycosidic bond. Lactose intolerance arises from a lack of the enzyme &mdash;&mdash;(4)&mdash;&mdash;, which aids in the breakdown of lactose.
 (1) = monosaccharide (2) = glucose and glucose (3) = first and fourth (4) = lactase
(1) = disaccharide (2) = galactose and glucose (3) = first and fourth (4) = lactase
(1) = disaccharide (2) = glucose and glucose (3) = first and third (4) = lactate
(1) = monosaccharide (2) = glucose and fructose (3) = first and fourth (4) = lactate
Correct option: (1) = disaccharide (2) = galactose and glucose (3) = first and fourth (4) = lactase
Quality: 2.65
repeating_nouns_synonyms:  ['glucose', 'lactase']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  

Question: A nucleosome is made up of:
 DNA, histone core protein
DNA, histone core protein, linker H1
RNA, histone core protein
RNA, histone core protein, linker H1
Correct option: DNA, histone core protein, linker H1
Quality: 3.45
repeating_nouns_synonyms:  ['protein', 'histone', 'dna', 'core']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  dna
--- Question stem does not contain a question
--- This might be a K-type question
i:  147
--- Question stem is unclear
Question: In the coupling of oxidative phosphorylation with fuel oxidation, is known to be equal to the synthesis of ATP because....
 The lack of proton gradient would result in a lower force generated for ATP synthesis
Leakage in the mitochondrial membrane will allow protons to flow without utilizing ATP synthase
It is determined by the demand of ATP as opposed to the supply of fuel
The coupling process is determined by the rapid regeneration of NAD from NADH
Correct option: It is

Question: Which of the following are the three main processes of the central dogma?
 Transcription, Translation, Replication
Translocation, Translation, Transcription
Translation, Transcription, Transreplication
Replication, Reproduction, Translation
Correct option: Transcription, Translation, Replication
Quality: 2.34
repeating_nouns_synonyms:  ['translation', 'transcription']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  transcription
--- This might be a K-type question
i:  156
Question: At which step of phosphorylation does glycogen convert to glucose 1-phosphate?
 Inactive adenylyl cyclase and cAMP
Active Protein Kinase A (PKA) phosphorylase
Inactive phosphorylase kinase
Active glycogen phosphorylase
Correct option: Active glycogen phosphorylase
Quality: 2.79
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  Inactive adenylyl cyclase and cAMP
cleaned_string:  Active Protein Kina

--- Question stem is unclear
opt_entity:  Both B and C are correct
lemma_nouns_opt:  ['b', 'c', 'correct']
ans_entity:  Insulin will reduce the risk of getting type 2 diabetes
lemma_nouns_ans:  ['insulin', 'risk', 'diabetes']
Question: Insulin is the major antilipolytic hormone and its decrease is the major factor in insuring the increased availability of the fatty acids needed for energy metabolism during starvation. Which of the following statement is correct about insulin'santilipolytic role regarding on diabetes.
 Insulin doesn't help with anti-lipolysis because it only deal with glucose
Insulin will reduce the risk of getting type 2 diabetes
Insulin will reduce the risk of getting type 1 diabetes
Both B and C are correct
Correct option: Insulin will reduce the risk of getting type 2 diabetes
Quality: 2.47
--- distractor not similar enough
Insulin will reduce the risk of getting type 2 diabetes 		 Both B and C are correct 		 Score: 0.0677
Question: Insulin is the major antilipolyti

opt_entity:  Ketone bodies lost in the urine
lemma_nouns_opt:  ['ketone']
ans_entity:  B and C
lemma_nouns_ans:  ['b', 'c']
Question: Which of the following options is a source of inefficiency in energy metabolism induced by the ketotic state.
 Oxidation
Ketone bodies lost in the urine
Ketone bodies can spontaneously decarboxylate
B and C
Correct option: B and C
Quality: 2.79
--- distractor not similar enough
B and C 		 Ketone bodies lost in the urine 		 Score: 0.0345
Question: Which of the following options is a source of inefficiency in energy metabolism induced by the ketotic state.
 Oxidation
Ketone bodies lost in the urine
Ketone bodies can spontaneously decarboxylate
B and C
Correct option: B and C
Quality: 2.79
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  Oxidation
cleaned_string:  Ketone bodies lost in the urine
cleaned_string:  Ketone bodies can spontaneously decarboxylate
--- verb tense doesn't align between answer

Question: Fatty acid oxidation is also known as beta-oxidation as oxidation occurs on the beta-carbon atom. This is a C 16 fatty acid molecule What products are produced after fatty acid oxidation?
 After one cycle of fatty acid oxidation, a C 14 fatty acid, two acetyl-CoA, two FADH 2 molecules and two NADH molecules are produced
After two cycles of fatty acid oxidation, a C 12 fatty acid, two acetyl-CoA, four FADH 2 molecule and four NADH molecule are produced
After two cycles of fatty acid oxidation,a C 12 fatty acid, two acetyl-CoA, two FADH 2 molecules and two NADH molecules are produced
After one cycle of fatty acid oxidation, a C 14 fatty acid, one acetyl-CoA, two FADH 2 molecules and two NADH molecules are produced
Correct option: After two cycles of fatty acid oxidation,a C 12 fatty acid, two acetyl-CoA, two FADH 2 molecules and two NADH molecules are produced
Quality: 2.84
repeating_nouns_synonyms:  ['cycle', 'acid', 'oxidation', 'molecule', 'c']
repeating_nouns_synonyms:  ['c

Question: QI no. 125 If there was a defect in the transcription of GLUT-1 mRNA resulting in the downregulated transcription of the GLUT-1 transporters, which of the following processes would be affected during the first few hours of starvation?
 Cori-Cycle and Lipolysis
Cori-Cycle and Gluconeogenesis
Cori-Cycle, Gluconeogenesis and Glycogenolysis
Cori-Cycle, Glycogenolysis and Lipolysis
Correct option: Cori-Cycle and Gluconeogenesis
Quality: 2.97
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  CoriCycle and Lipolysis
cleaned_string:  CoriCycle Gluconeogenesis and Glycogenolysis
cleaned_string:  CoriCycle Glycogenolysis and Lipolysis
i:  181
Question: The rate limiting step in a biochemical reaction can be compared to an influx of people entering a train station during peak hour. Hundreds of commuters making their way to work arrive at the station, and have to enter through the gates at the same time. As you can imagine, a line 

opt_entity:  None of the above.
lemma_nouns_opt:  ['none']
ans_entity:  During a high intensity interval training (HIIT) session in which glucose is oxidised from glycogen stores in type 2 b muscle fibres.
lemma_nouns_ans:  ['intensity', 'interval', 'training', 'hiit', 'session', 'glucose', 'glycogen', 'store', 'muscle', 'fibre']
Question: Under which of the following circumstances does Kreb&rsquo;s cycle not occur?
 During a high intensity interval training (HIIT) session in which glucose is oxidised from glycogen stores in type 2 b muscle fibres.
During a gentle walk when beta oxidation takes over and the conversion of pyruvate to Acetyl-CoA from glucose oxidation is inhibited in type 1 muscle fibres.
During a moderately intense sprinting session when both glycolysis and beta oxidation are used to generate ATP and glycogen stores are depleted to maintain blood glucose homeostasis in type 1 muscle fibres.
None of the above.
Correct option: During a high intensity interval training (HI

--- absolute word in question stem
--- This might be a K-type question
i:  194
--- Question stem is unclear
Question: Glucose will be deficit during the early stage of starvation (3-8 hours). For example, an athelete's glucose has depleted after a period of starvation. Which of the following will be switched by his MUSCLE to make alternative energy source during starvation other than glucose?
 glucagon
Glycogen
Fatty Acid
Ketone Bodies
Correct option: Fatty Acid
Quality: 2.44
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  glucagon
cleaned_string:  Glycogen
cleaned_string:  Ketone Bodies
i:  195
Question: You are researching the effect of a toxin known to accumulate in the nucleosol of a cell on RNA polymerases. You isolate and discard the nucleolus from the nucleus, and from the remainder, separate two types of RNA polymerase. You observe that the toxin decreases the activity of the lighter one (determined by mass spec). Which

opt_entity:  Foods with a low GI contain more fibre which doesn't require insulin to digest and therefore decreases the likelihood of insulin resistance.
lemma_nouns_opt:  ['food', 'gi', 'contain', 'insulin', 'therefore', 'decrease', 'resistance']
ans_entity:  All of the above.
lemma_nouns_ans:  []
Question: The Glycemic Index can be clinically important in the control of diabetes because:
 Foods with a low GI contain more fibre which doesn't require insulin to digest and therefore decreases the likelihood of insulin resistance.
Foods with a low GI contain more resistant starch, which is not absorbed in the small intestine and therefore doesn't raise glucose levels as high.
Foods with a low GI can be fermented to things that cause positive insulin sensitivity changes, such as short-chain fatty acids in the gut.
All of the above.
Correct option: All of the above.
Quality: 2.87
--- distractor not similar enough
All of the above. 		 Foods with a low GI contain more fibre which doesn't req

Question: C2-16 Energy Charge - Phosphofructokinase (PFK) is strongly affected by the levels of ATP and ADP in the cell, but it does not directly react to changes in the concentration of these nucleotides. Instead it reacts to the concentration of AMP in the cell. Why is that?
 Relative change in concentration is lower in AMP as opposed to ATP
Relative change in concentration is higher in AMP as opposed to ATP
PFK cannot detect any changes in concentration of ATP or ADP
PFK can only detect AMP levels in the cell
Correct option: Relative change in concentration is higher in AMP as opposed to ATP
Quality: 3.0
repeating_nouns_synonyms:  ['amp', 'atp', 'concentration', 'change']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  amp
--- absolute word in question stem
cleaned_string:  Relative change in concentration is lower in AMP as opposed to ATP
cleaned_string:  PFK cannot detect any changes in concentration of ATP or ADP
cleaned_string:  PFK 

--- absolute word in question stem
--- This might be a K-type question
i:  218
Question: C1-09 Uncouplers:An uncoupler of oxidative phosphorylation would most likely have this effect on the ATP production chain.
 No heat being produced by the operation of the electron transport chain
The electron transport chain occurs without ATP production
The incapacity to oxidise NADH by the electron transport chain
ADP is depended on in order for the electron transport chain to operate.
Correct option: The electron transport chain occurs without ATP production
Quality: 2.56
repeating_nouns_synonyms:  ['electron', 'transport', 'chain']
repeating_nouns_synonyms:  ['electron', 'transport', 'chain']
repeating_nouns_synonyms:  ['electron', 'transport', 'chain']
--- The answer entity is not found in any other options:  ORG
--- Question stem does not contain a question
cleaned_string:  No heat being produced by the operation of the electron transport chain
cleaned_string:  The incapacity to oxidise NADH 

Question: Based on your knowledge of the C-value paradox and the values provided, which option correctly ranks the following organisms from least complex (1) - most complex (3):
 A
B
C
D
Correct option: C
Quality: 2.61
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  
cleaned_string:  
cleaned_string:  
i:  228
Question: Which of the following is NOT true regarding ATP synthase
 Its structure consists of alpha and beta subunits
It uses ADP to produce units of ATP
It moves 3 protons to generate 1 ATP
Uses the H+ gradient in the process of making ATP
Correct option: Its structure consists of alpha and beta subunits
Quality: 2.05
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  It uses ADP to produce units of ATP
cleaned_string:  It moves 3 protons to generate 1 ATP
cleaned_string:  Uses the H gradient in the process of making ATP
i:  229
Question: Which of the following opt

Question: Thymine in DNA is replaced by uracil in RNA. Which of the following statements is CORRECT?
 Uracil makes RNA less thermodynamically stable
Thymine is a precursor for the synthesis of uracil
Uracil and thymine have different Watson-Crick base pairing partners
Thymine provides DNA with greater resistance to spontaneous mutation
Correct option: Thymine provides DNA with greater resistance to spontaneous mutation
Quality: 2.76
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['thymine']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  thymine
--- absolute word in question stem
cleaned_string:  Uracil makes RNA less thermodynamically stable
cleaned_string:  Thymine is a precursor for the synthesis of uracil
cleaned_string:  Uracil and thymine have different WatsonCrick base pairing partners
i:  236
Question: Which of the following is INCORRECT?
 Higher temperatures are need to break hydrogen bonds between Guanine and Cytosine
Gu

Question: Which of the following chemical strategies is NOT involved in glycolysis
 Phosphorylation of glucose by hexokinase
The isomerisation of glucose 6-phosphate to fructose 6-phosphate
ATP is harvested when the three-carbon fragments are oxidised to pyruvate
Glucose 1-phosphate joins to the nucleotide UDP, which becomes the active form of glucose
Correct option: Glucose 1-phosphate joins to the nucleotide UDP, which becomes the active form of glucose
Quality: 3.12
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  Phosphorylation of glucose by hexokinase
cleaned_string:  The isomerisation of glucose 6phosphate to fructose 6phosphate
cleaned_string:  ATP is harvested when the threecarbon fragments are oxidised to pyruvate
--- verb tense doesn't align between answer other options
i:  245
opt_entity:  The structures of bilayers and micelles are formed from amphipathic molecules, that is both hydrophobic and hydrophilic propertie

Question: C3-A04 The microbiota in our colon is a wonderful community. It is able to take otherwise indigestible polysaccharides, such as dietary fibre, and turn it into useful short-chain fatty acids (SCFA). However, depending on the composition of the colonic microbiota, different amounts of SCFA will be produced. A microbiota with increased abundance of Firmicutes and decreased abundance of Bacteroidetes is able to process fibre well, and produces SCFA more efficiently. However, excess SCFA production can lead to weight gain due to the extra caloric content extracted from the dietary fibre. The root cause of obesity is excess caloric intake compared to expenditure. Based on this information (and ignoring the effects of SCFA on FFAs - as mentioned in the journal article), which one of the following conditions are most likely to contribute to obesity, assuming the rest of the diet is exactly the same?
 A
B
C
D
Correct option: A
Quality: 2.64
--- CTTR above 4.5, text is too complex and

--- verb tense doesn't align between answer other options
i:  258
Question: Following his experiment ontwo strains of Streptococcus pneumoniae (The Griffith's experiment), Frederick Griffith concluded...
 Transfer of DNA could transform a nonvirulent strain of bacteria into a virulent strain
Dead strains of bacteria are capable of transferring their DNA
If you have two of the same strain of bacteria one may be virulent whilst the other is not
Materials from a virulent bacterium can be transferred to a nonvirulent strain of the same bacteria and &ldquo;transform&rdquo; it
Correct option: Materials from a virulent bacterium can be transferred to a nonvirulent strain of the same bacteria and &ldquo;transform&rdquo; it
Quality: 2.64
--- longest option is correct
repeating_nouns_synonyms:  ['bacteria', 'strain']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  strain
--- The answer entity is not found in any other options:  ORG
--- Question stem 

Question: The body implements a general strategy involving conservation, recycling and de novo glucose formation to maintain its glucose requirements during starvation. Which row correctly fills in the blanks? Glucose conservation includes certain tissues such as 1 switching to Fatty Acids as an alternate fuel during starvation. Glucose 2 includes when PDH is turned 3 by phosphorylation. Pyruvate is converted into lactate and then taken up by the liver to be remade into glucose in a process known as the 4 .
 A
B
C
D
Correct option: C
Quality: 3.31
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  
cleaned_string:  
cleaned_string:  
i:  266
Question: TAG:10, PURPLE An agonist contains an alcohol, amine and aromatic ring, all of which act as binding groups. Which of the following modifications is most likely to result in an antagonist?
 Converting the alcohol to a methyl ether
Adding an extra aromatic ring to the structure
Synthes

Question: In which situation would anaplerosis be beneficial?
 When we want to accelerate the rate of glycolysis
When extra oxaloacetate has been added to the Krebs cycle and we want to take advantage of it to drive gluconeogenesis
When oxaloacetate has been extracted from the Krebs cycle for use elsewhere
When a lack of amino acids slows down gluconeogenesis
Correct option: When oxaloacetate has been extracted from the Krebs cycle for use elsewhere
Quality: 3.14
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['cycle', 'oxaloacetate', 'krebs']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  cycle
cleaned_string:  When we want to accelerate the rate of glycolysis
cleaned_string:  When extra oxaloacetate has been added to the Krebs cycle and we want to take advantage of it to drive gluconeogenesis
cleaned_string:  When a lack of amino acids slows down gluconeogenesis
--- verb tense doesn't align between answer other options
i:  275


--- Question stem is unclear
Question: Which enzyme is used for the replication in case of Sanger&rsquo;s method of sequencing?
 Polymerase I
Smaller subunit polymerase I
Polymerase III
Larger subunit polymerase I
Correct option: Larger subunit polymerase I
Quality: 2.23
repeating_nouns_synonyms:  ['polymerase']
repeating_nouns_synonyms:  ['polymerase', 'subunit']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  subunit
--- absolute word in question stem
cleaned_string:  Polymerase I
cleaned_string:  Smaller subunit polymerase I
cleaned_string:  Polymerase III
i:  285
Question: During beta-oxidation, how is Coenzyme A able to successfully trap fatty acids inside the cell?
 It prevents fatty acid binding proteins from associating with fatty acids to cause active transport out of the cell
When bound to fatty acid, the large, charged structure prevents the molecule from passing back through the lipid bilayer
Coenzyme A reacts with carnitine to 

Question: C1-18: Rate Limiting Steps in Fatty Acid Oxidation The attachment of fatty acids on carnitine by carnitine acyltransferase I (CAT-I) and their subsequent transport into the mitochondria is often considered the rate-limiting step in &beta;-oxidation in the liver. The discovery of an alien species reveals that fatty acids are oxidised in the same way in their bodies and where in their liver, the CAT-I reaction is also the rate-limiting step. However, in their heart and skeletal muscle, the concentration of malonyl-CoA is estimated to be in a range that would, in theory, produce a constant inhibition on heart and skeletal muscle CAT-I. Yet, &beta;-oxidation still proceeds in the heart and skeletal muscle in these aliens. Which of the following possibilities cannot account for this observation? (Question inspired by Eaton 2002 (Eaton, S 2002, &lsquo;Control of mitochondrial &beta;-oxidation flux&rsquo;, Progress in Lipid Research, vol. 41, no. 3, pp. 197&ndash;239.))
 The majorit

--- Question stem is unclear
Question: In a normal starvation alternative power source for the brain and other tissues is required. Metabolic acidosis in the face of excessively high glucose involves what? And the repercussions of this on the blood's pH is?
 Ketone bodies, fatty acids and lactate, resulting in an increase in blood pH.
Ketone bodies, and fatty acids, resulting in a decrease in blood pH.
Ketone bodies, fatty acids and lactate, resulting in a decrease in blood pH.
Ketone bodies, and lactate, resulting in an increase in blood pH.
Correct option: Ketone bodies, fatty acids and lactate, resulting in a decrease in blood pH.
Quality: 2.92
repeating_nouns_synonyms:  ['acid', 'ketone', 'ph', 'lactate', 'blood', 'body']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  acid
--- This might be a K-type question
i:  298
Question: Explain how energy is put in to drive this anabolic reaction
 hydrolysis of adenosine triphosphate
hydrolysis o

Question: Hydroelectricity is similar to our metabolism. It stores water in a reservoir. Water is released from a reservoir to flow through a turbine spinning it, which in turn activates a generator which produces electricity. Which components of this system best represents those of our metabolism.
 Water = Fuels (fatty acids, carbohydrates) Reservoir = Cytoplasm Turbine + Generator = Electron transport chain and F0 channel + ADP + Pi Electricity = ATP
Water = Oxygen Reservoir = White adipose tissue Turbine + Generator = Electron transport chain and F0 channel + ADP + Pi Electricity = ADP + Pi
Water = Fuels (fatty acids, carbohydrates) Reservoir = Cytoplasm Turbine/generator = Electron transport chain Electricity = ATP
Water = Fuels (fatty acids, carbohydrates) Reservoir = Cytoplasm Turbine + Generator = Electron transport chain Electricity = ADP + Pi
Correct option: Water = Fuels (fatty acids, carbohydrates) Reservoir = Cytoplasm Turbine + Generator = Electron transport chain and F0 c

--- This might be a K-type question
i:  314
opt_entity:  The temperature at which the reaction occurs is increased.
lemma_nouns_opt:  ['temperature', 'reaction']
ans_entity:  The fragments of cDNA produce multiple bands during agarose gel electrophoresis.
lemma_nouns_ans:  ['fragment', 'band', 'electrophoresis']
Question: During a reverse transcription reaction that involves transcribing a long segment of RNA, the concentration of random primers to be used according to the protocol is 10nM. However, a researcher misreads the protocol and uses a concentration of 100nM. Which of the following is the most likely result?
 There is a lower than expected yield of cDNA fragments.
The time taken for the reverse transcription process is longer than expected.
The fragments of cDNA produce multiple bands during agarose gel electrophoresis.
The temperature at which the reaction occurs is increased.
Correct option: The fragments of cDNA produce multiple bands during agarose gel electrophoresis.
Qua

--- This might be a K-type question
i:  321
Question: Glycolysis is a process which produces ATP very quickly. Which of these statements is false:
 The formation of ATP is very little but very efficient
The product pyruvate transports into mitochondria for full oxidation
Glucose 6-phosphate is broken down to form a net value of two ATP molecules
Glucose 6-phosphate is broken down into one molecule of pyruvate
Correct option: Glucose 6-phosphate is broken down into one molecule of pyruvate
Quality: 2.88
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['glucose', 'molecule']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  glucose
cleaned_string:  The formation of ATP is very little but very efficient
cleaned_string:  The product pyruvate transports into mitochondria for full oxidation
cleaned_string:  Glucose 6phosphate is broken down to form a net value of two ATP molecules
--- verb tense doesn't align 

opt_entity:  non of the above
lemma_nouns_opt:  ['non']
ans_entity:  1. To increase the level of fatty acid oxidation enzyme activity 2. to assist in beta oxidation 3. Carnitine
lemma_nouns_ans:  ['level', 'oxidation', 'enzyme', 'activity', 'assist', 'beta', 'oxidation', 'carnitine']
Question: C2-35 Glycogen sparing is an important technique used by athletes in attempt to burn alternative fuel sources rather than glucose. To achieve this, one must increase their reliance on fatty acid oxidation and thus increase the rate and usage of fatty acids. One way to achieve this is to attempt to start FA release from white Adipose tissue by drinking coffee and limiting consumption of glucose, alternatively one can include aerobic exercises in their training. Above shows a very simplistic explanation of how aerobic exercise is a good method of glycogen sparing. Answer, by choosing the best response for each question: 1. What is the main purpose of aerobic training in relation to increased FA oxi

--- absolute word in question stem
cleaned_string:  Binds to extracellular receptor gt Phosphorylation cascade gt activation of transcriptor factors in the nucleus gt TFs bind tospecific DNA sequence via their zinc fingers gt modify expression by recruiting cofactors
cleaned_string:  Binds to extracellular receptor gt protein cascade gt allosteric activation of transcriptor factors in the nucleus gt TFs bind to specific DNA sequence via their ferritin fingers gt modify expression by recruiting cofactors
cleaned_string:  Diffuses through cell membrane gt Oestrogen bindsdirectly to a specific DNA sequence with its zinc fingers gt modify expression by recruiting cofactors
i:  334
Question: Usain Bolt has decided that instead of sprinting, he wants to be able to run long distance endurance races, so that in the event of a zombie apocalypse he can not only outrun zombies, but put enough distance between them to ensure his safety. Thus, Usain starts training for long distance running, and de

Question: Trc_9: In eukaryotes, which of the following statement regarding post transcriptional processing is CORRECT?
 Capping is the addition of a methyl-guanosine risidue at 3' end.
Splicing is a sewing selection of the mRNA and cutting out exons.
Addition of a chain of 10s - 100s of adenines at 3' end.
From the process of splicing the introns will be used agian.
Correct option: Addition of a chain of 10s - 100s of adenines at 3' end.
Quality: 2.63
repeating_nouns_synonyms:  ['end', 'addition']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  end
--- fill in the blank
cleaned_string:  Capping is the addition of a methylguanosine risidue at 3 end
cleaned_string:  Splicing is a sewing selection of the mRNA and cutting out exons
cleaned_string:  From the process of splicing the introns will be used agian
--- verb tense doesn't align between answer other options
i:  341
--- Question stem is unclear
Question: After 10minutes of continuous exer

Question: Fill in the blank: Consistent hyperglycemia stimulates the over-production of _______ from the ____ cells of the pancreas, which can result in ________ resistance, as well as ________ diabetes.
 Glucagon, alpha, glucagon, type II
Insulin, beta, insulin, type II
Glucagon, beta, glucagon, type I
Insulin, alpha, insulin, type II
Correct option: Insulin, beta, insulin, type II
Quality: 2.65
repeating_nouns_synonyms:  ['ii', 'type']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  ii
--- fill in the blank
--- This might be a K-type question
i:  351
Question: During the formation of the ketone body, what product(s) would be generated by the enzyme thiolase and HMG-CoA lyse?
 Both thiolase andHMG-CoA lyse generateCoA-SH.
Boththiolase and HMG-CoA lyse generateAcetyl-CoA.
Thiolase generates CoA-SH and HMG-CoA lyse generates Acetyl-CoA.
Thiolase generates Acetyl-CoAand HMG-CoA lyse generates CoA-SH.
Correct option: Thiolase generates CoA-SH 

Question: Neurexins are a family of presynaptic cell adhesion proteins that have roles in connecting neurons at the synapse. Neurexin genes and proteins are studied in the lab, and here is a picture regarding alternative splicing of a neurexin gene shown below. Given that exon 3 and exon 4 can be alternatively spliced, which of the following option is TRUE?
 Only 2 neurexin proteins in total are resulted from alternative splicing from neurexin gene
Neurexin proteinsA and B are not from the same gene
Neurexin proteins A and B are functionally different
Introns and alternatively spliced exons are removed during transcription of neurexin gene
Correct option: Neurexin proteins A and B are functionally different
Quality: 3.39
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['b']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  b
--- absolute word in question stem
cleaned_string:  Only 2 neurexin proteins in total are resulted from altern

Question: Why does NOT having the 5' to 3' exonuclease make Klenow so useful in molecular biology?
 All the 5' to 3' exonuclease chewing up ahead of replication makes it too slow to be practical
The 5' to 3' exonuclease can act unpredictably in vitro leading to lots of mistakes
The &lsquo;futile&rsquo; cycle of nucleotide incorporation and removal wastes tons of dNTP substrate
Removal of the 5&rsquo; to 3&rsquo; exonuclease makes the polymerase stay on track better
Correct option: The &lsquo;futile&rsquo; cycle of nucleotide incorporation and removal wastes tons of dNTP substrate
Quality: 2.42
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['removal']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  removal
--- The answer entity is not found in any other options:  ORG
cleaned_string:  All the 5 to 3 exonuclease chewing up ahead of replication makes it too slow to be practical
cleaned_string:  The 5 to 

Question: C1-01: Sally is a race driver who is about to compete in the F1 tournament. Sally believes that a full tank is key to winning, and has invented a pipe that transports petrol into the engine to keep it topped up. Howard is Sally's main competition who drives the same car. Howard has a different tactic, and has implemented a state of the art internal combustion engine which combusts fuel and oxygen to move pistons 2x faster than those of regular combustion engines. Using the information above and your knowledge of metabolism, select the most correct option from the statements below.
 It is likely that Sally will win the tournament given that her petrol tank is always full. Fuel levels are primarily responsible for the rate of car acceleration, and can be likened to the concentration of fatty acids and carbohydrates in our bodies, where a greater concentration leads to a faster rate of metabolism
The pistons in Howard's car can be likened to the spinning protein past the F0 chan

Question: Mr. Green, 60, who has been told that he needs to lose weight and in particular control the cholesterol level in his blood, otherwise his health will be threatened. Which way is the best to help Mr. Green?
 Immediately become a vegan, no more meat.
Reduce the amount of full fat milk, butter and sausages in meals.
Include food that contains high amount of phytosterols to inhibit cholesterol, such as walnuts, vegetable, fruits and legumes, in total at least 1 kg per day to reduce cholesterol as fast as possible.
Most of the cholesterol is produced in the body, not from the intake of food, he just needs to change dietary habit and do more exercise, but he can keep smoking.
Correct option: Reduce the amount of full fat milk, butter and sausages in meals.
Quality: 2.42
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['amount']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  amount
--- This might be a K-type question
--- verb t

Question: If a mutation caused a cell to rapidly reduce its rate of functioning, ADP stores would:
 decrease as they are turned into stable ATP
remain stagnant as the cell no longer functions
decrease as they are quickly used by the body for energy
increase to accomodate for increased ADP usage by other cells
Correct option: decrease as they are turned into stable ATP
Quality: 2.51
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['decrease']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  decrease
--- The answer entity is not found in any other options:  ORG
--- Question stem does not contain a question
cleaned_string:  remain stagnant as the cell no longer functions
cleaned_string:  decrease as they are quickly used by the body for energy
cleaned_string:  increase to accomodate for increased ADP usage by other cells
--- verb tense doesn't align between answer other options
i:  388
Question: C2-25 Kevin is waiting for his friend at

Question: Scientists collected some blood samples from individuals who have been starving for more than 30 days to measure the plasma level of glucose, fatty acids and ketone bodies as shown in the graph below. Which of the following is NOT the reason whystarving for more than 30 daysis unsustainable?
 Other organs still need constant glucose supply as their main energy source from proteolysis
Ketone bodies can spontaneous decarboxylate acetoacetate to acetone which is a useless molecule
The break down of protein can only generate 25% glucose (1g protein= 1/4 g glucose)
Depletion of protein from tissue compromised immune system
Correct option: The break down of protein can only generate 25% glucose (1g protein= 1/4 g glucose)
Quality: 2.76
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  ['protein']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  protein
--- absolute word in question stem
cleaned_string

In [398]:
# Calculate the correlation matrix
corr_matrix = peerwise_results.corr()

# Number of tests to conduct
num_tests = 5

# Significance level (alpha)
alpha = 0.05

# Bonferroni corrected significance level
bonferroni_alpha = alpha / num_tests

# Print the correlation matrix
print(f"Bonferroni corrected alpha: {bonferroni_alpha}")
corr_matrix

Bonferroni corrected alpha: 0.01


Unnamed: 0,ambiguous_unclear_information,implausible_distractors,none_of_the_above,longest_answer_correct,gratuitous_information_in_stem,true_or_false,avoid_convergence_cues,avoid_logical_cues,all_of_the_above,fill_in_the_blank,...,unfocused_stem,complex_k_type,grammatical_cues_in_stem,lost_sequence,vague_terms,more_than_one_correct,negative_worded_stem,qid,courseid,quality
ambiguous_unclear_information,1.0,0.027856,0.093918,-0.009219,0.002034,0.038109,-0.049504,0.117052,-0.023956,0.002034,...,-0.04389,-0.041221,0.052327,,-0.054114,0.000206,0.032566,-0.040438,-0.042121,0.109842
implausible_distractors,0.027856,1.0,-0.040561,0.105541,0.045466,-0.029022,-0.204681,-0.084866,-0.0233,0.005844,...,-0.009519,-0.147433,0.105275,,0.00135,-0.067654,0.08361,-0.014047,-0.018423,0.175869
none_of_the_above,0.093918,-0.040561,1.0,0.060444,-0.027558,0.073101,0.046295,-0.031863,-0.008748,-0.027558,...,0.109886,-0.000275,0.033183,,-0.01976,0.003086,-0.011718,-0.09098,-0.086918,-0.026564
longest_answer_correct,-0.009219,0.105541,0.060444,1.0,-0.053874,0.196816,-0.055594,-0.016988,-0.017102,-0.053874,...,-0.034209,-0.108213,0.05022,,-0.038631,-0.015905,0.149126,-0.034584,-0.03484,0.056293
gratuitous_information_in_stem,0.002034,0.045466,-0.027558,-0.053874,1.0,-0.087275,0.038447,-0.009104,-0.01583,0.005387,...,-0.068063,0.08868,-0.096994,,-0.035759,-0.029686,0.032171,0.197406,0.197992,-0.115811
true_or_false,0.038109,-0.029022,0.073101,0.196816,-0.087275,1.0,0.050957,-0.039983,-0.027704,-0.052607,...,-0.092676,-0.096315,0.17406,,-0.01535,0.01715,0.349515,-0.054649,-0.051627,0.044058
avoid_convergence_cues,-0.049504,-0.204681,0.046295,-0.055594,0.038447,0.050957,1.0,-0.0516,0.051447,-0.011002,...,0.013766,-0.167382,0.1843,,0.116214,-0.154928,-0.048331,-0.016716,-0.014276,-0.097631
avoid_logical_cues,0.117052,-0.084866,-0.031863,-0.016988,-0.009104,-0.039983,-0.0516,1.0,-0.018303,-0.009104,...,-0.078696,-0.032843,-0.006469,,-0.041345,-0.045199,0.099135,0.103182,0.106941,0.03203
all_of_the_above,-0.023956,-0.0233,-0.008748,-0.017102,-0.01583,-0.027704,0.051447,-0.018303,1.0,-0.01583,...,-0.021606,-0.031797,0.066421,,-0.011351,0.001773,-0.035997,0.125361,0.127883,-0.006246
fill_in_the_blank,0.002034,0.005844,-0.027558,-0.053874,0.005387,-0.052607,-0.011002,-0.009104,-0.01583,1.0,...,0.016226,-0.005744,-0.144107,,-0.035759,-0.100228,0.003057,0.197839,0.202253,-0.019416


## Peerwise Biochemistry - 50 Questions 

In [337]:
# #Maybe grab 10 from each: 1.5-1.99, 2.0-2.49, 2.50-2.99, 3.00-3.49, 3.50-3.99
# #It's fine in the paper to say these are higher quality and came from a different system, comparing OLI and PeerWise is not 
# #the point, it's just that even when such a system is used, problems persist with student-authored questions.
# qs = pd.read_csv('peerwise_questions.csv')
# def clean_df(fdf):
#     fdf = fdf[~fdf['question'].str.casefold().str.contains("img")]
#     columns_to_clean = ['question', 'altA', 'altB', 'altC', 'altD']

#     for cName in columns_to_clean:
#         fdf[cName] = fdf[cName].str.replace(r'<[^<>]*>', ' ', regex=True)
#         fdf[cName] = fdf[cName].apply(lambda x: re.sub("&nbsp;",  "", x))
#         fdf[cName] = fdf[cName].str.replace(' +', ' ', regex=True)
#         fdf[cName] = fdf[cName].str.strip()
        
#         fdf[fdf[cName].apply(lambda x: len(x.strip()) > 2)]
#         fdf[cName].replace('', np.nan, inplace=True)
#     fdf.dropna(subset=['altA'], inplace=True)
#     return fdf

# biochem_lowest = qs.loc[(qs['avg_rating'] < 2.0) & (qs['avg_rating'] > 1.49) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# biochem_lowest = clean_df(biochem_lowest)

# biochem_low = qs.loc[(qs['avg_rating'] < 2.5) & (qs['avg_rating'] > 1.99) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# biochem_low = clean_df(biochem_low)

# biochem_mid = qs.loc[(qs['avg_rating'] < 3.0) & (qs['avg_rating'] > 2.49) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# biochem_mid = clean_df(biochem_mid)

# biochem_high = qs.loc[(qs['avg_rating'] < 3.5) & (qs['avg_rating'] > 2.99) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# biochem_high = clean_df(biochem_high)

# biochem_highest = qs.loc[(qs['avg_rating'] > 3.49) & (qs['total_ratings'] > 19) & (qs['numAlts'] == 4)]
# biochem_highest = clean_df(biochem_highest)

# sampled_dfs = [biochem_lowest.sample(n=10), biochem_low.sample(n=10), biochem_mid.sample(n=10), biochem_high.sample(n=10), biochem_highest.sample(n=10)]
# fifty_biochem_questions = pd.concat(sampled_dfs, axis=0).reset_index()

#This uses the ones tagged by IWF
# qs = pd.read_csv('peerwise_50_questions.csv')

# peerwise_biochem_questions = []
# for index, row in qs.iterrows():
#     question = MultipleChoiceQuestion(
#         stem=row['question'],
#         options=[row['altA'], row['altB'], row['altC'], row['altD']],
#         correct_option= row['alt' + row['answer'].strip()],
#         qid = row['student'],
#         courseid = row['course'],
#         quality = row['course']
#     )
#     peerwise_biochem_questions.append(question)

# rows = []
# i = 0
# for q in peerwise_biochem_questions:
#     r = [ambiguous_unclear_information(q),
#         implausible_distractors(q),
#         none_of_the_above(q),
#         longest_answer_correct(q),
#         gratuitous_information_in_stem(q),
#         true_or_false(q),
#         avoid_convergence_cues(q),
#         avoid_logical_cues(q),
#         all_of_the_above(q),
#         fill_in_the_blank(q),
#         absolute_terms(q),
#         word_repeats_in_stem_and_correct_answer(q),
#         unfocused_stem(q),
#         complex_k_type(q),
#         grammatical_cues_in_stem(q),
#         lost_sequence(q),
#         vague_terms(q),
#         more_than_one_correct(q),
#         negative_worded_stem(q),
#         q.qid,
#         q.courseid,
#         q.quality]
#     rows.append(r)
#     i += 1
    
# columns = [
#     'ambiguous_unclear_information',
#     'implausible_distractors',
#     'none_of_the_above',
#     'longest_answer_correct',
#     'gratuitous_information_in_stem',
#     'true_or_false',
#     'avoid_convergence_cues',
#     'avoid_logical_cues',
#     'all_of_the_above',
#     'fill_in_the_blank',
#     'absolute_terms',
#     'word_repeats_in_stem_and_correct_answer',
#     'unfocused_stem',
#     'complex_k_type',
#     'grammatical_cues_in_stem',
#     'lost_sequence',
#     'vague_terms',
#     'more_than_one_correct',
#     'negative_worded_stem',
#     'qid',
#     'courseid',
#     'quality'
# ]

# peerwise_biochem_results = pd.DataFrame(rows, columns=columns)
# peerwise_biochem_results.to_csv("biochem_results_auto_iwf.csv")

# #Print the number of "Falses", which correspond to IWF, found the table across each criteria
# for c in columns:
#     print(c, ': ', (~peerwise_biochem_results[c]).values.sum())

Question: Which of the following is not a characteristic of an emulsion forming molecule?
 They are amphiphilic.
They act as detergents.
They form dricelles.
They are amphipatic.
Correct option: They form dricelles.
Quality: 1.6
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
--- The answer entity is not found in any other options:  ORG
cleaned_string:  They are amphiphilic
cleaned_string:  They act as detergents
cleaned_string:  They are amphipatic
--- GPT-3 believes the answer is incorrect:  They are amphipatic.   They form dricelles.
Question: At what temperature is the DNA double helix denatured in PCR?
 54 o C
95 o C
60 o C
75 o C
Correct option: 95 o C
Quality: 1.81
repeating_nouns_synonyms:  ['c']
repeating_nouns_synonyms:  ['c']
repeating_nouns_synonyms:  ['c']
cleaned_string:  54 o 
cleaned_string:  60 o 
cleaned_string:  75 o 
opt_entity:  Growth factors
lemma_nouns_opt:  ['growth', 'factor']
ans_entity:  DNA-dependent RNA polymerase


--- GPT-3 believes the answer is incorrect:  It is uncalled for. Because endogenous antioxidant will be enough to balance the free-radical produced from exercise.   It is uncalled for. Because exercise might cause cells to produce more free-radical, but it also will produce more endogenous antioxidant, and with natural dietary antioxidant intake (e.g. in orange), there won&rsquo;t be ill effect on the body.
opt_entity:  Initiation
lemma_nouns_opt:  ['initiation']
ans_entity:  Relocation
lemma_nouns_ans:  ['relocation']
Question: What is a process NOT used inEukaryotic translation?
 Initiation
Elongation
Relocation
Termination
Correct option: Relocation
Quality: 1.94
--- distractor not similar enough
Relocation 		 Initiation 		 Score: 0.1351
Question: What is a process NOT used inEukaryotic translation?
 Initiation
Elongation
Relocation
Termination
Correct option: Relocation
Quality: 1.94
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_s

--- GPT-3 believes the answer is incorrect:  All of the above   Uses very low amounts of DNA
Question: The components of a computer including the devices (such as a printer), hard-drive and RAM storage can be said to be analogous to the information storage components in a cell; the DNA, RNA and protein. Given this analogy, and thinking about which parts of the computer these cell components are similar to, choose the most correct answer from the following.
 DNA is like the hard-drive of the computer as different computers, like every different cell in your body, contains different information stored in their DNA so they are able to perform the different purposes needed.
While it is impossible for information to be sent directly from the hard-drive to devices such as printer, this is a point the metaphor breaks as it is possible for information to be sent from DNA directly to protein.
Information can never flow from DNA to RNA, only from DNA to RNA in the same sense that information in 

Question: You are trying to create a standard curve, but accidentally take your measurements using a cuvette with a 2cm light path instead of 1cm. What impact does this increased light path have on the equation of your resulting standard curve?
 The gradient of the curve becomes steeper, or greater, than you expected.
The gradient of the curve becomes flatter, or less, than you expected.
The gradient of the curve becomes inverted (i.e. negative/ slopes downward instead of upward).
No effect.
Correct option: The gradient of the curve becomes steeper, or greater, than you expected.
Quality: 2.36
repeating_nouns_synonyms:  ['gradient', 'curve']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  gradient
cleaned_string:  The gradient of the curve becomes flatter or less than you expected
cleaned_string:  The gradient of the curve becomes inverted ie negative slopes downward instead of upward
cleaned_string:  No effect
--- GPT-3 believes the answer

--- GPT-3 believes the answer is incorrect:  None of the above   The overall ATP utilization is greater in gluconeogenesis than in glycolysis
Question: Which force contributes the most to stabilizing the DNA double-helix?
 a) Hydrophobic bases are found in the interior of the helix where each base- pair is stabilized by the same number of hydrogen bonds.
b) Hydrophilic sugar-phosphate groups are found on the exterior of the helix where they can interact with water.
c) Non-covalent N-glycosidic bonds can form between nitrogenous bases in opposite strands in the helix.
d) Covalent base-stacking interactions can occur between adjacent bases within the same strand in the helix.
Correct option: b) Hydrophilic sugar-phosphate groups are found on the exterior of the helix where they can interact with water.
Quality: 2.77
repeating_nouns_synonyms:  ['helix']
repeating_nouns_synonyms:  ['helix']
repeating_nouns_synonyms:  ['helix']
cleaned_string:  a Hydrophobic bases are found in the interior 

--- GPT-3 believes the answer is incorrect:  I and IV are true II and III are false   III and IV are true I and II are false
Question: Which of the following is correct in describing the role of NAD+ in ATP synthesis?
 The pumping of protons powered by NAD+ allows the uncoupling protein to dissipate the proton gradient, hence resulting in ATP synthesis.
NAD+ breaks down fats and carbohydrates in the body, and that then provides the electrons for the electron transport chain, hence resulting in ATP synthesis.
NAD+ is reduced to NADH via the breakdown of fats and carbohydrates in the body, and that then provides the electrons for the electron transport chain, hence resulting in ATP synthesis.
NAD+ binds to the uncoupling protein to inhibit ATP synthesis
Correct option: NAD+ is reduced to NADH via the breakdown of fats and carbohydrates in the body, and that then provides the electrons for the electron transport chain, hence resulting in ATP synthesis.
Quality: 3.0
repeating_nouns_synonym

--- GPT-3 believes the answer is incorrect:  The main regulator, phosphofructokinase, is strongly affected by ATP and ADP levels due to the energy charge fluctuations in the cells. However, AMP doesn&rsquo;t affect phosphofructokinase as this enzyme cannot recognize the single phosphate attached to the adenine.   Metabolic pathways are very sensitive to the energy charge of a cell (i.e. ATP levels in the cell) such that a drop in ATP levels would cause hexokinase, phosphofructokinase and pyruvate kinase to respond dramatically.
opt_entity:  I don't know
lemma_nouns_opt:  []
ans_entity:  The theory does not apply because metabolic performance DOES NOT increase with smaller meals i.e. metabolism is not regulated by fuel availability( unlike fire).
lemma_nouns_ans:  ['theory', 'performance', 'doe', 'not', 'meal', 'metabolism', 'availability', 'fire']
Question: Metaphors for Metabolism. You&rsquo;re planning a long car trip in a 4WD, and your friend strongly believes that his car &ldquo;wo

--- This might be a K-type question
--- verb tense doesn't align between answer other options
--- GPT-3 believes the answer is incorrect:  If Derek decides that the weather is too hot and decides to lie down on the tennis court instead of exercising, then the decrease in work will cause ATP consumption to DECREASE, rate of proton pumping to DECREASE, and oxygen consumption will DECREASE.   If Derek had a gigantic feast of KFC in the middle of his tennis session and there is now a readily available amount of fuel for his body to metabolise, then the rate of ATP synthesis will INCREASE, rate of proton pumping will INCREASE and oxygen consumption will also INCREASE.
Question: You want the initial rate of the reaction to be about 0.75 absorbance units per minute. What is this rate of change expressed in moles per minute in the well? Extinction Coefficient: 15 mM -1 cm -1 ,200uL solution with cuvette length of 1cm.
 50 nmol min -1
10 nmol min -1
5 nmol min- 1
15nmol min -1
Correct option: 1

TypeError: ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

## OLI Chemistry Data - 50 Questions

In [323]:
# qs = pd.read_csv('oli_chemistry_questions.csv')
# qs = qs.fillna('')

# oli_chemistry_student_questions = []
# for index, row in qs.iterrows():
#     question = MultipleChoiceQuestion(
#         stem=row['question'],
#         options=[row['altA'], row['altB'], row['altC'], row['altD']],
#         correct_option= row['altA']
#     )
#     oli_chemistry_student_questions.append(question)

# rows = []
# i = 0
# for q in oli_chemistry_student_questions:
#     r = [ambiguous_unclear_information(q),
#         implausible_distractors(q),
#         none_of_the_above(q),
#         longest_answer_correct(q),
#         gratuitous_information_in_stem(q),
#         true_or_false(q),
#         avoid_convergence_cues(q),
#         avoid_logical_cues(q),
#         all_of_the_above(q),
#         fill_in_the_blank(q),
#         absolute_terms(q),
#         word_repeats_in_stem_and_correct_answer(q),
#         unfocused_stem(q),
#         complex_k_type(q),
#         grammatical_cues_in_stem(q),
#         lost_sequence(q),
#         vague_terms(q),
#         more_than_one_correct(q),
#         negative_worded_stem(q)]
#     rows.append(r)
#     i += 1
#     print(i)

# columns = [
#     'ambiguous_unclear_information',
#     'implausible_distractors',
#     'none_of_the_above',
#     'longest_answer_correct',
#     'gratuitous_information_in_stem',
#     'true_or_false',
#     'avoid_convergence_cues',
#     'avoid_logical_cues',
#     'all_of_the_above',
#     'fill_in_the_blank',
#     'absolute_terms',
#     'word_repeats_in_stem_and_correct_answer',
#     'unfocused_stem',
#     'complex_k_type',
#     'grammatical_cues_in_stem',
#     'lost_sequence',
#     'vague_terms',
#     'more_than_one_correct',
#     'negative_worded_stem'
# ]

# oli_chemistry_results = pd.DataFrame(rows, columns=columns)
# oli_chemistry_results.to_csv("chem_results_auto_iwf.csv")

# #Print the number of "Falses", which correspond to IWF, found the table across each criteria
# for c in columns:
#     print(c, ': ', (~oli_chemistry_results[c]).values.sum())

opt_entity:  All of the above
lemma_nouns_opt:  []
ans_entity:  Proton
lemma_nouns_ans:  ['proton']
Question: Which particle weighs the least in an atom?
 Proton
Electron
Neutron
All of the above
Correct option: Proton
Quality: None
--- distractor not similar enough
Proton 		 All of the above 		 Score: 0.1279
Question: Which particle weighs the least in an atom?
 Proton
Electron
Neutron
All of the above
Correct option: Proton
Quality: None
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  Electron
cleaned_string:  Neutron
cleaned_string:  All of the above
--- GPT-3 believes the answer is incorrect:  Electron   Proton
1
opt_entity:  Rutherford
lemma_nouns_opt:  ['rutherford']
ans_entity:  Millikan
lemma_nouns_ans:  ['millikan']
*** low score, but they are the same entity ORG  &  ORG
Question: Which scientist discovered the fundamental electric charge, the charge of an electron?
 Millikan
Rutherford
Dalton
Thomson
Correct option: M

cleaned_string:  Robert  Millikan
cleaned_string:  George Stoney
cleaned_string:  Rutherford
15
Question: How can you determine the mass?
 the sum of protons and neutrons
sum of protons
sum of neurtrons
sum of electrons
Correct option: the sum of protons and neutrons
Quality: None
--- longest option is correct
repeating_nouns_synonyms:  ['proton']
--- we have a synonym of the answer being used in other answer choices, but not all of them:  proton
cleaned_string:  sum of protons
cleaned_string:  sum of neurtrons
cleaned_string:  sum of electrons
16
Question: which of these subatomic particles are neutral or uncharged in the atom?
 neutrons
electrons
protons
atom
Correct option: neutrons
Quality: None
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
repeating_nouns_synonyms:  []
cleaned_string:  electrons
cleaned_string:  protons
cleaned_string:  atom
17
Question: For Carbon to be neutral how many electrons must it have?
 2
3
4
6
Correct option: 2
Quality: None
repeating_nouns

cleaned_string:  AMU
cleaned_string:  Mass
cleaned_string:  atomic mass
31
Question: what is contained in the nucleus and what is outside of the nucleus
 neutrons only inside and protons outside
Protons and electrons inside and neutrons outside
protons  and neutrons  inside and electrons outside
electrons only inside and neutrons and protons outside
Correct option: neutrons only inside and protons outside
Quality: None
repeating_nouns_synonyms:  ['neutron', 'proton']
repeating_nouns_synonyms:  ['neutron', 'proton']
repeating_nouns_synonyms:  ['neutron', 'proton']
--- absolute word in question stem
cleaned_string:  Protons and electrons inside and neutrons outside
cleaned_string:  protons  and neutrons  inside and electrons outside
cleaned_string:  electrons only inside and neutrons and protons outside
--- GPT-3 believes the answer is incorrect:  protons  and neutrons  inside and electrons outside   neutrons only inside and protons outside
32
Question: What is the electrical charge of a

--- Question stem does not contain a question
cleaned_string:  Chemical
cleaned_string:  compounds
cleaned_string:  reactions
49
Question: What is the number of protons in the nucleus is called?
 the atomic number (Z)
the atomic number is (A)
the atomic number is (N)
the atomic number is (C)
Correct option: the atomic number (Z)
Quality: None
repeating_nouns_synonyms:  ['number']
repeating_nouns_synonyms:  ['number']
repeating_nouns_synonyms:  ['number']
cleaned_string:  the atomic number is 
cleaned_string:  the atomic number is N
cleaned_string:  the atomic number is 
50
ambiguous_unclear_information :  2
implausible_distractors :  6
none_of_the_above :  8
longest_answer_correct :  5
gratuitous_information_in_stem :  0
true_or_false :  3
avoid_convergence_cues :  12
avoid_logical_cues :  2
all_of_the_above :  0
fill_in_the_blank :  2
absolute_terms :  6
word_repeats_in_stem_and_correct_answer :  0
unfocused_stem :  2
complex_k_type :  0
grammatical_cues_in_stem :  3
lost_sequence :  

## OLI CollabU - 50 Questions

In [324]:
# qs = pd.read_csv('oli_collabu_questions.csv')
# qs = qs.fillna('')

# oli_collabu_student_questions = []
# for index, row in qs.iterrows():
#     question = MultipleChoiceQuestion(
#         stem=row['question'],
#         options=[row['altA'], row['altB'], row['altC'], row['altD']],
#         correct_option= row['altA']
#     )
#     oli_collabu_student_questions.append(question)

# rows = []
# i = 0
# for q in oli_collabu_student_questions:
#     r = [ambiguous_unclear_information(q),
#         implausible_distractors(q),
#         none_of_the_above(q),
#         longest_answer_correct(q),
#         gratuitous_information_in_stem(q),
#         true_or_false(q),
#         avoid_convergence_cues(q),
#         avoid_logical_cues(q),
#         all_of_the_above(q),
#         fill_in_the_blank(q),
#         absolute_terms(q),
#         word_repeats_in_stem_and_correct_answer(q),
#         unfocused_stem(q),
#         complex_k_type(q),
#         grammatical_cues_in_stem(q),
#         lost_sequence(q),
#         vague_terms(q),
#         more_than_one_correct(q),
#         negative_worded_stem(q)]
#     rows.append(r)
#     i += 1
#     print(i)

# columns = [
#     'ambiguous_unclear_information',
#     'implausible_distractors',
#     'none_of_the_above',
#     'longest_answer_correct',
#     'gratuitous_information_in_stem',
#     'true_or_false',
#     'avoid_convergence_cues',
#     'avoid_logical_cues',
#     'all_of_the_above',
#     'fill_in_the_blank',
#     'absolute_terms',
#     'word_repeats_in_stem_and_correct_answer',
#     'unfocused_stem',
#     'complex_k_type',
#     'grammatical_cues_in_stem',
#     'lost_sequence',
#     'vague_terms',
#     'more_than_one_correct',
#     'negative_worded_stem'
# ]

# oli_collabu_results = pd.DataFrame(rows, columns=columns)
# oli_collabu_results.to_csv("collabu_results_auto_iwf.csv")
# #Print the number of "Falses", which correspond to IWF, found the table across each criteria
# for c in columns:
#     print(c, ': ', (~oli_collabu_results[c]).values.sum())

## OLI Stats - 50 Questions

In [325]:
# qs = pd.read_csv('oli_stats_questions.csv')
# qs = qs.fillna('')

# oli_stats_student_questions = []
# for index, row in qs.iterrows():
#     question = MultipleChoiceQuestion(
#         stem=row['question'],
#         options=[row['altA'], row['altB'], row['altC'], row['altD']],
#         correct_option= row['altA']
#     )
#     oli_stats_student_questions.append(question)

# rows = []
# i = 0
# for q in oli_stats_student_questions:
#     r = [ambiguous_unclear_information(q),
#         implausible_distractors(q),
#         none_of_the_above(q),
#         longest_answer_correct(q),
#         gratuitous_information_in_stem(q),
#         true_or_false(q),
#         avoid_convergence_cues(q),
#         avoid_logical_cues(q),
#         all_of_the_above(q),
#         fill_in_the_blank(q),
#         absolute_terms(q),
#         word_repeats_in_stem_and_correct_answer(q),
#         unfocused_stem(q),
#         complex_k_type(q),
#         grammatical_cues_in_stem(q),
#         lost_sequence(q),
#         vague_terms(q),
#         more_than_one_correct(q),
#         negative_worded_stem(q)]
#     rows.append(r)
#     i += 1
#     print(i)

# columns = [
#     'ambiguous_unclear_information',
#     'implausible_distractors',
#     'none_of_the_above',
#     'longest_answer_correct',
#     'gratuitous_information_in_stem',
#     'true_or_false',
#     'avoid_convergence_cues',
#     'avoid_logical_cues',
#     'all_of_the_above',
#     'fill_in_the_blank',
#     'absolute_terms',
#     'word_repeats_in_stem_and_correct_answer',
#     'unfocused_stem',
#     'complex_k_type',
#     'grammatical_cues_in_stem',
#     'lost_sequence',
#     'vague_terms',
#     'more_than_one_correct',
#     'negative_worded_stem'
# ]

# oli_stats_results = pd.DataFrame(rows, columns=columns)
# oli_stats_results.to_csv("stats_results_auto_iwf.csv")
# #Print the number of "Falses", which correspond to IWF, found the table across each criteria
# for c in columns:
#     print(c, ': ', (~oli_stats_results[c]).values.sum())