# **Imports &#8595;**

In [1]:
#!pip install einops --no-index --find-links=file:///kaggle/input/packages/einops/ 
#!pip install transformers --no-index --find-links=file:///kaggle/input/packages/transformers/ 
#!pip install sentence-transformers --no-index --find-links=file:///kaggle/input/packages/sentence-transformers/ 

!pip install einops
!pip install transformers==4.42.4
!pip install -U sentence-transformers
!jupyter nbextension disable --py widgetsnbextension

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0
Collecting transformers==4.42.4
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.42.4)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import heapq

In [5]:
import numpy as np
import pandas as pd 
import os
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# **Load Dataset &#8595;**

In [6]:
DATA_PATH = "datasets/eedi-mining-misconceptions-in-mathematics"
EXTERNAL_DATA_PATH = "datasets/eedi-external-dataset"

train_df = pd.read_csv(f'{EXTERNAL_DATA_PATH}/all_train.csv', index_col='QuestionId') #this contains the original dataset + an external dataset generated by a LLM
test_df = pd.read_csv(f'{DATA_PATH}/test.csv')
misconceptions_df = pd.read_csv(f'{DATA_PATH}/misconception_mapping.csv')

pd.options.display.max_colwidth = 300
display(train_df.head(5))
pd.options.display.max_colwidth = 50

Unnamed: 0_level_0,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId,source,MisconceptionAName,MisconceptionBName,MisconceptionCName,MisconceptionDName,OriginalQuestionId
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,856.0,Use the order of operations to carry out calculations involving powers,33.0,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,,,,1672.0,original,,,,"Confuses the order of operations, believes addition comes before multiplication",
1,1612.0,Simplify an algebraic fraction by factorising the numerator,1077.0,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \)",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,,original,"Does not know that to factorise a quadratic expression, to find two numbers that add to give the coefficient of the x term, and multiply to give the non variable term\n","Thinks that when you cancel identical terms from the numerator and denominator, they just disappear","Does not know that to factorise a quadratic expression, to find two numbers that add to give the coefficient of the x term, and multiply to give the non variable term\n",,
2,2774.0,Calculate the range from a list of data,339.0,Range and Interquartile Range from a List of Data,B,"Tom and Katie are discussing the \( 5 \) plants with these heights:\n\( 24 \mathrm{~cm}, 17 \mathrm{~cm}, 42 \mathrm{~cm}, 26 \mathrm{~cm}, 13 \mathrm{~cm} \)\nTom says if all the plants were cut in half, the range wouldn't change.\nKatie says if all the plants grew by \( 3 \mathrm{~cm} \) each,...",Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0,original,Believes if you changed all values by the same proportion the range would not change,,Believes if you changed all values by the same proportion the range would not change,Believes if you add the same value to all numbers in the dataset the range will change,
3,2377.0,Recall and use the intersecting diagonals properties of a rectangle,88.0,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with different length sides can never be... ![A rectangle with the diagonals drawn in. The angle on the right hand side at the centre is highlighted in red and the angle at the bottom at the centre is highlighted in yellow.](),acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0,original,Does not know the properties of a rectangle,Does not know the properties of a rectangle,,Does not know the properties of a rectangle,
4,3387.0,Substitute positive integer values into formulae involving powers or roots,67.0,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find values in the table below. What is the value covered by the star? \begin{tabular}{|c|c|c|c|c|}\n\hline\( r \) & \( 1 \) & \( 2 \) & \( 3 \) & \( 4 \) \\\n\hline\( f \) & \( 6 \) & \( 15 \) & \( \color{gold}\bigstar \) & \\\n\hline\n\end{tabular},\( 30 \),\( 27 \),\( 51 \),\( 24 \),,,,1818.0,original,,,,Thinks you can find missing values in a given table by treating the row as linear and adding on the difference between the first two values given.,


# **Data Preprocessing &#8595;**

In [7]:
def clean(example, columns):
    """
    Cleans the example from the Dataset
    Args:
        example: an example from the Dataset
        columns: columns that will be cleaned

    Returns: update example containing 'clean' columns

    """
    for col in columns:
        text = example[f'{col}']

        # Empty text
        if type(text) not in (str, np.str_) or text=='':
            example[f'clean_{col}'] = ''
            return example

        # 'text' from the example can be of type numpy.str_, let's convert it to a python str
        text = str(text).lower()

        # Clean the text
        text = re.sub("\"", " ", text) # removes the " from certain texts
        text = re.sub("\n", " ", text) # removes the multiple "\n" 
        text = re.sub(r"(\\\w+)(\W)", r" \1 \2", text) # matches with the LaTeX commands like "\hline{}",... and transforms them to " \hline {}"
        text = re.sub(r"([\(|\{|\[|\|])", r" \1", text) # matches every opening parenthesis types and puts spaces on their left
        text = re.sub(r"([\)|\}|\]])", r"\1 ", text) # matches every closing parenthesis types and puts spaces on their right
        text = re.sub(r"\\(?![a-zA-Z])", " ", text) # removes every backslash that is not the start of a LaTeX command
        text = re.sub(r"\( | \)", "", text) # removes the parentheses that appear sometimes from nowhere 
        text = re.sub(r"\[ | \]", "", text) # removes the parentheses that appear sometimes from nowhere
        
        text = re.sub(r" +", " ", text) # cleans the double spaces made by above substitutions
        # Update the example with the cleaned text
        example[f'clean_{col}'] = text.strip()
    return example

# testing_data = {
#     'QuestionText': ["This is a question with a newline\nin the middle"],
#     'AnswerAText': ["Answer A\nwith newline and \\table[test]"],
#     'AnswerBText': ["Answer B\nwith newline and \hline(uwo)"],
#     'AnswerCText': ["Answer C\nwith newline and \color{gold}"],
#     'AnswerDText': ["Answer D\nwith newline and \\begin{tabular}"]
# }
# df = pd.DataFrame(testing_data)
# df = df.apply(clean, axis = 1, columns = columns_to_clean)
# display(df.head(1))

columns_to_clean = ['QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']
train_df = train_df.apply(clean, axis = 1, columns = columns_to_clean)

# Adjust column order
new_order = ['ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer']
for col in columns_to_clean:
    new_order.append(col)
    new_order.append(f'clean_{col}')
#new_order.extend(['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId', 'source', 'MisconceptionAName', 'MisconceptionBName', 'MisconceptionCName', 'MisconceptionDName', 'OriginalQuestionId'])
train_df = train_df[new_order]


display_train_df = train_df[['QuestionText', 'clean_QuestionText','AnswerAText', 'clean_AnswerAText', 'AnswerBText', 'clean_AnswerBText', 'AnswerCText', 'clean_AnswerCText', 'AnswerDText', 'clean_AnswerDText']]
pd.options.display.max_colwidth = 300
display(display_train_df.head(1))
pd.options.display.max_colwidth = 50

Unnamed: 0_level_0,QuestionText,clean_QuestionText,AnswerAText,clean_AnswerAText,AnswerBText,clean_AnswerBText,AnswerCText,clean_AnswerCText,AnswerDText,clean_AnswerDText
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,\[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,3 \times 2+4-5 where do the brackets need to go to make the answer equal 13 ?,\( 3 \times(2+4)-5 \),3 \times (2+4) -5,\( 3 \times 2+(4-5) \),3 \times 2+ (4-5),\( 3 \times(2+4-5) \),3 \times (2+4-5),Does not need brackets,does not need brackets


# **Pre-trained LLM Classifier &#8595;**

In [5]:
task_name_to_instruct = {"example": "Given a math question and a distractor incorrect answer, please retrieve the most accurate reason for the misconception.",}
query_prefix = "Instruct: "+task_name_to_instruct["example"]+"\nQuery: "
passage_prefix = ""

In [6]:
def create_query(example):
    """
    Creates a query from the features of the example
    Args:
        example: an example from the Dataset

    Returns: update example containing 'queryA/B/C/D' columns

    """
    # ConstructName, SubjectName, CorrectAnswer, QuestionText, AnswerText, source
    for answer in ['A', 'B', 'C', 'D']:
        if example['CorrectAnswer'] == answer:
            continue
        query = f"##Construct Name##: {example['ConstructName']}, ##Subject Name##: {example['SubjectName']}, ##Question Text##: {example['clean_QuestionText']}, ##Answer Text##: {example[f'clean_Answer{answer}Text']}"
        example[f'Query{answer}'] = query
    return example

#testing_data = {
#    'ConstructName': ["Use the order of operations to carry out calculations involving powers"],
#    'SubjectName': ["BIDMAS"],
#    'CorrectAnswer': ["A"],
#    'clean_QuestionText': ["3 \times 2+4-5 where do the brackets need to go to make the answer equal 13 ?"],
#    'clean_AnswerAText': ["3 \times (2+4) -5"],
#    'clean_AnswerBText': ["3 \times 2+ (4-5)"],
#    'clean_AnswerCText': ["3 \times (2+4-5)"],
#    'clean_AnswerDText': ["does not need brackets"],
#    'source': ["original"]
#}
#df = pd.DataFrame(testing_data)
#df = df.apply(create_query, axis = 1)

#pd.options.display.max_colwidth = 300
#display(df.head(1))
#pd.options.display.max_colwidth = 50
train_df = train_df.apply(create_query, axis = 1)
display(train_df[['QueryA', 'QueryB', 'QueryC', 'QueryD']].head(1))

Unnamed: 0_level_0,QueryA,QueryB,QueryC,QueryD
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,##Construct Name##: Use the order of operation...,##Construct Name##: Use the order of operation...,##Construct Name##: Use the order of operation...


In [8]:
#model = SentenceTransformer('/kaggle/input/Bert-MLM_arXiv-MP-class_zbMath')
model = SentenceTransformer('math-similarity/Bert-MLM_arXiv-MP-class_zbMath')


def encode_misconceptions(example): 
    if pd.isna(example['MisconceptionName']):
        example['Embeddings'] = np.nan
    else:
        example['Embeddings'] = model.encode([example['MisconceptionName']], show_progress_bar=False)
    return example

misconceptions_df = misconceptions_df.apply(encode_misconceptions, axis=1)
embeddings = misconceptions_df['Embeddings'].values.tolist()
display(misconceptions_df.head(5))

NameError: name 'SentenceTransformer' is not defined

In [8]:
def encode_query(example):
    for column in ['QueryA', 'QueryB', 'QueryC', 'QueryD']: 
        if pd.isna(example[column]):
            example[f'Embed{column}'] = np.nan
        else:
            example[f'Embed{column}'] = model.encode([example[column]], show_progress_bar=False)
    return example

train_df = train_df.apply(encode_query, axis = 1)
display(train_df[['EmbedQueryA', 'EmbedQueryB', 'EmbedQueryC', 'EmbedQueryD']].head(1))

Unnamed: 0_level_0,EmbedQueryA,EmbedQueryB,EmbedQueryC,EmbedQueryD
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,"[[0.50364035, 0.8572104, -0.42123145, 0.39019,...","[[0.519818, 0.79976207, -0.45448592, 0.3832621...","[[0.55353516, 0.86182225, -0.44465786, 0.27094..."


In [9]:
def create_predictions(example):
    for answer in ['A', 'B', 'C', 'D']:
        if answer == example['CorrectAnswer']:
            example[f'top25_{answer}'] = np.nan
        else:                               
            top_25 = []
            for i, misconception in enumerate(misconceptions_df['Embeddings']):
                misconception = misconception.reshape(1, -1)
                query_embedding = example[f'EmbedQuery{answer}'].flatten()
                similarity = misconception @ query_embedding
                misconception_id = misconceptions_df.iloc[i]['MisconceptionId']
                if len(top_25) < 25:
                    #print(misconception.shape)
                    #print(example[f'EmbedQuery{answer}'].shape)
                    heapq.heappush(top_25, (similarity.item(), misconception_id))
                else:
                    heapq.heappushpop(top_25, (similarity.item(), misconception_id))
            top_25_misconceptions = [misconception_id for _, misconception_id in sorted(top_25, reverse=True)]
            example[f'top25_{answer}'] = top_25_misconceptions
    return example

train_df = train_df.apply(create_predictions, axis = 1)
display(train_df[['top25_A', 'top25_B', 'top25_C', 'top25_D']].head(5))

Unnamed: 0_level_0,top25_A,top25_B,top25_C,top25_D
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,"[1963, 1593, 791, 1825, 2098, 466, 1651, 59, 1...","[1963, 1593, 791, 1825, 77, 59, 1389, 466, 154...","[1963, 77, 1825, 1593, 791, 2270, 1026, 1571, ..."
1,"[363, 2398, 2484, 59, 1593, 80, 1540, 606, 182...","[363, 2398, 2484, 59, 1593, 80, 1540, 606, 182...","[363, 2398, 59, 2484, 1593, 80, 1540, 606, 182...",
2,"[2205, 1328, 461, 1016, 709, 1744, 2257, 675, ...",,"[2205, 1328, 461, 1016, 709, 1744, 675, 2455, ...","[2205, 1328, 461, 1016, 709, 1744, 675, 2455, ..."
3,"[955, 22, 382, 1016, 1298, 1947, 1553, 1170, 2...","[955, 22, 382, 1553, 1170, 1298, 1317, 496, 18...",,"[22, 955, 382, 1553, 1170, 1845, 496, 1317, 12..."
4,,"[847, 2242, 1341, 1270, 2368, 2308, 760, 1622,...","[847, 1341, 2242, 1270, 2368, 2308, 1622, 760,...","[847, 1341, 2242, 1270, 2368, 2308, 1622, 760,..."


# **Output &#8595;** 

In [10]:
train_df = train_df.reset_index()
display(train_df.columns)
display(train_df.head(3))

Index(['QuestionId', 'AnswerAText', 'AnswerBText', 'AnswerCText',
       'AnswerDText', 'ConstructId', 'ConstructName', 'CorrectAnswer',
       'QueryA', 'QueryB', 'QueryC', 'QueryD', 'QuestionText', 'SubjectId',
       'SubjectName', 'clean_AnswerAText', 'clean_AnswerBText',
       'clean_AnswerCText', 'clean_AnswerDText', 'clean_QuestionText',
       'EmbedQueryA', 'EmbedQueryB', 'EmbedQueryC', 'EmbedQueryD', 'top25_A',
       'top25_B', 'top25_C', 'top25_D'],
      dtype='object')

Unnamed: 0,QuestionId,AnswerAText,AnswerBText,AnswerCText,AnswerDText,ConstructId,ConstructName,CorrectAnswer,QueryA,QueryB,...,clean_AnswerDText,clean_QuestionText,EmbedQueryA,EmbedQueryB,EmbedQueryC,EmbedQueryD,top25_A,top25_B,top25_C,top25_D
0,0,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,856.0,Use the order of operations to carry out calcu...,A,,##Construct Name##: Use the order of operation...,...,does not need brackets,3 \times 2+4-5 where do the brackets need to g...,,"[[0.50364035, 0.8572104, -0.42123145, 0.39019,...","[[0.519818, 0.79976207, -0.45448592, 0.3832621...","[[0.55353516, 0.86182225, -0.44465786, 0.27094...",,"[1963, 1593, 791, 1825, 2098, 466, 1651, 59, 1...","[1963, 1593, 791, 1825, 77, 59, 1389, 466, 154...","[1963, 77, 1825, 1593, 791, 2270, 1026, 1571, ..."
1,1,\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,1612.0,Simplify an algebraic fraction by factorising ...,D,##Construct Name##: Simplify an algebraic frac...,##Construct Name##: Simplify an algebraic frac...,...,does not simplify,"simplify the following, if possible: \frac {m^...","[[0.7318566, 0.965311, -0.67278385, 0.3614066,...","[[0.74010336, 0.96261096, -0.67143726, 0.36470...","[[0.7320576, 0.96087164, -0.67774916, 0.370653...",,"[363, 2398, 2484, 59, 1593, 80, 1540, 606, 182...","[363, 2398, 2484, 59, 1593, 80, 1540, 606, 182...","[363, 2398, 59, 2484, 1593, 80, 1540, 606, 182...",
2,2,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,2774.0,Calculate the range from a list of data,B,##Construct Name##: Calculate the range from a...,,...,neither is correct,tom and katie are discussing the 5 plants with...,"[[-0.23089972, -0.27076513, -0.46715936, 0.192...",,"[[-0.21322936, -0.2692874, -0.4728039, 0.18241...","[[-0.22471593, -0.28712168, -0.45850548, 0.193...","[2205, 1328, 461, 1016, 709, 1744, 2257, 675, ...",,"[2205, 1328, 461, 1016, 709, 1744, 675, 2455, ...","[2205, 1328, 461, 1016, 709, 1744, 675, 2455, ..."


In [11]:
def create_output(example):
    for answer in ['A', 'B', 'C', 'D']:
        if example[f'top25_{answer}'] is np.nan:
            continue
        else: 
            misconceptions_str = " ".join(map(str, example[f'top25_{answer}'])) if isinstance(example[f'top25_{answer}'], list) else example[f'top25_{answer}']
            output_df.loc[len(output_df)] = [f"{example['QuestionId']}_{answer}", misconceptions_str]

output_df = pd.DataFrame(columns=['QuestionId', 'MisconceptionId'])
train_df.apply(create_output, axis = 1)
display(output_df.head(10))

Unnamed: 0,QuestionId,MisconceptionId
0,0_B,1963 1593 791 1825 2098 466 1651 59 1540 567 7...
1,0_C,1963 1593 791 1825 77 59 1389 466 1540 567 980...
2,0_D,1963 77 1825 1593 791 2270 1026 1571 606 2398 ...
3,1_A,363 2398 2484 59 1593 80 1540 606 1825 1963 88...
4,1_B,363 2398 2484 59 1593 80 1540 606 1825 1963 88...
5,1_C,363 2398 59 2484 1593 80 1540 606 1825 1963 88...
6,2_A,2205 1328 461 1016 709 1744 2257 675 1372 2455...
7,2_C,2205 1328 461 1016 709 1744 675 2455 1372 1684...
8,2_D,2205 1328 461 1016 709 1744 675 2455 1684 1372...
9,3_A,955 22 382 1016 1298 1947 1553 1170 2565 668 1...


In [12]:
output_df.to_csv('submission.csv', index=False, header=['QuestionId_Answer', 'MisconceptionId']) 

In [13]:
output_df

Unnamed: 0,QuestionId,MisconceptionId
0,0_B,1963 1593 791 1825 2098 466 1651 59 1540 567 7...
1,0_C,1963 1593 791 1825 77 59 1389 466 1540 567 980...
2,0_D,1963 77 1825 1593 791 2270 1026 1571 606 2398 ...
3,1_A,363 2398 2484 59 1593 80 1540 606 1825 1963 88...
4,1_B,363 2398 2484 59 1593 80 1540 606 1825 1963 88...
...,...,...
11210,eedi_1833_B,77 1963 2270 2285 1026 931 126 575 656 940 184...
11211,eedi_1833_C,77 1963 2270 2285 1026 931 126 575 656 940 462...
11212,eedi_674_A,1760 1170 1468 1298 1632 1372 358 663 1592 141...
11213,eedi_674_B,1760 1170 1468 1298 1632 358 1372 663 1410 190...
