In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

## Data Load

In [28]:
import os
os.getcwd()

'c:\\Users\\arche\\OneDrive\\桌面\\kaggle\\eedi'

In [8]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

misconception_mapping = pd.read_csv("misconception_mapping.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [9]:
# Preview the datasets
print(train.head())
print(test.head())
print(misconception_mapping.head())
print(sample_submission.head())

   QuestionId  ConstructId                                      ConstructName  \
0           0          856  Use the order of operations to carry out calcu...   
1           1         1612  Simplify an algebraic fraction by factorising ...   
2           2         2774            Calculate the range from a list of data   
3           3         2377  Recall and use the intersecting diagonals prop...   
4           4         3387  Substitute positive integer values into formul...   

   SubjectId                                        SubjectName CorrectAnswer  \
0         33                                             BIDMAS             A   
1       1077                    Simplifying Algebraic Fractions             D   
2        339  Range and Interquartile Range from a List of Data             B   
3         88                       Properties of Quadrilaterals             C   
4         67                          Substitution into Formula             A   

                          

## Preprocess

In [10]:
def to_text(df: pd.DataFrame) -> pd.DataFrame:
    df["all_question_text"] = df["ConstructName"] +" " +df["QuestionText"]
    return df

test = to_text(test)

In [11]:
print(test.shape)
print(test.columns)

(3, 12)
Index(['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId',
       'SubjectName', 'CorrectAnswer', 'QuestionText', 'AnswerAText',
       'AnswerBText', 'AnswerCText', 'AnswerDText', 'all_question_text'],
      dtype='object')


In [12]:
def to_long(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.melt(
        df[
            [
                "QuestionId",
                "all_question_text",
                "CorrectAnswer",
                "AnswerAText",
                "AnswerBText",
                "AnswerCText",
                "AnswerDText"
            ]
        ],
        id_vars    = ["QuestionId", "all_question_text", "CorrectAnswer"],
        var_name   = 'Answer',
        value_name = 'value'
    )

    return df

test_long = to_long(test)
test_long

Unnamed: 0,QuestionId,all_question_text,CorrectAnswer,Answer,value
0,1869,Use the order of operations to carry out calcu...,A,AnswerAText,\( 3 \times(2+4)-5 \)
1,1870,Simplify an algebraic fraction by factorising ...,D,AnswerAText,\( m+1 \)
2,1871,Calculate the range from a list of data Tom an...,B,AnswerAText,Only\nTom
3,1869,Use the order of operations to carry out calcu...,A,AnswerBText,\( 3 \times 2+(4-5) \)
4,1870,Simplify an algebraic fraction by factorising ...,D,AnswerBText,\( m+2 \)
5,1871,Calculate the range from a list of data Tom an...,B,AnswerBText,Only\nKatie
6,1869,Use the order of operations to carry out calcu...,A,AnswerCText,\( 3 \times(2+4-5) \)
7,1870,Simplify an algebraic fraction by factorising ...,D,AnswerCText,\( m-1 \)
8,1871,Calculate the range from a list of data Tom an...,B,AnswerCText,Both Tom and Katie
9,1869,Use the order of operations to carry out calcu...,A,AnswerDText,Does not need brackets


In [13]:
print(test_long.shape)
print(test_long.columns)

(12, 5)
Index(['QuestionId', 'all_question_text', 'CorrectAnswer', 'Answer', 'value'], dtype='object')


In [14]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    df["all_text"] = df["all_question_text"] +" " +df["value"]
    return df

test_long = make_all_text(test_long)
test_long

Unnamed: 0,QuestionId,all_question_text,CorrectAnswer,Answer,value,all_text
0,1869,Use the order of operations to carry out calcu...,A,AnswerAText,\( 3 \times(2+4)-5 \),Use the order of operations to carry out calcu...
1,1870,Simplify an algebraic fraction by factorising ...,D,AnswerAText,\( m+1 \),Simplify an algebraic fraction by factorising ...
2,1871,Calculate the range from a list of data Tom an...,B,AnswerAText,Only\nTom,Calculate the range from a list of data Tom an...
3,1869,Use the order of operations to carry out calcu...,A,AnswerBText,\( 3 \times 2+(4-5) \),Use the order of operations to carry out calcu...
4,1870,Simplify an algebraic fraction by factorising ...,D,AnswerBText,\( m+2 \),Simplify an algebraic fraction by factorising ...
5,1871,Calculate the range from a list of data Tom an...,B,AnswerBText,Only\nKatie,Calculate the range from a list of data Tom an...
6,1869,Use the order of operations to carry out calcu...,A,AnswerCText,\( 3 \times(2+4-5) \),Use the order of operations to carry out calcu...
7,1870,Simplify an algebraic fraction by factorising ...,D,AnswerCText,\( m-1 \),Simplify an algebraic fraction by factorising ...
8,1871,Calculate the range from a list of data Tom an...,B,AnswerCText,Both Tom and Katie,Calculate the range from a list of data Tom an...
9,1869,Use the order of operations to carry out calcu...,A,AnswerDText,Does not need brackets,Use the order of operations to carry out calcu...


In [15]:
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)
test_long

Unnamed: 0,QuestionId,all_question_text,CorrectAnswer,Answer,value,all_text
0,1869,Use the order of operations to carry out calcu...,A,AnswerAText,\( 3 \times(2+4)-5 \),Use the order of operations to carry out calcu...
1,1869,Use the order of operations to carry out calcu...,A,AnswerBText,\( 3 \times 2+(4-5) \),Use the order of operations to carry out calcu...
2,1869,Use the order of operations to carry out calcu...,A,AnswerCText,\( 3 \times(2+4-5) \),Use the order of operations to carry out calcu...
3,1869,Use the order of operations to carry out calcu...,A,AnswerDText,Does not need brackets,Use the order of operations to carry out calcu...
4,1870,Simplify an algebraic fraction by factorising ...,D,AnswerAText,\( m+1 \),Simplify an algebraic fraction by factorising ...
5,1870,Simplify an algebraic fraction by factorising ...,D,AnswerBText,\( m+2 \),Simplify an algebraic fraction by factorising ...
6,1870,Simplify an algebraic fraction by factorising ...,D,AnswerCText,\( m-1 \),Simplify an algebraic fraction by factorising ...
7,1870,Simplify an algebraic fraction by factorising ...,D,AnswerDText,Does not simplify,Simplify an algebraic fraction by factorising ...
8,1871,Calculate the range from a list of data Tom an...,B,AnswerAText,Only\nTom,Calculate the range from a list of data Tom an...
9,1871,Calculate the range from a list of data Tom an...,B,AnswerBText,Only\nKatie,Calculate the range from a list of data Tom an...


In [18]:
labels = misconception_mapping['MisconceptionName'].values
print(labels[0:10])

['Does not know that angles in a triangle sum to 180 degrees'
 'Uses dividing fractions method for multiplying fractions'
 'Believes there are 100 degrees in a full turn'
 'Thinks a quadratic without a non variable term, can not be factorised'
 'Believes addition of terms and powers of terms are equivalent e.g. a + c = a^c'
 'When measuring a reflex angle, gives the acute or obtuse angle that sums to 360 instead'
 'Can identify the multiplier used to form an equivalent fraction but does not apply to the numerator'
 'Believes gradient = change in y'
 'Student thinks that any two angles along a straight line are equal'
 'Thinks there are 180 degrees in a full turn']


## Loading the model and tokenizer for embedding generation

In [22]:
import kagglehub

# Download latest version
path = kagglehub.model_download("happenmass/bge-small-en-v1.5/transformers/bge")

print("Path to model files:", path)

Downloading 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/README.md...




Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/1_Pooling/config.json...



100%|██████████| 196/196 [00:00<00:00, 196kB/s]
Downloading 14 files:   7%|▋         | 1/14 [00:01<00:14,  1.08s/it]

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/config.json...



[A

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/.gitattributes...


100%|██████████| 774/774 [00:00<00:00, 387kB/s]

100%|██████████| 1.52k/1.52k [00:00<00:00, 1.55MB/s]
100%|██████████| 95.6k/95.6k [00:00<00:00, 1.78MB/s]

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/config_sentence_transformers.json...





Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/modules.json...



100%|██████████| 130/130 [00:00<00:00, 65.0kB/s]
100%|██████████| 368/368 [00:00<00:00, 184kB/s]

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/onnx/model.onnx...





Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/model.safetensors...



[A
[A
[A
[A

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/tokenizer_config.json...




100%|██████████| 381/381 [00:00<00:00, 381kB/s]


Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/tokenizer.json...




[A[A

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/vocab.txt...





[A[A[A

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/pytorch_model.bin...






[A[A[A[A


[A[A[A

100%|██████████| 256k/256k [00:00<00:00, 2.22MB/s]
100%|██████████| 725k/725k [00:00<00:00, 4.50MB/s]




[A[A[A[A



[A[A[A[A

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/sentence_bert_config.json...




100%|██████████| 55.0/55.0 [00:00<?, ?B/s]




[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A

Downloading from https://www.kaggle.com/api/v1/models/happenmass/bge-small-en-v1.5/transformers/bge/2/download/special_tokens_map.json...




100%|██████████| 132/132 [00:00<00:00, 132kB/s]




[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



100%|██████████| 127M/127M [00:04<00:00, 27.1MB/s]

[A
[A
[A
[A
100%|██████████| 127M/127M [00:07<00:00, 17.2MB/s]
Downloading 14 files:  14%|█▍        | 2/14 [00:08<01:00,  5.07s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 127M/127M [00:10<00:00, 12.4MB/s]
Downloading 14 files: 100%|██████████| 14/14 [00:12<00:00,  1.16it/s]

Path to model files: C:\Users\arche\.cache\kagglehub\models\happenmass\bge-small-en-v1.5\transformers\bge\2





In [29]:
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained('happenmass/bge-small-en-v1.5/transformers/bge/2')
model = AutoModel.from_pretrained('happenmass/bge-small-en-v1.5/transformers/bge/2')

model.eval()
model.to(device)
print("finish")

finish


In [30]:
from tqdm import tqdm
MisconceptionName = list(misconception_mapping['MisconceptionName'].values)
per_gpu_batch_size = 8


def prepare_inputs(text, tokenizer, device):
    tokenizer_outputs = tokenizer.batch_encode_plus(
        text,
        padding        = True,
        return_tensors = 'pt',
        max_length     = 1024,
        truncation     = True
    )
    result = {
        'input_ids': tokenizer_outputs.input_ids.to(device),
        'attention_mask': tokenizer_outputs.attention_mask.to(device),
    }
    return result


all_ctx_vector = []
for mini_batch in tqdm(range(0, len(MisconceptionName[:]), per_gpu_batch_size)):
    mini_context          = MisconceptionName[mini_batch:mini_batch+ per_gpu_batch_size]
    encoded_input         = prepare_inputs(mini_context,tokenizer,device)
    sentence_embeddings   = model(**encoded_input)[0][:, 0]
    sentence_embeddings   = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    all_ctx_vector.append(sentence_embeddings.detach().cpu().numpy())

all_ctx_vector = np.concatenate(all_ctx_vector, axis=0)
print("Sentence embeddings:", sentence_embeddings.shape)

100%|██████████| 324/324 [00:06<00:00, 51.38it/s]

Sentence embeddings: torch.Size([3, 384])





In [31]:
test_texts = list(test_long.all_text.values)
all_text_vector = []
per_gpu_batch_size = 8

for mini_batch in tqdm(
        range(0, len(test_texts[:]), per_gpu_batch_size)):
    mini_context = test_texts[mini_batch:mini_batch
                                           + per_gpu_batch_size]
    encoded_input = prepare_inputs(mini_context,tokenizer,device)
    sentence_embeddings = model(
        **encoded_input)[0][:, 0]
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    
    all_text_vector.append(sentence_embeddings.detach().cpu().numpy())

all_text_vector = np.concatenate(all_text_vector, axis=0)
print(all_text_vector.shape)

100%|██████████| 2/2 [00:00<00:00, 16.11it/s]

(12, 384)





## Predict

In [32]:
test_cos_sim_arr = cosine_similarity(all_text_vector, all_ctx_vector)
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

In [33]:
test_sorted_indices[:, :25]

array([[1672, 1941, 2488, 1316, 1119, 2131, 2010, 2586, 1901, 1872, 2202,
        2518, 1999,  346, 2221,  256,  871, 1929,   14,   74,  466,  659,
         340, 1163, 1085],
       [1672, 1941, 1316, 1119, 2488, 2010, 2131, 2586, 1872, 1901, 2202,
        2518,  346, 1999,   14, 1929,  871, 2221,  256,   74, 1163,  659,
         340, 1085, 1971],
       [1672, 1941, 1316, 1119, 2488, 2131, 2010, 2586, 1872, 1901, 2202,
        2518,  346, 1999,  256, 2221,   14,  871, 1929, 1163,   74,  659,
         340, 1085,  466],
       [2488, 1941, 1672, 2131, 1316, 1119, 2586, 1872,  256,  871, 2010,
          74, 2202, 2221, 1999, 2518, 1901,  466,  373, 2532, 1929,  969,
        1163,  659,  315],
       [1540, 2398, 1593,  885,  363, 2078, 2307,  979,  606,   59,  848,
        1548,   80, 1825,  633, 1131, 1469,   29,  891, 1812, 1916,  317,
        1358, 1218, 1280],
       [1540, 2398, 1593,  885,  363, 2078, 2307,  979,  606,   59,  848,
        1548,   80, 1825,  633, 1131, 1812, 1469,  

## Make Submit File

In [34]:
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$')
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]
test_long["MisconceptionId"] = test_sorted_indices[:, :25].tolist()
test_long["MisconceptionId"] = test_long["MisconceptionId"].apply(lambda x: ' '.join(map(str, x)))
# filter correct row
test_long = test_long[test_long["CorrectAnswer"] != test_long["Answer_alphabet"]]
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

In [35]:
submission.head(10)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,1672 1941 1316 1119 2488 2010 2131 2586 1872 1...
1,1869_C,1672 1941 1316 1119 2488 2131 2010 2586 1872 1...
2,1869_D,2488 1941 1672 2131 1316 1119 2586 1872 256 87...
3,1870_A,1540 2398 1593 885 363 2078 2307 979 606 59 84...
4,1870_B,1540 2398 1593 885 363 2078 2307 979 606 59 84...
5,1870_C,1540 2398 885 1593 363 2078 2307 979 606 59 84...
6,1871_A,1059 2151 632 397 2303 1982 1797 2319 1349 192...
7,1871_C,1059 2151 632 397 2303 1982 2319 1797 1349 192...
8,1871_D,1059 632 397 2151 2319 2303 1797 1982 1349 162...
