In [2]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
import faiss
import torch
from tqdm.auto import trange
import torch.functional as F

In [3]:
def apk(actual, predicted, k=10):
    """Average precision at k for a single query.

    Parameters
    ----------
    actual : list
        Relevant items for the query.
    predicted : list
        Ranked predictions, best first. Only the first ``k`` are scored.
    k : int, optional
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision@rank over every rank holding a new relevant item,
        divided by ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        if candidate not in actual:
            continue
        # A repeated prediction only counts the first time it appears.
        if candidate in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    normaliser = min(len(actual), k)
    return precision_sum / normaliser

def mapk(actual, predicted, k=10):
    """Mean average precision at k over a collection of queries.

    ``actual`` and ``predicted`` are iterated in parallel (one entry per
    query); each pair is scored with ``apk`` and the scores are averaged
    with ``np.mean``.
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [4]:
# Load the four competition CSVs in one pass: training questions, test
# questions, the sample submission and the misconception id -> name mapping.
train_data, test_data, simple_sub, misconcpts = (
    pd.read_csv(csv_path)
    for csv_path in (
        './eedi-mining-misconceptions-in-mathematics/train.csv',
        './eedi-mining-misconceptions-in-mathematics/test.csv',
        './eedi-mining-misconceptions-in-mathematics/sample_submission.csv',
        './eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv',
    )
)

In [5]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    ``model_output[0]`` is taken as the per-token embeddings. The attention
    mask gets a trailing axis and is expanded to the embeddings' shape, so
    positions whose mask is 0 contribute nothing to the sum over axis 1.
    The divisor (number of unmasked positions) is clamped to ``1e-9`` so a
    fully masked sequence yields zeros instead of a division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts


@torch.no_grad()
def create_embeds(texts,model,tokenizer,batch_size=64,max_length=1024):
    """Embed a list of texts in batches and return L2-normalised vectors.

    Parameters
    ----------
    texts : list of str
        Texts to embed; must not be empty.
    model : torch.nn.Module
        Encoder whose first output holds the per-token embeddings. Inputs
        are moved to whatever device the model's parameters live on.
    tokenizer : callable
        Tokenizer called with ``truncation``, ``padding``, ``max_length``
        and ``return_tensors='pt'``; must return a mapping of tensors that
        includes ``'attention_mask'``.
    batch_size : int, optional
        Number of texts per forward pass.
    max_length : int, optional
        Truncation length passed to the tokenizer.
        NOTE(review): the default of 1024 may exceed the position limit of
        the encoder in use (BERT-style models are commonly limited to 512)
        -- confirm, or pass ``max_length`` explicitly for long inputs.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per input text, in input order.

    Raises
    ------
    ValueError
        If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    if len(texts) == 0:
        raise ValueError('create_embeds() needs at least one text to embed')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Follow the model instead of hard-coding CUDA so the same code also
    # runs when the encoder sits on the CPU or on a specific GPU.
    device = next(model.parameters()).device

    features = []
    # Stepping by batch_size yields exactly ceil(len(texts) / batch_size)
    # batches; the previous `len(texts) // batch_size + 1` produced an extra
    # empty batch whenever the length was an exact multiple of batch_size.
    for start in trange(0, len(texts), batch_size):
        encs = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding='longest',
            max_length=max_length,
            return_tensors='pt'
        )
        # Move every returned tensor (input_ids, attention_mask and, when
        # present, token_type_ids) to the model's device.
        encs = {name: tensor.to(device) for name, tensor in encs.items()}
        model_output = model(**encs)
        embeds = mean_pooling(model_output, encs['attention_mask'])
        embeds = torch.nn.functional.normalize(embeds, p=2, dim=1)
        features.append(embeds.detach().cpu().numpy())
    return np.concatenate(features,dtype=np.float32)

@torch.no_grad()
def encode_sentence(text,model,tokenizer,max_length=1024):
    """Embed a single text and return its L2-normalised vector.

    Parameters
    ----------
    text : str
        Text to embed.
    model : torch.nn.Module
        Encoder whose first output holds the per-token embeddings. Inputs
        are moved to whatever device the model's parameters live on.
    tokenizer : object
        Tokenizer exposing ``encode_plus``; called with
        ``return_tensors='pt'`` and must return a mapping of tensors that
        includes ``'attention_mask'``.
    max_length : int, optional
        Truncation length passed to the tokenizer.
        NOTE(review): the default of 1024 may exceed the position limit of
        the encoder in use -- confirm, or pass ``max_length`` explicitly.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` embedding of ``text``.
    """
    encs = tokenizer.encode_plus(
        text,
        truncation=True,
        padding=False,
        max_length=max_length,
        return_tensors='pt'
    )
    # Move *every* returned tensor to the model's device. Previously only
    # input_ids and attention_mask were moved, so a tokenizer that also
    # returns token_type_ids left that tensor on the CPU and the forward
    # pass received tensors on two different devices.
    device = next(model.parameters()).device
    encs = {name: tensor.to(device) for name, tensor in encs.items()}
    model_output = model(**encs)
    embeds = mean_pooling(model_output, encs['attention_mask'])
    embeds = torch.nn.functional.normalize(embeds, p=2, dim=1).detach().cpu().numpy()
    # The tokenizer adds a batch axis of size 1; drop it.
    return np.array(embeds[0],dtype=np.float32)

In [6]:
# Tokenizer and encoder come from the same checkpoint; the encoder is then
# moved to the GPU.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')
encoder = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')
encoder = encoder.cuda()

In [7]:
# Embed every misconception name; these vectors are what the index stores.
features = create_embeds(
    texts=misconcpts['MisconceptionName'].tolist(),
    model=encoder,
    tokenizer=tokenizer,
    batch_size=64,
)

  0%|          | 0/41 [00:00<?, ?it/s]

In [8]:
# Inner product over L2-normalised vectors is cosine similarity, so the
# rows are normalised (in place) before being added to the index.
faiss.normalize_L2(features)
index = faiss.IndexFlatIP(features.shape[-1])  # use faiss.IndexFlatL2 for l2 distance
index.add(features)
# To search on the GPU instead, wrap the index with
# faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index).
index.ntotal

2587

## Validate

In [9]:
def prepare_data(df):
    """Reshape the question table to one row per (question, answer option).

    The four ``Answer?Text`` columns are unpivoted: the source column name
    goes to ``answer_id`` and its content to ``answer_value``. The question
    level columns are repeated on every row; all other columns are dropped.
    """
    question_columns = ['QuestionId','ConstructName','SubjectName','QuestionText','CorrectAnswer']
    answer_columns = ['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']
    return pd.melt(
        df,
        id_vars=question_columns,
        value_vars=answer_columns,
        var_name='answer_id',
        value_name='answer_value',
    )

def set_labels(df,df_prepared):
    """Attach the misconception id of each answer option to ``df_prepared``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide question table with ``QuestionId`` and the four
        ``Misconception?Id`` columns.
    df_prepared : pandas.DataFrame
        Long table with at least ``QuestionId`` and ``answer_id`` (values
        such as ``'AnswerAText'``). It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_prepared`` (same index labels and column order) with a
        trailing ``misconcpts_id`` column; rows containing any missing
        value -- in particular options without a labelled misconception --
        are dropped.

    Raises
    ------
    ValueError
        Raised by pandas if ``df`` holds the same ``QuestionId`` more than
        once, because the (question, option) lookup is then ambiguous.

    Notes
    -----
    Rows are matched on (``QuestionId``, option letter A-D). The earlier
    version built string keys and called ``set_index`` without keeping the
    result, so the keys were never used (and could not have matched, as
    ``'..._AnswerAText'`` never equals ``'..._MisconceptionAId'``); labels
    were attached purely by row position, and an ``all_index`` column was
    left behind on the caller's frame.
    """
    labels = df.melt(
        id_vars = ['QuestionId'],
        value_vars = ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId'],
        value_name = 'misconcpts_id',
        var_name = 'answer_id'
    )
    key_names = ['QuestionId', 'option']

    # Reduce both naming schemes to the bare option letter so they compare.
    label_option = labels['answer_id'].str.extract(r'^Misconception([A-D])Id$', expand=False)
    prepared_option = df_prepared['answer_id'].str.extract(r'^Answer([A-D])Text$', expand=False)

    # Lookup table: (QuestionId, option letter) -> misconception id.
    label_lookup = pd.Series(
        labels['misconcpts_id'].to_numpy(),
        index=pd.MultiIndex.from_arrays([labels['QuestionId'], label_option], names=key_names),
    )
    wanted_keys = pd.MultiIndex.from_arrays(
        [df_prepared['QuestionId'], prepared_option], names=key_names
    )
    # reindex returns NaN for keys without a label; dropna removes them below.
    matched_ids = label_lookup.reindex(wanted_keys).to_numpy()

    # assign() works on a copy, so the caller's frame keeps its columns.
    return df_prepared.assign(misconcpts_id=matched_ids).dropna()

def parse_request(x):
    """Build the retrieval query text for one answer-option row.

    Missing fields are replaced by empty strings. The answer text is placed
    both first (followed by two spaces) and last, around the construct name
    and the question text.
    """
    row = x.fillna('')
    answer = row.answer_value
    construct = row.ConstructName
    question = row.QuestionText
    return f"{answer}  {construct} {question} {answer}"

In [10]:
# One row per labelled (question, answer option), plus the query text that
# is embedded for retrieval.
melted_train = set_labels(train_data, prepare_data(train_data))
melted_train = melted_train.assign(request=melted_train.apply(parse_request, axis=1))

In [None]:
# Embed the query text of every labelled training row.
train_requests = create_embeds(
    texts=melted_train['request'].tolist(),
    model=encoder,
    tokenizer=tokenizer,
    batch_size=64,
    max_length=512,
)

  0%|          | 0/69 [00:00<?, ?it/s]

In [11]:
# Normalise the queries in place, then fetch the 25 nearest misconceptions
# per query: D receives the similarity scores, I the matching row positions
# in the index.
faiss.normalize_L2(train_requests)
D, I = index.search(train_requests, 25)

In [12]:
# Keep the retrieved ids and scores next to each row, and flag whether the
# true misconception is among the 25 candidates (1) or not (0).
melted_train['top25_candidates'] = I.tolist()
melted_train['top25_scores'] = D.tolist()
melted_train['id_in_candidates'] = np.array(
    [
        int(true_id) in candidates
        for true_id, candidates in zip(
            melted_train['misconcpts_id'], melted_train['top25_candidates']
        )
    ],
    dtype=np.int16,
)

In [13]:
# Share of rows whose true misconception was retrieved (recall@25).
melted_train['id_in_candidates'].mean()

0.5480549199084668

In [14]:
# MAP@25 of the retrieved candidates; every row has exactly one true
# misconception, wrapped in a single-element list.
mapk(
    actual=[[true_id] for true_id in melted_train['misconcpts_id'].astype(np.int16).tolist()],
    predicted=melted_train['top25_candidates'].tolist(),
    k=25,
)

0.20027823392615068

In [15]:
# Persist the rows together with their candidates and scores.
melted_train.to_parquet(path='candidates_bge_large.parquet')