# Learning Equality Curriculum Recommendations - Modeling

## Setup

### Imports

In [234]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.utils.data import DataLoader, Dataset, default_collate
from transformers import AutoTokenizer, AutoModel, AutoConfig
from pathlib import Path
from dataclasses import dataclass

### Definitions

In [253]:
@dataclass
class RetrieverConfig:
    """Settings shared by the data-loading, dataset, model and experiment code.

    `input_dir` may be supplied as a `str` or a `Path`; instances always expose
    it as a `Path` so it can be combined with file names via the `/` operator.
    """
    # Global
    seed: int = 13
    # Evaluated once, when the class body runs: CUDA if available at that moment, else CPU.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model architecture
    model: str = "sentence-transformers/all-MiniLM-L6-v2"  # Hugging Face checkpoint name
    top_k: int = 10
    field_sep_token: str = "[FLD]"  # separator placed between the fields of one item
    max_seq_length: int = 512

    # Topic representation: which topic columns are included in the text
    use_topic_title: bool = True
    use_topic_descr: bool = True
    use_topic_level: bool = True

    # Content representation: which content columns are included in the text
    use_content_title: bool = True
    use_content_descr: bool = True
    use_content_text: bool = True

    # Training
    # Directory holding the competition CSV files.
    input_dir: Path = Path('/kaggle/input/learning-equality-curriculum-recommendations')
    batch_size: int = 32
    n_epochs: int = 100
    shuffle: bool = True

    def __post_init__(self):
        # Accept plain strings for input_dir while keeping the `/` joins downstream working.
        self.input_dir = Path(self.input_dir)

def get_data(cfg):
    """Read the four competition CSV files found in `cfg.input_dir`.

    Args:
        cfg: any object with an `input_dir` attribute (a `str` or a `Path`).

    Returns:
        A tuple `(topics_df, content_df, correlations_df, sample_submission_df)`.
        Each frame carries a `name` attribute ('topics', 'content', 'correlations',
        'sample_submission'). Topics and content are indexed by `id`, and
        correlations by `topic_id`.
    """
    # Coerce so that a plain string path works with the `/` joins below.
    input_dir = Path(cfg.input_dir)

    # Only the two free-text topic columns get their missing values replaced by ''.
    topics_df = pd.read_csv(input_dir/'topics.csv').set_index('id').fillna({'title': '', 'description': ''})
    topics_df.name = 'topics'
    # Turn the raw level value into a short phrase, e.g. 3 -> 'Level 3'.
    topics_df['level'] = topics_df.level.apply(lambda x: f'Level {x}')

    # Every missing content value becomes the empty string.
    content_df = pd.read_csv(input_dir/'content.csv').set_index('id').fillna('')
    content_df.name = 'content'

    correlations_df = pd.read_csv(input_dir/'correlations.csv').set_index('topic_id')
    correlations_df.name = 'correlations'

    sample_submission_df = pd.read_csv(input_dir/'sample_submission.csv')
    sample_submission_df.name = 'sample_submission'

    return topics_df, content_df, correlations_df, sample_submission_df

def collate_fn(batch):
    """Collate `batch` with torch's `default_collate` and return the result as a list."""
    collated = default_collate(batch)
    return [*collated]
    
class RetrieverDataset(Dataset):
    """Dataset of one text string per row of a topics or content DataFrame.

    The frame must carry a `name` attribute equal to 'topics' or 'content'; that
    name selects which `cfg.use_*` flags decide the columns that go into the text.
    Items are `(position, text)` pairs.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        frame_kind = df.name
        assert frame_kind in ('topics', 'content'), "Unrecognized DataFrame name, must be either 'topics' or 'content'."
        if frame_kind == 'topics':
            self.texts = self.__makerepr__(
                df,
                title=cfg.use_topic_title,
                descr=cfg.use_topic_descr,
                level=cfg.use_topic_level,
            )
        elif frame_kind == 'content':
            self.texts = self.__makerepr__(
                df,
                title=cfg.use_content_title,
                descr=cfg.use_content_descr,
                text=cfg.use_content_text,
            )

    def __makerepr__(self, df, title=True, descr=False, text=False, level=False):
        """Join the enabled columns of every row, skipping empty-string values.

        Column order is fixed: title, description, text, level. Values are joined
        with the configured field separator token surrounded by single spaces.
        """
        switches = (
            ('title', title),
            ('description', descr),
            ('text', text),
            ('level', level),
        )
        columns = [df[column].to_list() for column, enabled in switches if enabled]
        joined = []
        for row_values in zip(*columns):
            kept = [value for value in row_values if value != '']
            joined.append(f' {self.cfg.field_sep_token} '.join(kept))
        return joined

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item_id):
        text = self.texts[item_id]
        return (item_id, text)
    
class MeanPooling(nn.Module):
    """Mask-weighted average of token embeddings along dimension 1.

    The attention mask gets a trailing axis and is expanded to the shape of
    `token_embeddings`, so it is expected to have one dimension fewer. Positions
    with a zero mask value contribute nothing to the sum or to the divisor.
    """

    def forward(self, token_embeddings, attention_mask):
        weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        weighted_sum = torch.sum(token_embeddings * weights, 1)
        # Clamping the divisor avoids a division by zero when a row is fully masked.
        divisor = torch.clamp(weights.sum(1), min=1e-9)
        return weighted_sum / divisor

class Retriever(nn.Module):
    """Text encoder: tokenizer + pretrained transformer + mean pooling.

    `forward` takes raw strings and returns one pooled embedding per string.
    The embeddings are returned as pooled, without L2 normalisation.
    """

    def __init__(self, cfg):
        """Load the checkpoint named by `cfg.model` and move it to `cfg.device`.

        `cfg.field_sep_token` is registered as an additional special token and the
        model's embedding matrix is resized to the enlarged vocabulary.
        """
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model, additional_special_tokens=[cfg.field_sep_token])
        model = AutoModel.from_pretrained(cfg.model, config=self.config)
        # The tokenizer gained a token, so the embedding table must grow to match.
        model.resize_token_embeddings(len(self.tokenizer))
        model.to(cfg.device)
        self.model = model
        self.pool = MeanPooling()

    def forward(self, x):
        """Embed a string or a sequence of strings.

        Args:
            x: text(s) to encode; a tuple is converted to a list before tokenization.

        Returns:
            The mean-pooled last hidden state, one row per input text.
        """
        if isinstance(x, tuple):
            x = list(x)
        # Honour the configured sequence length, but never exceed what the tokenizer
        # reports as the checkpoint's maximum.
        max_length = min(self.cfg.max_seq_length, self.tokenizer.model_max_length)
        encodings = self.tokenizer(
            x,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        ).to(self.cfg.device)
        outputs = self.model(**encodings)
        embeddings = self.pool(outputs.last_hidden_state, encodings.attention_mask)
        return embeddings

## Experiments

In [236]:
# Load the competition tables once per kernel session. The guard keeps re-runs of this
# cell cheap while still letting the notebook run top-to-bottom on a fresh kernel.
if 'topics_df' not in globals():
    topics_df, content_df, correlations_df, sample_submission_df = get_data(RetrieverConfig())

In [254]:
retriever_cfg = RetrieverConfig()

# Seed torch's global RNG so the shuffled batch order below is repeatable between runs.
# NOTE(review): this presumably also fixes any random weight initialisation done while
# building the model (e.g. the embedding row added for the separator token) - confirm.
torch.manual_seed(retriever_cfg.seed)

topics_dataset = RetrieverDataset(retriever_cfg, topics_df)
content_dataset = RetrieverDataset(retriever_cfg, content_df)

topics_dataloader = DataLoader(topics_dataset, batch_size=retriever_cfg.batch_size, shuffle=retriever_cfg.shuffle)
content_dataloader = DataLoader(content_dataset, batch_size=retriever_cfg.batch_size, shuffle=retriever_cfg.shuffle)

retriever_model = Retriever(retriever_cfg)

# Smoke test: push batches of topic texts through the encoder and stop after the
# first batch that contains a position greater than 10.
for item_ids, X in topics_dataloader:
    print(f'Processing item_ids: {item_ids}')
    Y = retriever_model(X)
    print(f'Outputs: {Y}')
    # Tensor.max() reduces in one call instead of iterating the tensor element by element.
    if item_ids.max() > 10: break

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Processing item_ids: tensor([66870, 17897, 18832, 15262, 28859, 75674, 41126, 71640, 23687, 35769,
        72874, 30784, 51618, 63060, 34662, 72322, 22570,  9728, 64610, 38086,
        15509, 36385,  7564, 33658, 29379, 76246, 37182,  1630, 64629, 18715,
        50742, 35641])
Outputs: tensor([[-0.1112, -0.0033,  0.0103,  ...,  0.0352,  0.0105, -0.1051],
        [-0.0260,  0.1217,  0.0117,  ..., -0.0583, -0.0311, -0.0468],
        [-0.0519,  0.0591, -0.0112,  ...,  0.1492, -0.0692, -0.0385],
        ...,
        [-0.0134, -0.0307,  0.0086,  ..., -0.0368, -0.0882, -0.0245],
        [-0.0002,  0.0244,  0.0029,  ...,  0.0083,  0.0186, -0.0084],
        [ 0.0102,  0.0350,  0.0123,  ...,  0.0247, -0.0715, -0.0972]],
       grad_fn=<DivBackward0>)
