## Evaluating Filter Models 
Creating an MVP for the models to be deployed, and checking whether they actually perform as they were evaluated during training.

In [1]:
%load_ext autoreload
%autoreload 2

import os, math, numpy as np, pickle, torch 
cur_dir = os.getcwd()
# Walk up the directory tree until the working directory is the project root,
# which is recognised by its name ending in '-analysis'.
while not os.getcwd().endswith('-analysis'):
    _before = os.getcwd()
    os.chdir('..')
    if os.getcwd() == _before:
        # '..' of the filesystem root is the root itself: without this guard the
        # loop never terminates when the notebook is started outside the project.
        os.chdir(cur_dir)
        raise FileNotFoundError(
            f"no ancestor of {cur_dir!r} has a name ending in '-analysis'")
import pickle, torch
from pprint import pprint
from util import get_contextual_query_data, set_all_seeds, common_evaluation

from safetensors import safe_open
from transformers.pipelines import TextClassificationPipeline
from modeling_jonberta import JonbertaForSequenceClassification, add_features_to_model
from transformers import AutoTokenizer, AutoConfig

query_data = get_contextual_query_data()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


positive_contextual:   0%|          | 0/66465 [00:00<?, ?it/s]

negative_contextual:   0%|          | 0/157913 [00:00<?, ?it/s]

	[1mdistribution 		train 	eval 	test [0m
	unbalanced       	179502 	22438 	22438
	classes          	106714 	13118 	13098
	subclasses       	13864 	 1772 	 1724
	biased           	10398 	 1329 	 1293
	oversampled      	252290 	31758 	31778
	oversampled_biased 	378435 	47637 	47667





In [None]:
# Root folder holding the artefacts of every experiment.
results_dir = os.path.abspath('notebooks/paper/results/')

# Collect every saved logistic-regression model, partition by partition.
logres_paths = []
for partition in ['biased', 'classes', 'subclasses', 'unbalanced']:
    partition_dir = os.path.join(results_dir, '11_logres/{}/model'.format(partition))
    logres_paths.extend(entry.path for entry in os.scandir(partition_dir))

model_dirs = {
    'logres': logres_paths,
    # 'codeberta': [p.path for p in os.scandir(
    #                     os.path.join(results_dir, '12_codeberta/huggingface/CodeBERTa-small-v1/model'))],
    # 'jonberta': sum(([p.path for p in os.scandir(
    #                     os.path.join(results_dir, '13_jonberta/{}/model'.format(base_model)))]
    #                 for base_model in ['CodeBERTa-small-v1', '12_codeberta-biased-2e-05lr--0']), []),
}

# Drop the throw-away test models (anything with 'test' or 'TEST' in its path).
model_dirs = {
    model_type: [model for model in models
                 if not any(tag in model for tag in ('test', 'TEST'))]
    for model_type, models in model_dirs.items()
}

def _runs_with_index(run_index):
    ''' Subset of `model_dirs` whose paths end in `-<run_index>` or `-<run_index>.pkl`. '''
    suffixes = (f'-{run_index}', f'-{run_index}.pkl')
    return {model_type: [m for m in models if m.endswith(suffixes)]
            for model_type, models in model_dirs.items()}

# One dictionary per repeated run (0 through 4).
model_dirs_0, model_dirs_1, model_dirs_2, model_dirs_3, model_dirs_4 = (
    _runs_with_index(run_index) for run_index in range(5))

pprint(model_dirs_2)

#### logistic regression

In [19]:
# NOTE: Feature retrieval functions given a query 
def _shared_features(query):
    ''' Maintaining this so it's clearer which function adds what '''
    return [
        math.log(1 + query.time_since_last_completion),
        math.log(1 + query.get_document_length()),
        math.log(1 + query.get_offset()),
        query.get_offset_as_percentage(),
        *query.get_document_language_vector(),          # 5-24
    ]

def copilot(query):
    ''' Feature set used when reverse-engineering Copilot, minus the features that
        need a pre-existing filter (previous filter label, time since last label). '''
    features = list(_shared_features(query))
    # whitespace-after-cursor flag, stored as 0/1
    features.append(int(query.get_whitespace_after_cursor()))
    # log-scaled lengths of the last prefix line, raw and trimmed
    features.append(math.log(1 + query.get_prefix_last_line_length()))
    features.append(math.log(1 + query.get_prefix_trimmed_last_line_length()))
    # last-character vectors of the prefix and of the trimmed prefix
    features.extend(query.get_prefix_last_character_vector())
    features.extend(query.get_trimmed_prefix_last_character_vector())
    return features

def tr_copilot(query):
    ''' Same as `copilot`, but without the *trimmed*-prefix last character vector
        (the untrimmed last character vector is still included). '''
    features = list(_shared_features(query))
    features.append(int(query.get_whitespace_after_cursor()))
    features.append(math.log(1 + query.get_prefix_last_line_length()))
    features.append(math.log(1 + query.get_prefix_trimmed_last_line_length()))
    features.extend(query.get_prefix_last_character_vector())
    return features

def ide_and_copilot(query):
    ''' The `copilot` feature set, preceded by a two-slot IDE indicator
        (jetbrains, vsc); any other IDE yields 0, 0. '''
    ide_flags = [int(query.ide == 'jetbrains'), int(query.ide == 'vsc')]
    return ide_flags + list(copilot(query))

def nontextual(query) -> list:
    ''' Features that cannot be recovered from the code context alone: the shared
        features followed by the jetbrains / vsc IDE indicators.
        Per the original note, this mirrors `get_nontextual_features` from util.py. '''
    features = list(_shared_features(query))
    features += [
        int(query.ide == 'jetbrains'),
        int(query.ide == 'vsc'),
    ]
    return features

class Logres:
    ''' Minimal logistic-regression classifier: a weight vector, an intercept, and
        the feature extractor (selected by name) that turns a query into features. '''

    def __init__(self, weights, intercept, feature_fn):
        # name -> extractor; an unknown name raises KeyError
        extractors = {
            'copilot': copilot,
            'tr_copilot': tr_copilot,
            'ide_and_copilot': ide_and_copilot,
            'nontextual': nontextual,
        }
        self.coef = weights
        self.intercept = intercept
        self.feature_fn = extractors[feature_fn]

    def preprocess(self, X):
        ''' Apply the selected feature extractor to `X`. '''
        extract = self.feature_fn
        return extract(X)

    def predict(self, X):
        ''' True wherever the linear score `X @ coef + intercept` is positive. '''
        linear_score = X @ self.coef + self.intercept
        return linear_score > 0

In [None]:
import tqdm 
from tqdm.contrib.concurrent import process_map, thread_map

# Every model is scored on the unbalanced test split.
X_test, y_test = (query_data['unbalanced'][split] for split in ('X_test', 'y_test'))

# model name -> predictions (seeded with the ground truth), and model name -> score
predictions = dict(ground_truth=y_test)
scores = dict()
    
def compute_score(path: str):
    ''' Re-evaluate one pickled logistic-regression model with the hand-rolled `Logres`.

        The feature set is parsed from the file name: it is the second-to-last
        '-'-separated token (e.g. `...-copilot-1.pkl` -> 'copilot').

        path: location of the pickled model, which must expose `coef_` and `intercept_`.
        returns: `({model_name: y_preds}, {model_name: score})`, keyed by the file name.
        raises: AssertionError when `model_name` is already a key of `predictions`;
                KeyError (from `Logres`) when the parsed feature set is unknown. '''

    # NOTE(review): unpickling executes arbitrary code -- only load trusted model files.
    # The context manager closes the handle; the bare open() used before never did.
    with open(path, 'rb') as model_file:
        og_logres = pickle.load(model_file)
    coef = og_logres.coef_[0]
    intercept = og_logres.intercept_

    model_name = os.path.basename(path)
    assert model_name not in predictions, 'uh oh you have a duplicate!!'
    feature_fn = model_name.split('-')[-2]

    logres = Logres(coef, intercept, feature_fn)

    set_all_seeds(42)
    score, y_preds = common_evaluation(
        lambda X_queries: np.array([np.array(logres.preprocess(q)) for q in X_queries]),
        lambda X: logres.predict(X),
        X_test, y_test, return_preds=True
    )

    return {model_name: y_preds}, {model_name: score}

# results = process_map(compute_score, model_dirs['logres'])
# Runs 1-4 of every logistic-regression model, flattened into a single list.
remainder = [
    path
    for run_split in (model_dirs_1, model_dirs_2, model_dirs_3, model_dirs_4)
    for path in run_split['logres']
]
print('\n'.join(remainder))

# Score the models on worker threads and fold the per-model results together.
results = thread_map(compute_score, remainder)
for preds, score in results:
    predictions.update(preds)
    scores.update(score)

# Rank by 'macro avg' (best first) and show the top five.
sorted_scores = sorted(scores.items(), key=lambda item: item[1]['macro avg'], reverse=True)
for model_name, score in sorted_scores[:5]:
    print(f'{model_name}: \t{score["macro avg"]:.2f}')

In [28]:
import pandas as pd 

# List every column (model name) about to be written, in alphabetical order.
for name in sorted(predictions):
    print(name)

# One column per model plus the ground truth.
predictions_frame = pd.DataFrame(predictions)
predictions_frame.to_parquet('notebooks/paper/results/logres_preds.parquet')

-11_logres-biased-copilot-1.pkl
-11_logres-biased-copilot-2.pkl
-11_logres-biased-copilot-3.pkl
-11_logres-biased-copilot-4.pkl
-11_logres-biased-ide_and_copilot-1.pkl
-11_logres-biased-ide_and_copilot-2.pkl
-11_logres-biased-ide_and_copilot-3.pkl
-11_logres-biased-ide_and_copilot-4.pkl
-11_logres-biased-nontextual-1.pkl
-11_logres-biased-nontextual-2.pkl
-11_logres-biased-nontextual-3.pkl
-11_logres-biased-nontextual-4.pkl
-11_logres-biased-tr_copilot-1.pkl
-11_logres-biased-tr_copilot-2.pkl
-11_logres-biased-tr_copilot-3.pkl
-11_logres-biased-tr_copilot-4.pkl
-11_logres-classes-copilot-1.pkl
-11_logres-classes-copilot-2.pkl
-11_logres-classes-copilot-3.pkl
-11_logres-classes-copilot-4.pkl
-11_logres-classes-ide_and_copilot-1.pkl
-11_logres-classes-ide_and_copilot-2.pkl
-11_logres-classes-ide_and_copilot-3.pkl
-11_logres-classes-ide_and_copilot-4.pkl
-11_logres-classes-nontextual-1.pkl
-11_logres-classes-nontextual-2.pkl
-11_logres-classes-nontextual-3.pkl
-11_logres-classes-nontextua

#### codeberta / jonberta
I think jonberta stands for Joint Optimisation in atteNtion 

In [5]:
def tokenize_joint_sample(sample, max_suffix_tokens=128, tokenizer=None):
    ''' For a single sample, tokenize prefix and suffix and join them into one sequence
        of `tokenizer.model_max_length` tokens.

        sample: mapping with 'prefix' and 'suffix' strings. It is updated IN PLACE with
            the tokenizer's output keys (prefix and suffix tensors concatenated along
            dim 1) and returned; 'prefix' and 'suffix' are left in place.
        max_suffix_tokens: maximal amount of suffix to include, when it exists.
        tokenizer: tokenizer to use. Its `truncation_side` is mutated (and left at
            'right'), so do not share one instance between threads. '''

    if tokenizer is None:
        # Fallback only. Uses the checkpoint the rest of this notebook is built on;
        # the previous fallback id ('microsoft/codeberta-base') did not match it.
        tokenizer = AutoTokenizer.from_pretrained('huggingface/CodeBERTa-small-v1') # woefully inefficient
    max_length = tokenizer.model_max_length # 512 per the original note -- TODO confirm

    # Pass 1: count the suffix tokens that will be kept (max_suffix_tokens at most).
    tokenizer.truncation_side = 'right'
    suffix = tokenizer(sample['suffix'], padding='do_not_pad', truncation=True, return_tensors='pt',
                       max_length = max_suffix_tokens + 1) # +1: the leading <s> is dropped below

    n_suffix_tokens = len(suffix['input_ids'][0]) - 1

    # Pass 2: the prefix gets the remaining room; truncating on the left keeps the
    # text closest to the cursor.
    tokenizer.truncation_side = 'left'
    prefix = tokenizer(sample['prefix'], padding='do_not_pad', truncation=True, return_tensors='pt',
                       max_length = max_length - n_suffix_tokens)

    # Pass 3: re-tokenize the suffix, padded so prefix + suffix fill max_length exactly.
    n_prefix_tokens = len(prefix['input_ids'][0])
    tokenizer.truncation_side = 'right'
    suffix = tokenizer(sample['suffix'], padding='max_length', truncation=True, return_tensors='pt',
                       max_length = max_length - n_prefix_tokens + 1) # +1: the leading <s> is dropped below

    # Drop the first suffix position (<s>) so the prefix's last token acts as separator.
    suffix['input_ids'] = suffix['input_ids'][:, 1:]
    suffix['attention_mask'] = suffix['attention_mask'][:, 1:]

    sample.update({k: torch.cat((prefix[k], suffix[k]), dim=1) for k in prefix})
    return sample

def get_nontextual_features(query) -> list:
    ''' Telemetry features that cannot be extracted from the code context alone. '''
    ide_one_hot = [1 if query.ide == name else 0 for name in ('jetbrains', 'vsc')]  # 0-1
    log_scaled = [math.log(1 + raw) for raw in (
        query.time_since_last_completion,           # 2
        query.get_document_length(),                # 3
        query.get_offset(),                         # 4
    )]
    return [
        *ide_one_hot,
        *log_scaled,
        query.get_offset_as_percentage(),           # 5
        *query.get_document_language_vector(),      # 6 onwards
    ]

class MyPipeline(TextClassificationPipeline):
    ''' Text-classification pipeline with custom tokenisation: a query's prefix and
        suffix are tokenised jointly and, optionally, its telemetry features are
        passed to the model as `encoder_hidden_states`. '''

    def __init__(self, *args, incl_features=True, preprocess_fn=tokenize_joint_sample, **kwargs):
        super().__init__(*args, **kwargs)
        self.preprocess_fn = preprocess_fn
        self.incl_features = incl_features

    def _sanitize_parameters(self, **kwargs):
        ''' Route a `preprocess_fn` keyword to `preprocess`; nothing goes to the
            forward or postprocess steps. '''
        if 'preprocess_fn' not in kwargs:
            return {}, {}, {}
        return {'preprocess_fn': kwargs.pop('preprocess_fn')}, {}, {}

    def preprocess(self, inputs, preprocess_fn=None):
        ''' Turn one query object into model inputs. The instance-level
            `self.preprocess_fn` wins; the argument is only used when it is None. '''
        sample = {
            'prefix': inputs.prefix,
            'suffix': inputs.suffix,
            'encoder_hidden_states': get_nontextual_features(inputs)
        }
        tokenise = self.preprocess_fn if self.preprocess_fn is not None else preprocess_fn
        sample = tokenise(sample)

        # the raw strings are of no use to the model
        for raw_key in ('prefix', 'suffix'):
            if raw_key in sample:
                del sample[raw_key]

        if not self.incl_features:
            if 'encoder_hidden_states' in sample:
                del sample['encoder_hidden_states']
        else:
            # the pipeline is used in sequential eval, so the model needs a batch dimension
            features = torch.tensor(sample['encoder_hidden_states'], dtype=torch.float32)
            sample['encoder_hidden_states'] = features.unsqueeze(0)
        return sample

    def _forward(self, model_inputs):
        ''' Run the wrapped model on the prepared inputs. '''
        outputs = self.model(**model_inputs)
        return outputs

    def postprocess(self, model_outputs):
        ''' Predicted class: argmax of the logits over the last dimension. '''
        logits = model_outputs.logits
        return logits.argmax(-1)
    
def get_model(model_path):
    ''' Build a `JonbertaForSequenceClassification` from the config in `model_path`
        and load the weights stored in its `model.safetensors` file.

        Loading is non-strict; the resulting report (missing / unexpected keys)
        is pretty-printed so mismatches are at least visible. '''
    config = AutoConfig.from_pretrained(model_path)
    model = JonbertaForSequenceClassification(config)

    # configs with a truthy `add_head` get the extra feature layers attached
    if getattr(config, 'add_head', False):
        add_features_to_model(model, config)

    # read the checkpoint tensor by tensor
    weights_file = os.path.join(model_path, 'model.safetensors')
    with safe_open(weights_file, framework='pt') as f:
        state_dict = {key: f.get_tensor(key) for key in f.keys()}

    load_report = model.load_state_dict(state_dict, strict=False)
    pprint(load_report)
    # print(model)
    return model

from util import set_all_seeds
from copy import deepcopy

def compute_transformer_score(model_path, device):
    ''' Evaluate one saved transformer model, query by query, on the unbalanced test split.

        model_path: directory of the saved model; its last '/'-separated component
            names the model in the result.
        device: device handed to the pipeline.
        returns: `({model_name: y_preds}, {model_name: score})`. '''

    # work on deep copies so the shared test data is never modified
    unbalanced = query_data['unbalanced']
    X_test, y_test = unbalanced['X_test'], unbalanced['y_test']
    X_test, y_test = deepcopy(X_test), deepcopy(y_test)

    set_all_seeds(42)
    model = get_model(model_path)
    model_name = model_path.split('/')[-1]

    # a tokenizer must be re-instantiated per thread / process
    tokenizer = AutoTokenizer.from_pretrained('huggingface/CodeBERTa-small-v1')

    def preprocess_fn(sample):
        return tokenize_joint_sample(sample, tokenizer=tokenizer)

    pipe = MyPipeline(
        device=device, task='text-classification',
        model=model, incl_features=True, # for telemetry
        preprocess_fn=preprocess_fn
    )

    def keep_queries(X_queries):
        # the pipeline does its own preprocessing
        return X_queries

    def predict_sequentially(X):
        # one pipeline call per query, results concatenated
        return torch.cat([pipe(x) for x in X])

    score, y_preds = common_evaluation(
        keep_queries, predict_sequentially,
        X_test, y_test, return_preds=True)

    return {model_name: y_preds}, {model_name: score}

In [None]:
import pandas as pd 

# merge the remaining _1-4 model dirs together 
# NOTE(review): the 'codeberta' / 'jonberta' lookups below need those entries in
# `model_dirs`; in the setup cell they are currently commented out.
todo_model_dirs = {
    model_type: [
        path
        for run_split in (model_dirs_1, model_dirs_2, model_dirs_3, model_dirs_4)
        for path in run_split[model_type]
    ]
    for model_type in model_dirs_1
}
todo_models = [*todo_model_dirs['codeberta'], *todo_model_dirs['jonberta']]
n_todo = len(todo_models)

print(f'Evaluating {n_todo} models')
import time 

def run_model(kwargs):
    ''' Evaluate one model on one GPU and checkpoint the outcome to CSV.

        kwargs: dict with 'model' (path of the model) and 'device_no' (CUDA index).
        returns: the `(preds, score)` pair from `compute_transformer_score`. '''
    model = kwargs['model']
    device_no = kwargs['device_no']

    print(f'running {model} on cuda:{device_no}')
    preds, score = compute_transformer_score(model, f'cuda:{device_no}')

    # intermediate results are written per model, so a crash loses at most one model
    model_name = model.split('/')[-1]
    preds_csv = f'notebooks/paper/results/intermediate/preds_{model_name}.csv'
    score_csv = f'notebooks/paper/results/intermediate/score_{model_name}.csv'
    pd.DataFrame(preds).to_csv(preds_csv)
    pd.DataFrame(score).to_csv(score_csv)
    return preds, score

# # create tuples of models and the cuda device (%2)
# todo_models = [dict(model=model, device_no=i%2) for i, model in enumerate(todo_models)]
# results = process_map(run_model, todo_models, max_workers=4)

# instead, try running on 4 threads 
from threading import Thread
from queue import Queue

def worker(q: Queue, i):
    ''' Drain `q`, evaluating each queued model on GPU `i % 2`.

        q: queue of model paths.
        i: worker index; its parity picks the CUDA device.

        A model that fails is reported (traceback on stderr) and skipped, and every
        item taken is always marked done, so `q.join()` cannot hang on a failure. '''
    from queue import Empty
    import traceback

    while True:
        try:
            # a separate `q.empty()` check followed by `q.get()` is racy: another worker
            # can take the last item in between, leaving this thread blocked forever
            model_name = q.get_nowait()
        except Empty:
            return
        try:
            run_model(dict(model=model_name, device_no=i%2))
        except Exception:
            traceback.print_exc()
        finally:
            q.task_done()

# Queue up the models still to be evaluated.
# NOTE(review): the 277 offset is a hard-coded resume point -- confirm it before re-running.
q = Queue()
for model in todo_models[277:]:
    print(model)
    q.put(model)

# Two daemon workers; `worker` maps worker i onto device i % 2.
for i in range(2):
    t = Thread(target=worker, args=(q, i), daemon=True)
    t.start()

# Block until every queued model has been marked done.
q.join()

# def worker(q):
#     while True:
#         kwargs = q.get()
#         run_model(kwargs)
#         q.task_done()

# q = Queue()
# for i, model in enumerate(todo_models[:3]): 
#     q.put(dict(model=model, device_no=i%2))

# for i in range(4):
#     t = Thread(target=worker, args=(q,))
#     t.daemon = True
#     t.start()



# models = model_dirs['codeberta'] + model_dirs['jonberta']
# n_models = len(models)
# print('running {} models'.format(n_models))

# for model in models: 
#     compute_transformer_score(model, 'cuda:0')

#     pd.DataFrame(scores).to_csv('notebooks/paper/results/all_model_scores.csv')
#     pd.DataFrame(predictions).to_csv('notebooks/paper/results/all_model_predictions.csv')

# Testing 

In [None]:
# Folder with the exported models: `models`, one level above the current working directory.
models_dir = os.path.join(os.getcwd(), '..', 'models')

# models of interest
# NOTE(review): `joint_head_name` starts with '-' while the other names do not -- confirm
# this matches the folder name on disk.
joint_head_name = '-13_jonberta-biased-12_codeberta-biased-2e-05lr--0-(HEAD-dense--reinit)-2e-05lr-1'
joint_attn_name = '13_jonberta-biased-12_codeberta-biased-2e-05lr--0-(ATTN-208C_f-[0]L)-2e-05lr--4'
logres_name = '11_logres-biased-copilot-4.pkl'
context_name = '12_codeberta-biased-2e-05lr--0'

# original scores for sanity check  # 87.2 # 86.9 # 84.5
# NOTE(review): 'macro avg' is written on a 0-100 scale here, the other metrics on 0-1.
og_joint_head_score = { 'macro avg': 87.475, 'f1': 0.7373, 'precision': 0.5925, 'recall': 0.9757, }
og_joint_attn_score = { 'macro avg': 87.452, 'f1': 0.7308, 'precision': 0.5839, 'recall': 0.9763, }
og_context_score    = { 'macro avg': 86.732, 'f1': 0.7171, 'precision': 0.5678, 'recall': 0.9727, }

#### Logistic Regression

In [None]:
# NOTE: Feature retrieval functions given a query 
def _shared_features(query):
    ''' Maintaining this so it's clearer which function adds what '''
    return [
        math.log(1 + query.time_since_last_completion),
        math.log(1 + query.get_document_length()),
        math.log(1 + query.get_offset()),
        query.get_offset_as_percentage(),
        *query.get_document_language_vector(),          # 5-24
    ]

def copilot_features(query):
    ''' Feature set used when reverse-engineering Copilot, leaving out the features
        that need a pre-existing filter (previous filter label, time since last label). '''
    cursor_features = [
        int(query.get_whitespace_after_cursor()),
        math.log(1 + query.get_prefix_last_line_length()),
        math.log(1 + query.get_prefix_trimmed_last_line_length()),
    ]
    last_character_features = [
        *query.get_prefix_last_character_vector(),
        *query.get_trimmed_prefix_last_character_vector(),
    ]
    return [*_shared_features(query), *cursor_features, *last_character_features]

class Logres:
    ''' Bare-bones logistic-regression classifier: just the weights and the intercept. '''

    def __init__(self, weights, intercept):
        self.coef, self.intercept = weights, intercept

    def predict(self, X):
        ''' True wherever the linear score `X @ coef + intercept` is positive. '''
        linear_score = X @ self.coef + self.intercept
        return linear_score > 0

In [None]:
# NOTE(review): unpickling executes arbitrary code -- only load trusted model files.
# The context manager closes the handle; the bare open() used before never did.
with open(os.path.join(models_dir, logres_name), 'rb') as model_file:
    og_logres = pickle.load(model_file)
coef = og_logres.coef_[0]
intercept = og_logres.intercept_

# Hand-rolled re-implementation that must reproduce the original model's scores.
logres = Logres(coef, intercept)
X_test, y_test = query_data['unbalanced']['X_test'], query_data['unbalanced']['y_test']

set_all_seeds(42)
# Evaluate the original model first, then the re-implementation, on identical features.
og_scores, scores = (common_evaluation(
    lambda X_queries: np.array([np.array(copilot_features(q)) for q in X_queries]),
    lambda X: model.predict(X),
    X_test, y_test,) for model in (og_logres, logres))


In [None]:
# assert k, v pairs are equal in both dictionaries
def assert_dict_eq(d1, d2, strict=True):
    ''' Assert that every key of `d1` maps to an equal value in `d2`.

        strict: additionally require both dictionaries to have the same keys.
        Keys containing 'time' are printed side by side instead of being compared. '''
    if strict:
        assert d1.keys() == d2.keys()

    for k in d1.keys():
        if 'time' in k:
            # timings are never expected to match exactly; show them instead
            print(f'{k:20}: \t{d1[k]:.8f} != \t{d2[k]:.8f}')
            continue
        assert d1[k] == d2[k], f'{k}: {d1[k]} != {d2[k]}'

# Non-strict: only the keys of `og_scores` are looked up in `scores`; the key sets may differ.
assert_dict_eq(og_scores, scores, strict=False)

In [None]:
# Cool, lets save the arrays
# Persist only the raw arrays; this is all the deployed filter needs.
model_path = 'logres (weights, bias)'
weights_file = os.path.join(models_dir, model_path + '.pkl')
with open(weights_file, 'wb') as f:
    pickle.dump((coef, intercept), f)

print(coef, intercept)

#### CodeBERTa (Code Context Only)

In [None]:
def tokenize_joint_sample(sample, max_suffix_tokens=128, tokenizer=None):
    ''' For a single sample, tokenize prefix and suffix and join them into one sequence
        of `tokenizer.model_max_length` tokens.

        sample: mapping with 'prefix' and 'suffix' strings. It is updated IN PLACE with
            the tokenizer's output keys (prefix and suffix tensors concatenated along
            dim 1) and returned; 'prefix' and 'suffix' are left in place.
        max_suffix_tokens: maximal amount of suffix to include, when it exists.
        tokenizer: tokenizer to use. When omitted, a module-level `tokenizer` is used
            if one exists (the previous behaviour); otherwise the CodeBERTa tokenizer
            is loaded once and cached on this function. The tokenizer's
            `truncation_side` is mutated and left at 'right'. '''

    if tokenizer is None:
        # this function used to read a global `tokenizer` that no visible cell defines
        tokenizer = globals().get('tokenizer')
    if tokenizer is None:
        tokenizer = getattr(tokenize_joint_sample, '_fallback_tokenizer', None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('huggingface/CodeBERTa-small-v1')
            tokenize_joint_sample._fallback_tokenizer = tokenizer

    max_length = tokenizer.model_max_length # 512 per the original note -- TODO confirm

    # Pass 1: count the suffix tokens that will be kept (max_suffix_tokens at most).
    tokenizer.truncation_side = 'right'
    suffix = tokenizer(sample['suffix'], padding='do_not_pad', truncation=True, return_tensors='pt',
                       max_length = max_suffix_tokens + 1) # +1: the leading <s> is dropped below

    n_suffix_tokens = len(suffix['input_ids'][0]) - 1

    # Pass 2: the prefix gets the remaining room; truncating on the left keeps the
    # text closest to the cursor.
    tokenizer.truncation_side = 'left'
    prefix = tokenizer(sample['prefix'], padding='do_not_pad', truncation=True, return_tensors='pt',
                       max_length = max_length - n_suffix_tokens)

    # Pass 3: re-tokenize the suffix, padded so prefix + suffix fill max_length exactly.
    n_prefix_tokens = len(prefix['input_ids'][0])
    tokenizer.truncation_side = 'right'
    suffix = tokenizer(sample['suffix'], padding='max_length', truncation=True, return_tensors='pt',
                       max_length = max_length - n_prefix_tokens + 1) # +1: the leading <s> is dropped below

    # Drop the first suffix position (<s>) so the prefix's last token acts as separator.
    suffix['input_ids'] = suffix['input_ids'][:, 1:]
    suffix['attention_mask'] = suffix['attention_mask'][:, 1:]

    sample.update({k: torch.cat((prefix[k], suffix[k]), dim=1) for k in prefix})
    return sample


def get_nontextual_features(query) -> list:
    ''' Telemetry features that cannot be extracted from the code context alone. '''
    features = [
        int(query.ide == 'jetbrains'),                       # 0
        int(query.ide == 'vsc'),                             # 1
    ]
    features.append(math.log(1 + query.time_since_last_completion))  # 2
    features.append(math.log(1 + query.get_document_length()))       # 3
    features.append(math.log(1 + query.get_offset()))                # 4
    features.append(query.get_offset_as_percentage())                # 5
    features.extend(query.get_document_language_vector())            # 6 onwards
    return features

class MyPipeline(TextClassificationPipeline):
    ''' Text-classification pipeline with custom tokenisation: a query's prefix and
        suffix are tokenised jointly and, optionally, its telemetry features are
        passed to the model as `encoder_hidden_states`. '''

    def __init__(self, *args, incl_features=True, preprocess_fn=tokenize_joint_sample, **kwargs):
        super().__init__(*args, **kwargs)
        self.preprocess_fn = preprocess_fn
        self.incl_features = incl_features

    def _sanitize_parameters(self, **kwargs):
        ''' Only `preprocess_fn` is recognised, and it is routed to `preprocess`. '''
        preprocess_kwargs = {}
        if 'preprocess_fn' in kwargs:
            preprocess_kwargs.update(preprocess_fn=kwargs.pop('preprocess_fn'))
        forward_kwargs, postprocess_kwargs = {}, {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs, preprocess_fn=None):
        ''' Turn one query object into model inputs. The instance-level
            `self.preprocess_fn` wins; the argument is only used when it is None. '''
        sample = dict(
            prefix=inputs.prefix,
            suffix=inputs.suffix,
            encoder_hidden_states=get_nontextual_features(inputs),
        )
        if self.preprocess_fn is None:
            sample = preprocess_fn(sample)
        else:
            sample = self.preprocess_fn(sample)

        # the raw strings are of no use to the model
        for raw_key in ('prefix', 'suffix'):
            if raw_key in sample:
                del sample[raw_key]

        if self.incl_features:
            # the pipeline is used in sequential eval, so the model needs a batch dimension
            features = torch.tensor(sample['encoder_hidden_states'], dtype=torch.float32)
            sample['encoder_hidden_states'] = features.unsqueeze(0)
        elif 'encoder_hidden_states' in sample:
            del sample['encoder_hidden_states']
        return sample

    def _forward(self, model_inputs):
        ''' Run the wrapped model on the prepared inputs. '''
        outputs = self.model(**model_inputs)
        return outputs

    def postprocess(self, model_outputs):
        ''' Predicted class: argmax of the logits over the last dimension. '''
        logits = model_outputs.logits
        return logits.argmax(-1)

In [None]:
def get_model(model_name):
    ''' Build a `JonbertaForSequenceClassification` for the model folder `model_name`
        inside `models_dir` and load the weights from its `model.safetensors` file.

        Loading is non-strict; the resulting report (missing / unexpected keys) and
        the model itself are printed for inspection. '''
    model_dir = os.path.join(models_dir, model_name)
    config = AutoConfig.from_pretrained(model_dir)
    model = JonbertaForSequenceClassification(config)

    # configs with a truthy `add_head` get the extra feature layers attached
    if getattr(config, 'add_head', False):
        add_features_to_model(model, config)

    # read the checkpoint tensor by tensor
    weights_file = os.path.join(model_dir, 'model.safetensors')
    with safe_open(weights_file, framework='pt') as f:
        state_dict = {key: f.get_tensor(key) for key in f.keys()}

    load_report = model.load_state_dict(state_dict, strict=False)
    pprint(load_report)
    print(model)
    return model

In [None]:
from util import set_all_seeds
import tqdm 
# Preferred accelerator: CUDA first, then Apple's MPS, otherwise plain CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

def get_score(model_name):
    ''' Score the model `model_name` by running it one query at a time over the
        module-level `X_test` / `y_test`, with telemetry features included.
        Returns whatever `common_evaluation` returns. '''

    set_all_seeds(42)
    model = get_model(model_name)

    pipe = MyPipeline(
        device=DEVICE, task='text-classification',
        model=model, incl_features=True # for telemetry
    )

    def keep_queries(X_queries):
        # the pipeline does its own preprocessing
        return X_queries

    def predict_sequentially(X):
        progress = tqdm.tqdm(X, total=len(X_test), desc='sequential test')
        return torch.cat([pipe(x) for x in progress])

    return common_evaluation(keep_queries, predict_sequentially, X_test, y_test)

In [None]:
# First evaluation pass over the three models of interest.
scores = dict()
for model_path in [joint_head_name, joint_attn_name, context_name]:
    scores[model_path] = score = get_score(model_path)

    # model name in bold, followed by its metrics
    print(f'\033[1m{model_path}\033[0m')
    pprint(score)

In [None]:
# Second, identical evaluation pass; its results are compared with `scores` below.
new_scores = dict()
for model_path in [joint_head_name, joint_attn_name, context_name]:
    new_scores[model_path] = score = get_score(model_path)

    # model name in bold, followed by its metrics
    print(f'\033[1m{model_path}\033[0m')
    pprint(score)

In [None]:
# assert_dict_eq(og_joint_head_score, scores[joint_head_name], strict=False)
# assert_dict_eq(og_joint_attn_score, scores[joint_attn_name], strict=False)
# assert_dict_eq(og_context_score, scores[context_name], strict=False)
# pprint(scores[joint_head_name])
# pprint(og_joint_head_score)

# pprint(scores[joint_attn_name])
# pprint(og_joint_attn_score)

# pprint(scores[context_name])
# pprint(og_context_score)

# Print the two evaluation passes side by side, metric by metric.
# The '!=' in the output is only a separator; nothing is compared here.
for model in (joint_head_name, joint_attn_name, context_name):
    print(f'\033[1m{model}\033[0m')
    first_pass, second_pass = scores[model], new_scores[model]
    for (k_1, v_1), (k_2, v_2) in zip(first_pass.items(), second_pass.items()):
        print(f'{k_1:20}: \t{v_1:.8f} != \t{v_2:.8f}')


In [None]:
import gc 
# Debug helper: list every tensor (or object wrapping one in `.data`) that the
# garbage collector still tracks, to spot models that were not freed.
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except Exception:
        # Best effort: some tracked objects raise on attribute access. `Exception`
        # rather than a bare `except`, so Ctrl-C / kernel interrupts still get through.
        continue

# for obj in gc.get_objects():
#     if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
#         print(reduce(op.mul, obj.size()) if len(obj.size()) > 0 else 0, type(obj), obj.size())

## Sanity Check
Let's make sure the implementation of deployed filters is correct. 

In [None]:
# NOTE(review): not idempotent -- every run of this cell moves the working directory up
# one more level, so run it exactly once per kernel.
os.chdir('..')
# `sanity_check` is imported after the chdir; presumably it lives in that parent folder -- TODO confirm.
from sanity_check import Filter, filters, set_all_seeds
# bare expression: displays the available filters as the cell output
filters

In [None]:
import tqdm 
X, y = (query_data['unbalanced'][split] for split in ('X_test', 'y_test'))
# X, y = X[:3000], y[:3000]

# Score every deployed filter, one query at a time, on the unbalanced test split.
for f_type, f in filters.items():

    set_all_seeds()

    def run_filter(X):
        progress = tqdm.tqdm(X, total=len(X), desc='sequential test')
        return [int(f(x)) for x in progress]

    score = common_evaluation(lambda X_queries: X_queries, run_filter, X, y)

    print(f_type)
    pprint(score)

In [None]:
import tqdm 
unbalanced_split = query_data['unbalanced']
X, y = unbalanced_split['X_test'], unbalanced_split['y_test']

# Same sequential evaluation of every deployed filter; scores are shown with plain print.
for f_type, f in filters.items():

    set_all_seeds()

    def run_filter(X):
        progress = tqdm.tqdm(X, total=len(X), desc='sequential test')
        return [int(f(x)) for x in progress]

    score = common_evaluation(lambda X_queries: X_queries, run_filter, X, y)

    print(f_type)
    print(score)

In [None]:
import tqdm 
X, y = (query_data['unbalanced'][split] for split in ('X_test', 'y_test'))

# Repeat run of the filter evaluation: each filter is applied query by query.
for f_type, f in filters.items():

    set_all_seeds()
    score = common_evaluation(
        lambda X_queries: X_queries,
        lambda X: list(map(
            lambda x: int(f(x)),
            tqdm.tqdm(X, total=len(X), desc='sequential test'))),
        X, y)

    print(f_type)
    print(score)