In [79]:
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, CatBoostClassifier , Pool, cv
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, recall_score
from functools import partial 
from sklearn.base import BaseEstimator
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from scipy.stats import rankdata
import pickle
from tqdm.auto import trange
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold

In [80]:
train_data = pd.read_json('dev.statement.jsonl',lines=True)
test_data = pd.read_json('test.statement.jsonl',lines=True)

train_data['preds'] = [x.tolist() for x in np.load('val_preds.npy')]
test_data['preds'] = [x.tolist() for x in np.load('simple_gemma.npy')]
train_data['preds_rank'] = train_data['preds'].apply(rankdata)
test_data['preds_rank'] = test_data['preds'].apply(rankdata)

In [81]:
def make_rank_data(data,is_test=False):
    df = []
    for i in trange(len(data)):
        row = data.iloc[i]
        conc_q = row['question']['question_concept']
        label = 'F' if is_test else row['answerKey']
        for j,choice in enumerate(row['question']['choices']):
            df.append({
                'group_id': i,
                'model_score': row['preds'][j],
                'model_rank': row['preds_rank'][j],
                'question_concept': conc_q,
                'text': choice['text'],
                'key': choice['label'],
                'labels': int(choice['label'] == label)
            })
    return pd.DataFrame(df)

In [82]:
train_df = make_rank_data(train_data,is_test=False)
test_df = make_rank_data(test_data,is_test=True)

  0%|          | 0/877 [00:00<?, ?it/s]

  0%|          | 0/2192 [00:00<?, ?it/s]

In [83]:
with open('./graph.pkl','rb') as f:
    state = pickle.load(f)

In [84]:
def get_neghbours(x):
    if x == -100:
        return {}
    return list(state['graph'].neighbors(x))

def iou(x,y):
    return len(set(x) & set(y)) / len(set(x) | set(y))

def obj(x,y):
    return len(set(x) & set(y))

def get_graph_features(data):
    data['question_concept_n'] = data['question_concept'].map(lambda x: state['concepts_maper'][x.replace(' ','_')])
    data['target_concapt_n'] = data['text'].apply(lambda x: state['concepts_maper'][x.replace(' ','_')] if x in state['concepts_maper'].keys() else -100)
    data['text_is_relevant'] = (data['target_concapt_n'] == -100).astype(int)
    
    data['is_neghbour_12'] = data.apply(lambda x: int(x['target_concapt_n'] in list(state['graph'].neighbors(x['question_concept_n']))),axis=1)
    data['is_neghbour_21'] = data.apply(lambda x: int(x['question_concept_n'] in list(state['graph'].neighbors(x['target_concapt_n'])) if x['text_is_relevant'] == 0 else -100),axis=1)
    data['iou_neighbours'] = data.apply(lambda x: iou(get_neghbours(x['question_concept_n']),get_neghbours(x['target_concapt_n'])),axis=1)
    data['obj_neighbours'] = data.apply(lambda x: obj(get_neghbours(x['question_concept_n']),get_neghbours(x['target_concapt_n'])),axis=1)
    
    data['relations'] = data.apply(lambda x: state['graph'].edges[x['question_concept_n'],x['target_concapt_n']]['relation']  if x.is_neghbour_12 == 1 else '-100',axis=1)
    data['cos_sim'] = data.apply(lambda x: cos_sim(state['embeds'][[x['question_concept_n']]],state['embeds'][[x['target_concapt_n']]])[0][0],axis=1)
    return data

In [85]:
train_df = get_graph_features(train_df)
test_df = get_graph_features(test_df)

In [86]:
params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'loss_function': 'YetiRank',
    'max_depth': 5,
    'eval_metric': 'PrecisionAt:top=1',
    'random_seed': 56,
    'task_type': 'CPU'
}

label_col = 'labels'
group_id = 'group_id'
cat_cols = []
drop_cols = [
    'question_concept',
    'text',
    'key',
    'relations', # mayby use it like emb feat,
]

In [87]:
class CustomRankerKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,group_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = StratifiedGroupKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,y=train_data[label_col],groups=train_data[group_col])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                                
                train_pool = Pool(
                    data = train_df.drop([label_col,group_col]+drop_cols,axis=1),
                    label = train_df[label_col],
                    group_id = train_df[group_col],
                    cat_features = cat_features
                )
                
                test_pool = Pool(
                    data = test_df.drop([label_col,group_col]+drop_cols,axis=1),
                    label = test_df[label_col],
                    group_id = test_df[group_col],
                    cat_features = cat_features
                )

                cbm = CatBoostRanker(**self.params)
                cbm.fit(train_pool,eval_set=test_pool,verbose=verbose)
                self.models.append(cbm)
                
        #print(f"Total Score {np.mean(self.scores)}")
            
    def predict(self,test_data, drop_cols=None, cat_features=None, group_col=None):
        test_pool = Pool(
            data = test_data.drop([group_col] + drop_cols,axis=1),
            group_id = test_data[group_col],
            cat_features = cat_features,
        )
        preds = np.mean([
            model.predict(test_pool) for model in self.models
        ],axis=0)
        return preds
    
    def get_feature_importance(self,type='FeatureImportance'):
        imp_0 = self.models[0].get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [88]:
model = CustomRankerKfoldWraper(
    num_folds=5,
    num_repits=2,
    params=params,
    random_state=56,
)

In [89]:
model.fit(
    train_data=train_df,
    cat_features=cat_cols,
    group_col=group_id,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=500,
)

  0%|          | 0/2 [00:00<?, ?it/s]

0:	learn: 0.8202568	test: 0.8579545	best: 0.8579545 (0)	total: 2.53ms	remaining: 2.53s
500:	learn: 0.9871612	test: 0.8522727	best: 0.8863636 (36)	total: 956ms	remaining: 952ms
999:	learn: 0.9942939	test: 0.8352273	best: 0.8863636 (36)	total: 1.93s	remaining: 0us

bestTest = 0.8863636364
bestIteration = 36

Shrink model to first 37 iterations.
0:	learn: 0.8388017	test: 0.7954545	best: 0.7954545 (0)	total: 2.3ms	remaining: 2.3s
500:	learn: 0.9885877	test: 0.8068182	best: 0.8181818 (126)	total: 983ms	remaining: 979ms
999:	learn: 0.9928673	test: 0.7954545	best: 0.8181818 (126)	total: 1.94s	remaining: 0us

bestTest = 0.8181818182
bestIteration = 126

Shrink model to first 127 iterations.
0:	learn: 0.8319088	test: 0.8228571	best: 0.8228571 (0)	total: 2.26ms	remaining: 2.26s
500:	learn: 0.9914530	test: 0.8342857	best: 0.8514286 (41)	total: 965ms	remaining: 961ms
999:	learn: 0.9971510	test: 0.8400000	best: 0.8514286 (41)	total: 1.98s	remaining: 0us

bestTest = 0.8514285714
bestIteration = 41



In [90]:
test_ranks = model.predict(test_df,drop_cols=drop_cols, cat_features=cat_cols, group_col=group_id)

In [91]:
test_df['ranks'] = test_ranks

In [92]:
test_df = test_df.sort_values(by=['group_id','key'])

In [93]:
test_df

Unnamed: 0,group_id,model_score,model_rank,question_concept,text,key,labels,question_concept_n,target_concapt_n,text_is_relevant,is_neghbour_12,is_neghbour_21,iou_neighbours,obj_neighbours,relations,cos_sim,ranks
0,0,-8.687500,1.0,committing murder,ocean,A,0,55723,4042,0,0,0,0.000000,0,-100,0.499784,-2.086214
1,0,-6.500000,2.0,committing murder,fear,B,0,55723,3842,0,1,0,0.004831,1,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.553604,-0.996263
2,0,1.859375,5.0,committing murder,own death,C,0,55723,-100,1,0,-100,0.000000,0,-100,0.575224,2.160321
3,0,-3.609375,3.0,committing murder,imprisonment,D,0,55723,55729,0,1,0,0.011111,1,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.576761,-0.149942
4,0,-3.531250,4.0,committing murder,incarceration,E,0,55723,55730,0,1,0,0.011236,1,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.690385,0.352448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,2191,3.421875,5.0,clerk,shop,A,0,17799,16864,0,1,0,0.014286,3,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.568044,3.316126
10956,2191,-0.675781,3.0,clerk,post office,B,0,17799,-100,1,0,-100,0.000000,0,-100,0.461204,0.401740
10957,2191,-2.937500,1.0,clerk,at hotel,C,0,17799,-100,1,0,-100,0.000000,0,-100,0.461204,-0.685083
10958,2191,-1.734375,2.0,clerk,airport,D,0,17799,16385,0,1,0,0.000000,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.526466,0.206555


In [94]:
pre_sub = test_df.groupby('group_id')['ranks'].agg(lambda x: np.argmax(list(x)))

In [96]:
pre_sub.index = test_data['id']

In [98]:
pre_sub = pre_sub.to_dict()

In [100]:
df = pd.read_csv('subv56v2.csv',header=None)

In [102]:
df['preds_2'] = df[0].map(pre_sub)

In [104]:
df['preds_2'] = df['preds_2'].apply(lambda x: 'ABCDE'[x])

In [112]:
(df[1] == df['preds_2']).sum()

1966

In [115]:
df[[0,'preds_2']].to_csv('cb_rerank.csv',index=False,header=None)