In [None]:
!pip install faiss-gpu
!pip install wandb -qU

In [None]:
!pip install torch
!pip install pytorch-metric-learning

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import os
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from tqdm import tqdm
import plotly.express as px
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from pytorch_metric_learning import miners, losses, samplers
from pytorch_metric_learning.distances import LpDistance
from pytorch_metric_learning.distances import CosineSimilarity
from torch.optim import Adam
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 
from sklearn.manifold import MDS 
import pickle
import faiss
import json
import random
import io

In [None]:
# Display the name of CUDA device 0 as the cell output.
torch.cuda.get_device_name(0)

# Transform Class

In [None]:
class BERTEmbeddingTransform(object):
    """Callable that turns one source string into a single pooled embedding.

    The string is tokenized, the token ids are cut into chunks of at most 510
    ids, every chunk is wrapped with the ids ``0`` and ``2`` and fed to the
    model; the selected hidden states of all chunks are concatenated along the
    token axis and averaged.

    NOTE(review): ids 0 / 2 and the chunk size 510 are hard-coded; they look
    like RoBERTa-style ``<s>`` / ``</s>`` ids and a 512-token limit — confirm
    against the tokenizer/model actually used.

    Args:
        bert_model: model object; it is switched to eval mode, moved to
            ``device`` and put into shared memory.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        device: device the model and the id tensors are placed on.
    """

    def __init__(self, bert_model, tokenizer, device='cpu'):
        bert_model.eval()
        bert_model = bert_model.to(device)
        bert_model.share_memory()
        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.device = device

    def __call__(self, sample):
        """Return the mean over all tokens of the model's selected hidden state.

        Args:
            sample: text to embed.

        Returns:
            The token-wise mean of ``outputs[1][-2]`` over every chunk.
        """
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(sample))
        # dtype is pinned: for an empty sample `torch.tensor([])` would default
        # to float32 and the ids could no longer be used as embedding indices.
        ids = torch.tensor(token_ids, dtype=torch.long, device=self.device)
        start_id = torch.tensor([0], device=self.device)
        end_id = torch.tensor([2], device=self.device)

        chunk_states = []
        with torch.no_grad():
            for chunk in torch.split(ids, 510):
                wrapped = torch.cat((start_id, chunk, end_id))
                outputs = self.bert_model(wrapped[None, :], output_hidden_states=True)
                # Second-to-last element of the model's second output
                # (presumably the penultimate hidden layer — TODO confirm).
                chunk_states.append(outputs[1][-2])
            return torch.cat(chunk_states, dim=1).squeeze().mean(dim=0)

# Dataset class

In [None]:
class GPU_Unpickler(pickle.Unpickler):
    """Unpickler that re-maps pickled torch storages onto the available device.

    ``torch.storage._load_from_bytes`` is replaced by a loader that calls
    ``torch.load`` with an explicit ``map_location``: ``'cuda'`` when CUDA is
    available, otherwise ``'cpu'`` so the same pickle can still be read on a
    CPU-only runtime. Every other global is resolved by the stock unpickler.

    SECURITY: unpickling executes code chosen by the file's author — only use
    this on pickle files you trust.
    """

    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
            return lambda b: torch.load(io.BytesIO(b), map_location=target_device)
        return super().find_class(module, name)


In [None]:
class CFDataset(Dataset):
    """Pairs precomputed embeddings with problem labels from a submissions table.

    The table is read with ``pd.read_orc``, gets the integer label columns
    ``problem_id`` (category code of ``problem_url``) and ``cf_tags_id``
    (category code of the sorted, comma-joined ``cf_tags``), loses its
    ``source_code`` column and is sorted by ``problem_url`` / ``submission_id``.
    Embeddings are read from the pickle files in ``file_names`` and matched to
    table rows by position (assumes the pickles are stored in that same sorted
    order — TODO confirm). Rows whose embedding is ``None`` are removed from
    both the table and the embedding list.

    Args:
        file_names: list of pickle file paths, concatenated in order.
        csv_dir: path of the ORC file (despite the name it is passed to
            ``pd.read_orc``).
        mean: if True, ``__getitem__`` mean-pools each sample to one
            768-dimensional vector.
        train: if True keep the rows after the split point, else the rows
            before it.
        test_split: fraction of rows (taken from the start) that forms the
            test part.
    """

    def __init__(self, file_names, csv_dir, mean=True, train=True, test_split=0.2):
        super(CFDataset, self).__init__()
        self.mean = mean
        self.file_names = file_names # a list of pickle file names
        self.submissions_df = pd.read_orc(csv_dir)
        print('submissions_df size:', len(self.submissions_df))

        # Integer label per problem.
        self.submissions_df['problem_url'] = self.submissions_df['problem_url'].astype("category")
        self.submissions_df['problem_id'] = self.submissions_df.problem_url.cat.codes

        # Integer label per distinct tag combination (missing tags -> "").
        self.submissions_df['cf_tags_cat'] = self.submissions_df['cf_tags'].apply(lambda x: ",".join(sorted(x)) if x is not None else np.nan)
        self.submissions_df['cf_tags_cat'] = self.submissions_df['cf_tags_cat'].fillna("")
        self.submissions_df['cf_tags_cat'] = self.submissions_df['cf_tags_cat'].astype("category")
        self.submissions_df['cf_tags_id'] = self.submissions_df.cf_tags_cat.cat.codes

        self.submissions_df.drop(columns=["source_code"], inplace=True)
        self.submissions_df = self.submissions_df.sort_values(by=["problem_url", "submission_id"])
        self.submissions_df = self.submissions_df.reset_index(drop=True)

        self.data = []
        for file_name in self.file_names:
            with open(file_name, "rb") as f:
                self.data.extend(GPU_Unpickler(f).load()) # load the data from the pickle file

        # Remove rows without an embedding. The positions are collected first:
        # deleting from `self.data` while enumerating it would skip consecutive
        # None entries and, once positions have shifted, drop the wrong rows.
        none_positions = [i for i, sample in enumerate(self.data) if sample is None]
        if none_positions:
            # After reset_index the index labels equal the row positions.
            self.submissions_df = self.submissions_df.drop(index=none_positions)
            self.data = [sample for sample in self.data if sample is not None]
        self.submissions_df = self.submissions_df.reset_index(drop=True)

        dataset_size = len(self.submissions_df)
        print('submissions_df size after None filter:', dataset_size)
        split = int(np.floor(test_split * dataset_size))
        if train:
            self.submissions_df = self.submissions_df[split:]
            self.data = self.data[split:]
        else:
            self.submissions_df = self.submissions_df[:split]
            self.data = self.data[:split]
        self.submissions_df = self.submissions_df.reset_index(drop=True)

    def mean_transform(self, sample):
        """Reshape ``sample`` to rows of 768 values and average over the rows."""
        return torch.reshape(sample, (-1, 768)).mean(dim=0)

    def __len__(self):
        return len(self.submissions_df)

    def __getitem__(self, idx):
        """Return ``(sample, problem_id)`` for row ``idx``."""
        #label = self.submissions_df['cf_tags_id'][idx]
        label = self.submissions_df['problem_id'][idx]
        sample = self.data[idx]
        if self.mean:
            sample = self.mean_transform(sample)
        return sample, label

# Model class

In [None]:
class MLP256(torch.nn.Module):
    """Projection head 768 -> 512 -> 256 (BatchNorm, GELU, dropout 0.2 in between)."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        hidden_stage = [
            nn.Linear(768, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(p=0.2),
        ]
        # Attribute name and layer order are kept so state_dict keys stay the same.
        self.mlp = nn.Sequential(*hidden_stage, nn.Linear(512, 256))

    def forward(self, x):
        """Apply the MLP to ``x`` and return its output."""
        return self.mlp(x)

In [None]:
class MLP128(torch.nn.Module):
    """Projection head 768 -> 256 -> 128 (BatchNorm, GELU, dropout 0.2 in between)."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        hidden_stage = [
            nn.Linear(768, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(p=0.2),
        ]
        # Attribute name and layer order are kept so state_dict keys stay the same.
        self.mlp = nn.Sequential(*hidden_stage, nn.Linear(256, 128))

    def forward(self, x):
        """Apply the MLP to ``x`` and return its output."""
        return self.mlp(x)

In [None]:
class MLP256_old(torch.nn.Module):
    """Four-layer MLP 768 -> 512 -> 512 -> 512 -> 256 with GELU and dropout 0.1."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        layers = [nn.Linear(768, 512)]
        # Each further linear layer is preceded by GELU + dropout; the resulting
        # Sequential indices (0, 3, 6, 9 for the Linear layers) are unchanged.
        for out_features in (512, 512, 256):
            layers.append(nn.GELU())
            layers.append(nn.Dropout(p=0.1))
            layers.append(nn.Linear(512, out_features))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x`` and return its output."""
        return self.mlp(x)

In [None]:
class BiLSTMVectorizer(torch.nn.Module):
    """Two-layer bidirectional LSTM followed by an MLP producing 256 values.

    Args:
        input_size: feature size of every sequence element.
        hidden_size: LSTM hidden size per direction; the MLP input width is
            ``2 * hidden_size`` because the two directions are concatenated.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # batch_first=True: the callers in this notebook pass tensors reshaped
        # to (1, -1, 768), i.e. (batch, sequence, features). Without it the LSTM
        # reads that as a length-1 sequence with one batch entry per token, so
        # no recurrence over the tokens ever happens.
        self.lstm = torch.nn.LSTM(
            bidirectional=True,
            num_layers=2,
            dropout=0.1,
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.mlp = torch.nn.Sequential(
            # Was a fixed 512, which only matched hidden_size == 256.
            torch.nn.Linear(2 * hidden_size, 256),
            torch.nn.GELU(),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(256, 256)
        )

    def forward(self, x):
        """Encode the first sequence of ``x`` and return a 256-element vector.

        Args:
            x: tensor laid out as (batch, sequence, input_size); only batch
               entry 0 is used.
        """
        output, _ = self.lstm(x)
        # Last time step of the first (only) sequence in the batch.
        last_step = output[0][-1]
        return self.mlp(last_step)

In [None]:
def BiLSTM_collate(batch):
    """Collate ``(embedding, label)`` pairs without stacking the embeddings.

    Args:
        batch: list of 2-tuples ``(emb, label)``.

    Returns:
        Tuple ``(embs, labels)``: the embeddings as a plain list in batch
        order, and the labels packed into a ``torch.Tensor``.
    """
    embs = [emb for emb, _ in batch]
    labels = [label for _, label in batch]
    return (embs, torch.Tensor(labels))

# Load data

In [None]:
# Batch layout for the M-per-class samplers built from these two values.
batch_size = 128
m_classes = 16
# Check the variables themselves (the literals 128 and 16 were checked before,
# so editing either value above was never validated).
assert batch_size % m_classes == 0, "batch_size must be a multiple of m_classes"

In [None]:
# Inputs: pickled embedding files and the ORC submissions table.
tensors_paths = ["/kaggle/input/cc-embs/embeddings_CodeBERTcpp.pkl"]
csv_path = "/kaggle/input/cc-embs/code_contests_cf_filtered_exploded_truncated.snappy.orc"

# Both parts are built from the same files with the default test_split;
# `train` selects which side of the split point is kept.
test_dt = CFDataset(file_names=tensors_paths, csv_dir=csv_path, mean=True, train=False)
train_dt = CFDataset(file_names=tensors_paths, csv_dir=csv_path, mean=True, train=True)


In [None]:
# One sampler per split, driven by the `problem_id` labels of that split.
# `m_classes` is passed as the sampler's second positional argument
# (presumably `m`, the number of samples drawn per class — TODO confirm).
train_sampler = samplers.MPerClassSampler(train_dt.submissions_df["problem_id"], m_classes, batch_size=batch_size)
test_sampler = samplers.MPerClassSampler(test_dt.submissions_df["problem_id"], m_classes, batch_size=batch_size)

In [None]:
# The samplers decide the sample order, so shuffling is switched off.
trainloader = DataLoader(train_dt, sampler=train_sampler, batch_size=batch_size, shuffle=False)
testloader = DataLoader(test_dt, sampler=test_sampler, batch_size=batch_size, shuffle=False)

In [None]:
# Val set up
# Problems that have more than 5 submissions in the test split.
ind_set = set(
    test_dt.submissions_df['problem_id']
    .value_counts()
    .loc[lambda counts: counts > 5]
    .index.values
)

# Index side: 5 distinct submissions for each of those problems.
index_val_df = (
    test_dt.submissions_df
    .loc[lambda df: df['problem_id'].isin(ind_set)]
    .groupby('problem_id')
    .sample(n=5, random_state=1, replace=False)
)

# Query side: 5 draws per problem from the submissions NOT used for the index.
# Sampling is with replacement because a problem may have fewer than 5 left.
query_val_df = (
    test_dt.submissions_df
    .loc[lambda df: df['problem_id'].isin(ind_set) & (~df.index.isin(index_val_df.index))]
    .groupby('problem_id')
    .sample(n=5, random_state=1, replace=True)
)

# Utility functions

In [None]:
# Baseline: restore previously saved MLP256_old weights, switch to eval mode
# and move the model to the GPU. This binds the notebook-level name `model`.
model = MLP256_old()
model.load_state_dict(torch.load("/kaggle/input/mlp-output/MLP256_last.pth"))
model.eval()
model.to('cuda')

In [None]:
def calc_rr(problem_id, query_embedding, faiss_index, index_val_df):
    """Reciprocal rank of the first search hit that has label ``problem_id``.

    Args:
        problem_id: label the query belongs to.
        query_embedding: query vector(s); passed to ``faiss.normalize_L2``
            (whose return value is not used) before searching.
        faiss_index: index searched for ``len(index_val_df)`` neighbours.
        index_val_df: frame whose ``problem_id`` column, addressed by position,
            gives the label of every indexed vector.

    Returns:
        ``1 / rank`` (1-based) of the first matching neighbour of the first
        query row, or ``0`` when no neighbour matches.
    """
    faiss.normalize_L2(query_embedding)

    search_result = faiss_index.search(query_embedding, k=len(index_val_df))
    neighbour_positions = search_result[1][0]
    ranked_problem_ids = index_val_df.problem_id.iloc[neighbour_positions]

    first_hit = next(
        (rank for rank, found_id in enumerate(ranked_problem_ids, start=1) if problem_id == found_id),
        None,
    )
    if first_hit is None:
        return 0
    return 1 / first_hit

In [None]:
def mrr(query_val_df, index_val_df, test_dt, faiss_index, model=None):
    """Mean reciprocal rank of the query rows against ``faiss_index``.

    Args:
        query_val_df: frame of queries; its index values address
            ``test_dt.data`` and its ``problem_id`` column holds the labels.
        index_val_df: label frame forwarded to ``calc_rr``.
        test_dt: object whose ``data`` sequence holds the input embeddings.
        faiss_index: index forwarded to ``calc_rr``.
        model: callable that embeds a ``(-1, 768)`` tensor. Optional for
            backward compatibility: when omitted, the notebook-level ``model``
            is used, which is what this function always did before. Pass it
            explicitly to avoid depending on that global.

    Returns:
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises:
        NameError: ``model`` was not passed and no global ``model`` exists.
    """
    if model is None:
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    sum_rr = 0
    with torch.no_grad():
        for idx, row in tqdm(query_val_df.iterrows(), total=len(query_val_df), desc='MRR calculating'):
            sum_rr += calc_rr(
                row.problem_id,
                model(test_dt.data[idx].reshape(-1,768)).cpu().numpy(),
                faiss_index,
                index_val_df
            )

    return sum_rr / len(query_val_df)


In [None]:
def get_val_mrr_by_problems(model, index_val_df, query_val_df, test_dt, shape=256):
    """Validation MRR where every problem is indexed by one mean embedding.

    Args:
        model: callable used to embed the index submissions.
        index_val_df: submissions forming the index; its index values address
            ``test_dt.data``.
        query_val_df: submissions used as queries (forwarded to ``mrr``).
        test_dt: object whose ``data`` sequence holds the input embeddings.
        shape: dimensionality given to ``faiss.IndexFlatIP``.

    Returns:
        The value returned by ``mrr`` for the per-problem index.
    """
    # Embed every index submission.
    submission_vectors = []
    with torch.no_grad():
        for idx in tqdm(index_val_df.index, desc='Index vectorization'):
            submission_vectors.append(model(test_dt.data[idx].reshape(-1,768)).cpu().numpy())
    vectors_for_faiss = np.vstack(submission_vectors)
    faiss.normalize_L2(vectors_for_faiss)

    # Attach the vectors to their rows and average them per problem.
    embs_column = pd.Series(list(vectors_for_faiss), name='embs', index=index_val_df.index)
    index_val_df = pd.concat([index_val_df, embs_column], axis=1)

    def _mean_vector(group):
        return np.vstack(group).mean(axis=0)

    problem_embs_df = (
        index_val_df
        .groupby('problem_id')
        .agg(mean_embs=('embs', _mean_vector))
        .reset_index()
    )

    # Inner-product index over the per-problem vectors.
    problem_vectors_for_faiss = np.vstack(problem_embs_df.mean_embs.values)
    faiss.normalize_L2(problem_vectors_for_faiss)
    problem_index = faiss.IndexFlatIP(shape)
    problem_index.add(problem_vectors_for_faiss)

    # `mrr` is called without a model argument, exactly as before.
    return mrr(query_val_df, problem_embs_df, test_dt, problem_index)

In [None]:
def get_val_mrr_by_submissions(model, index_val_df, query_val_df, test_dt, shape=256):
    """Validation MRR where every index submission is its own index entry.

    Args:
        model: callable used to embed the index submissions.
        index_val_df: submissions forming the index; its index values address
            ``test_dt.data``.
        query_val_df: submissions used as queries (forwarded to ``mrr``).
        test_dt: object whose ``data`` sequence holds the input embeddings.
        shape: dimensionality given to ``faiss.IndexFlatIP``.

    Returns:
        The value returned by ``mrr`` for the per-submission index.
    """
    # Embed every index submission.
    submission_vectors = []
    with torch.no_grad():
        for idx in tqdm(index_val_df.index, desc='Index vectorization'):
            submission_vectors.append(model(test_dt.data[idx].reshape(-1,768)).cpu().numpy())
    vectors_for_faiss = np.vstack(submission_vectors)
    faiss.normalize_L2(vectors_for_faiss)

    # Keep the vectors next to their rows in the frame handed to `mrr`.
    embs_column = pd.Series(list(vectors_for_faiss), name='embs', index=index_val_df.index)
    index_val_df = pd.concat([index_val_df, embs_column], axis=1)

    # Inner-product index over the per-submission vectors.
    faiss_index = faiss.IndexFlatIP(shape)
    faiss_index.add(vectors_for_faiss)

    # `mrr` is called without a model argument, exactly as before.
    return mrr(query_val_df, index_val_df, test_dt, faiss_index)

In [None]:
# Validation MRR of the currently loaded `model` against per-problem mean embeddings.
get_val_mrr_by_problems(model, index_val_df, query_val_df, test_dt )

In [None]:
# Validation MRR of the currently loaded `model` against individual submissions.
get_val_mrr_by_submissions(model, index_val_df, query_val_df, test_dt )

# Set up training

In [None]:
# Fresh model for training; this rebinds the notebook-level `model` that was
# loaded earlier. The commented lines are the alternative architectures.
model = MLP256()
#model = BiLSTMVectorizer(768, 256)
#model = MLP128()
if torch.cuda.is_available():
    model.to("cuda")

In [None]:
# Training hyperparameters.
n_epochs = 100
lr = 1e-3

# Loss, pair miner, optimizer and a plateau scheduler (it is stepped with the
# validation loss in the training loop).
loss_func = losses.MultiSimilarityLoss()
miner = miners.MultiSimilarityMiner( epsilon=0.05)
optimizer = Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.1, min_lr=1e-5, verbose=True)

In [None]:
# Counters advanced by the training loop; re-running this cell resets them.
global_iter = 0
global_epoch = 0

## Logger

In [None]:
# Log in to your W&B account
# NOTE(review): this import sits outside the notebook's main import cell.
import wandb

# Use wandb-core
wandb.require("core")
wandb.login()

In [None]:
# Start the W&B run. The first four config entries come from notebook
# variables; the remaining ones are literals written by hand, so they must be
# kept in sync manually with the loss, miner and model actually in use.
wandb.init(
    project="AIST-CodeContest",
    config={
        "epochs": n_epochs,
        "batch_size": batch_size,
        'm_classes_per_batch': m_classes,
        "lr": lr,
        'loss_func': 'MultiSimilarityLoss',
        #'loss_margin': 0.2,
        'comment': 'learning by problems',
        'miner_eps': '0.05',
        "dropout": 0.2,
        "num_layers": 2,
        "shape": '512->256'
        })

# Train

In [None]:

def test_model(epoch, model, testloader=testloader):
    """Compute the mean validation loss and the per-problem validation MRR.

    Uses the notebook-level ``miner``, ``loss_func``, ``index_val_df``,
    ``query_val_df`` and ``test_dt``. The model's train/eval mode is not
    changed here; the caller is expected to set it.

    Args:
        epoch: accepted for call compatibility; not used.
        model: model to evaluate.
        testloader: iterable of ``(data, labels)`` batches. The default is
            bound to the ``testloader`` that exists when this cell is run.

    Returns:
        Tuple ``(mean_loss, MRR)``; ``mean_loss`` is the sum of the batch
        losses divided by the number of batches.
    """
    bilstm_mode = isinstance(model, BiLSTMVectorizer)
    sum_loss = 0
    num_batches = 0
    # Evaluation needs no gradients; without this guard a standalone call
    # would keep the autograd graph of every batch alive through `sum_loss`.
    with torch.no_grad():
        for data, labels in tqdm(testloader):
            if bilstm_mode:
                # Samples are encoded one at a time and stacked afterwards.
                embeddings = []
                for i in data:
                    embeddings.append(model(i.reshape(1, -1, 768)).squeeze())
                embeddings = torch.stack(embeddings,dim=0)
            else:
                embeddings = model(data)
            hard_pairs = miner(embeddings, labels)
            loss = loss_func(embeddings, labels, hard_pairs)
            sum_loss += loss
            num_batches += 1

    sum_loss /= num_batches

    MRR = get_val_mrr_by_problems(model, index_val_df, query_val_df, test_dt, shape=256)

    return sum_loss, MRR

In [None]:
# Main training loop: one pass over `trainloader` per epoch, followed by a
# validation pass, a scheduler step on the validation loss and W&B logging.
bilstm_mode = isinstance(model, BiLSTMVectorizer)
for epoch in range(0, n_epochs):
    epoch_iter = tqdm(trainloader, mininterval=0.5)
    model.train()
    running_loss = 0
    num_iters = 0
    for data, labels in epoch_iter:
        optimizer.zero_grad()
        if bilstm_mode:
            # Samples are encoded one at a time and stacked afterwards.
            embeddings = []
            for i in data:
                embeddings.append(model(i.reshape(1, -1, 768)).squeeze())
            embeddings = torch.stack(embeddings,dim=0)
        else:
            embeddings = model(data)
            
        # Mine pairs inside the batch, then compute the loss on them.
        hard_pairs = miner(embeddings, labels)
        loss = loss_func(embeddings, labels, hard_pairs)
        
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_iters += 1 
        #epoch_iter.set_description("Epoch: %04d, Iter Loss: %.4f"  %(epoch, loss))
        #writer.add_scalar('Loss/train', loss , global_iter)
        
        
        #if global_iter != (len(epoch_iter) - 1):
        #    wandb.log(metrics)
        global_iter += 1

    # Validation at the end of every epoch.
    with torch.no_grad():
        model.eval()
        v_loss, MRR = test_model(global_epoch, model, testloader=testloader)
        # ReduceLROnPlateau is driven by the validation loss.
        scheduler.step(v_loss)
        metrics = {"train/train_loss": running_loss/num_iters,
                   "train/epoch": epoch
                  }
        
        val_metrics = {"val/val_loss": v_loss,
                       "val/val_MRR": MRR}
    
    wandb.log({**metrics, **val_metrics})
    global_epoch += 1
    
    print("V_LOSS: ", v_loss,"\nV_MRR: ", MRR,"\n==== EPOCH #", epoch, '=======\n')

In [None]:
# Save the trained weights under a file name derived from the W&B run name.
# NOTE(review): the "MLP256_" prefix is fixed regardless of the model class.
torch.save(model.state_dict(), f'MLP256_{wandb.run.name}.pth')

#wandb save
# Attach the same file to the run as a model artifact.
artifact = wandb.Artifact('model', type='model')
artifact.add_file(f'MLP256_{wandb.run.name}.pth')
wandb.log_artifact(artifact)

In [None]:
# Close the W&B run.
wandb.finish()

# Visualization

In [None]:
# Embed every index submission with the current model (eval mode, no gradients).
model.eval()
with torch.no_grad():
    vectors_for_vis = np.vstack([model(test_dt.data[idx].reshape(-1,768)).cpu().numpy() for idx in tqdm(index_val_df.index, desc='Index vectorization')])

# Attach the vectors to their rows as an `embs` column.
index_vis_df = pd.concat(
    [index_val_df, pd.Series(list(vectors_for_vis), name='embs', index=index_val_df.index)],
    axis=1
)

# One mean vector per problem.
problem_embs_df = index_vis_df.groupby('problem_id').agg(
        mean_embs=('embs', lambda x: np.vstack(x).mean(axis=0))
    ).reset_index()

# Matrix of the per-problem vectors, in the row order of problem_embs_df.
problem_vectors_for_vis = np.vstack(problem_embs_df.mean_embs.values)

# Add per-problem metadata (first row per problem_id) for plot hover and colouring.
problem_embs_df = pd.concat(
    [
        problem_embs_df.set_index('problem_id'),
        index_vis_df[['problem_url', 'problem_id', 'cf_tags', 'cf_rating']].drop_duplicates(subset='problem_id').set_index('problem_id')
    ],
    axis=1
).reset_index()

In [None]:
# Project the per-problem vectors to 2-D. The seed makes the layout
# reproducible between runs; 1 matches the random_state used for the
# validation sampling elsewhere in this notebook.
tsne = TSNE(n_components=2, random_state=1)
tsne_result = tsne.fit_transform(problem_vectors_for_vis)
tsne_result.shape

In [None]:
# Store the two t-SNE coordinates as plot columns.
problem_embs_df['X'] = tsne_result[:,0]
problem_embs_df['Y'] = tsne_result[:,1]

In [None]:
def feature_extract(x):
    """Map a row's ``cf_tags`` to a coarse topic label.

    The tags are converted with ``str`` and matched by substring. Precedence:
    "graphs" (graph / tree / dfs), then "math-and-string", "string", "math"
    (math / proba / geometry / number / combina), otherwise "else".

    Args:
        x: mapping-like row providing ``x['cf_tags']``.

    Returns:
        One of "graphs", "math-and-string", "string", "math", "else".
    """
    tags_text = str(x['cf_tags'])

    def mentions(*needles):
        return any(needle in tags_text for needle in needles)

    if mentions("graph", "tree", "dfs"):
        return "graphs"

    is_math = mentions("math", "proba", "geometry", "number", "combina")
    is_string = "string" in tags_text

    if is_math and is_string:
        return "math-and-string"
    if is_string:
        return "string"
    if is_math:
        return "math"
    return "else"
# Row-wise topic label used to colour the scatter plot.
problem_embs_df['feature'] = problem_embs_df.apply(feature_extract, axis=1)

In [None]:
# t-SNE scatter of the problems, coloured by the coarse topic label.
fig = px.scatter(problem_embs_df, x="X", y="Y",
              color='feature',hover_name='cf_tags', hover_data=["cf_rating", "problem_url"])
fig.show()

In [None]:
# Same scatter, coloured by the `cf_rating` column.
fig = px.scatter(problem_embs_df, x="X", y="Y",
              color='cf_rating',hover_name='cf_tags', hover_data=["cf_rating", "problem_url"])
fig.show()

# Make Embeddings

In [None]:
# Reload a saved MLP256 checkpoint for producing embeddings; this rebinds the
# notebook-level `model`.
# NOTE(review): the file name contains a specific run name — update it to the
# checkpoint you actually want to load.
model = MLP256()
model.load_state_dict(torch.load("/kaggle/working/MLP256_efficient-smoke-22.pth"))
model.to('cuda')
model.eval()

In [None]:
# Validation MRR of the reloaded checkpoint against individual submissions.
get_val_mrr_by_submissions(model, index_val_df, query_val_df, test_dt )

In [None]:
#train_dt = CPPSubmissionsDataset("/kaggle/input/filter-cpp-submission-dataset/filtered.csv", bert_transform, train=True, test_split=0.2)
#test_dt = CPPSubmissionsDataset("/kaggle/input/filter-cpp-submission-dataset/filtered.csv", bert_transform, train=False, test_split=0.2)
# Pickled embedding files (same list as used for the train/test datasets).
tensors_paths = [
    "/kaggle/input/cc-embs/embeddings_CodeBERTcpp.pkl",
]
# NOTE: despite its name, `tensors_path` is the ORC submissions table; it is
# passed as the dataset's `csv_dir` argument.
tensors_path = "/kaggle/input/cc-embs/code_contests_cf_filtered_exploded_truncated.snappy.orc"
# train=False with test_split=1 puts the split point at the end, so the
# "test" part is the whole dataset.
dt_for_vectorization = CFDataset(tensors_paths, tensors_path, mean=True, train=False, test_split=1)

In [None]:
# Run every stored embedding through the model, one sample at a time, and keep
# the results on the CPU.
mlp_embeddings = []
with torch.no_grad():
    for bert_emb in tqdm(dt_for_vectorization.data):
        mlp_embeddings.append(model(bert_emb.reshape(-1, 768)).cpu())

# Persist the list of output tensors as a single pickle file.
with open("embeddings_MLP.pkl", 'wb') as f:
    pickle.dump(mlp_embeddings, f)