In [25]:
from autogluon.tabular import TabularDataset,TabularPredictor

import numpy as np
import pandas as pd
import os
import torch

# Root folder of the dataset: the loaders below read the CSV files directly from
# here and the embeddings from its `<embed_type>_embed/` sub-folders.
DATA_PATH = '../dataset/dataset2.0/'

def load_esm_embed(csv_file,embed_type,emb_layer=33):
    """Load per-sequence ESM embeddings and labels for one dataset CSV.

    The CSV at ``DATA_PATH + csv_file`` must contain an ``id`` and a ``label``
    column. For every row the file
    ``<DATA_PATH><embed_type>_embed/<csv name up to the first '.'>/<id>.pt``
    is loaded and its ``['mean_representations'][emb_layer]`` tensor is used as
    the feature vector of that row.

    Args:
        csv_file: File name of the dataset CSV, relative to ``DATA_PATH``.
        embed_type: Embedding name; selects the ``<embed_type>_embed/`` folder.
        emb_layer: Key into ``mean_representations`` (defaults to 33, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(Xs, ys)``: ``Xs`` is a numpy array with one row per CSV row,
        ``ys`` is a list with the matching labels.

    Raises:
        FileNotFoundError: If the CSV or one of the ``.pt`` files is missing.
        KeyError: If a required column or the requested layer is absent.
    """
    embed_dir = DATA_PATH+embed_type+'_embed/'+csv_file.split('.')[0]
    data_df = pd.read_csv(DATA_PATH+csv_file)

    # Read the two columns directly instead of DataFrame.iterrows(): iterrows
    # coerces every row to a single dtype, which can turn an integer id into a
    # float and produce a wrong file name such as "12.0.pt".
    ys = data_df['label'].tolist()
    Xs = []
    for seq_id in data_df['id']:
        # NOTE(review): torch.load unpickles the file - only point this at
        # trusted embedding files.
        # map_location='cpu' keeps the later .numpy() call valid even when the
        # tensors were saved from a GPU.
        embs = torch.load(f'{embed_dir}/{seq_id}.pt', map_location='cpu')
        Xs.append(embs['mean_representations'][emb_layer])

    Xs = torch.stack(Xs, dim=0).numpy()
    print('load {} esm embedding'.format(csv_file))
    print(len(ys))
    print(Xs.shape)
    return Xs,ys

def load_embed(csv_file,embed_type):
    """Load a pre-computed embedding matrix and the labels of one dataset CSV.

    The features come from
    ``<DATA_PATH><embed_type>_embed/<csv name up to the first '.'>_embeds.npy``
    and the labels from the ``label`` column of ``DATA_PATH + csv_file``.

    Returns:
        Tuple ``(features, labels)``: the array returned by ``np.load`` and the
        ``label`` column as a pandas Series.
    """
    embed_dir = DATA_PATH + embed_type + '_embed/'
    npy_file = embed_dir + csv_file.split('.')[0] + '_embeds.npy'

    # The CSV is read before the .npy file, so a missing CSV is reported first.
    labels = pd.read_csv(DATA_PATH + csv_file)['label']
    features = np.load(npy_file)

    print(f'load {csv_file} {embed_type}_embed embedding from {npy_file}')
    print(len(labels))
    print(features.shape)
    return features, labels

def embedding(csv_file,embed= None):
    """Return ``(Xs, ys)`` for ``csv_file`` using the loader that matches ``embed``.

    ``'ESM'`` is served by ``load_esm_embed``; every other value is passed to
    ``load_embed``.
    """
    loader = load_esm_embed if embed == 'ESM' else load_embed
    features, labels = loader(csv_file, embed)
    return features, labels

def Create_dataset(data,embed=None):
    """Build a DataFrame of embedding features plus a trailing ``label`` column.

    ``data`` is the CSV file name and ``embed`` the embedding type; both are
    forwarded unchanged to ``embedding``.
    """
    features, labels = embedding(data, embed)
    # One column per embedding dimension, then the labels as the last column.
    return pd.DataFrame(features).assign(label=list(labels))


In [26]:
import os

# Folder that holds the trained predictors, one `AutoML_<embed>_Oversampling/`
# sub-folder per embedding type.
AutoML_result = './AutoML_result_0608/'
# Folder that receives the leaderboard CSV files written by Test().
test_result = './Test3_result/leaderboard_data-07-12-4/'

# exist_ok=True replaces the check-then-create pattern: re-running the cell is
# a no-op and there is no window between the existence check and the creation.
os.makedirs(test_result, exist_ok=True)
def Test(d ="test1"):
    """Score the saved predictor of every embedding type on one test CSV.

    For each embedding type the test set is rebuilt with ``Create_dataset``,
    the predictor stored under ``<AutoML_result>AutoML_<embed>_Oversampling/``
    is loaded, and its leaderboard on the test set (with the extra binary
    metrics listed below) is written to
    ``<test_result>/<csv name up to the first '.'>_<embed>.csv``.

    Args:
        d: ``'test1'`` selects ``test_data1_2023-6-9_15_31.csv``; any other
            value selects ``test_data2_2023-6-9_15_31.csv``. The default used
            to be ``" test1"`` with a leading space, which never matched
            ``'test1'`` and therefore silently evaluated the second file.

    Returns:
        None. The results are the CSV files written to ``test_result``.
    """
    embed_types = ['ESM','ProtBert_bfd','ProtBert','T5','UniRep'] #'ESM2_15b'

    binary_metric = ['accuracy',
                     'mcc',
                     'roc_auc',
                     'average_precision',
                     'precision',
                     'recall',
                     'log_loss'
                    ]

    # The test file does not depend on the embedding type, so pick it once.
    if d == 'test1':
        data = 'test_data1_2023-6-9_15_31.csv'
    else:
        data = 'test_data2_2023-6-9_15_31.csv'
    data_name = data.split('.')[0]

    # Make sure the output folder exists even if the setup cell was not run.
    os.makedirs(test_result, exist_ok=True)

    for embed in embed_types:
        print(embed)
        test_df = Create_dataset(data,embed)
        predicter = TabularPredictor.load('{}AutoML_{}_Oversampling/'.format(AutoML_result,embed))
        # The leaderboard already reports the extra metrics for every model;
        # the former separate evaluate() call only filled a variable that was
        # never used, so it has been dropped.
        leaderboard = predicter.leaderboard(test_df, extra_metrics = binary_metric,silent=True)
        leaderboard.to_csv(os.path.join(test_result, '{}_{}.csv'.format(data_name,embed)))
            

In [28]:
# Run the evaluation on the test_data1 CSV; one leaderboard CSV per embedding
# type is written to `test_result`.
Test(d = 'test1')

ESM
load test_data1_2023-6-9_15_31.csv esm embedding
226
(226, 1280)
ProtBert_bfd
load test_data1_2023-6-9_15_31.csv ProtBert_bfd_embed embedding from ../dataset/dataset2.0/ProtBert_bfd_embed/test_data1_2023-6-9_15_31_embeds.npy
226
(226, 1024)
ProtBert
load test_data1_2023-6-9_15_31.csv ProtBert_embed embedding from ../dataset/dataset2.0/ProtBert_embed/test_data1_2023-6-9_15_31_embeds.npy
226
(226, 1024)
T5
load test_data1_2023-6-9_15_31.csv T5_embed embedding from ../dataset/dataset2.0/T5_embed/test_data1_2023-6-9_15_31_embeds.npy
226
(226, 1024)
UniRep
load test_data1_2023-6-9_15_31.csv UniRep_embed embedding from ../dataset/dataset2.0/UniRep_embed/test_data1_2023-6-9_15_31_embeds.npy
226
(226, 1900)
