# Load checkpoint and imports

In [1]:
import sys
import warnings
warnings.filterwarnings('ignore')
# !{sys.executable} -m pip install seaborn

In [2]:
# Append the MoLFormer project checkout to sys.path (the `utils` and `constants`
# modules imported in a later cell are presumably resolved from here — confirm).
parent_dir = "/Midgard/home/farzantn/phd/Olfaction/MoLFormer_N2024"
sys.path.append(parent_dir)
# NOTE(review): `parent_dir` is reused for a second, unrelated location: the
# python3.8 lib directory of a conda environment. Both paths are absolute and
# machine-specific, so the notebook will not run unchanged elsewhere.
parent_dir="/Midgard/home/farzantn/mambaforge/envs/MolTran_CUDA11_cuda/lib/python3.8"
sys.path.append(parent_dir)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate,train_test_split
import ast
from sklearn.metrics import roc_auc_score, mean_squared_error
import scipy
import os
from sklearn.decomposition import PCA
base_path = '/local_storage/datasets/farzaneh/alignment_olfaction_datasets/'
from sklearn.preprocessing import StandardScaler
from utils.util_alignment import set_seeds
from utils.prepare_datasets import prepare_dataset,select_features
from utils.helper_methods import custom_linear_regression,pipeline_regression
from constants import *

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/Midgard/home/farzantn/mambaforge/envs/Mol/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [4]:
# Base random seed. Besides being passed to set_seeds, this module-level name is
# read directly inside train_and_eval and min_max_extraction, where split i uses
# random_state=seed+i.
seed= 2024
set_seeds(seed)

In [5]:
# Number of repeated random train/test splits evaluated per representation.
times=30
# Forwarded to pipeline_regression via train_and_eval
# (presumably the number of PCA components — TODO confirm in utils.helper_methods).
n_components=20

# Helper Methods

In [6]:
def literal_eval_list(list_string):
    """Parse a pandas object holding Python-literal strings into a NumPy array.

    Each element of ``list_string`` is evaluated with ``ast.literal_eval`` and
    the parsed values are stacked with ``np.asarray``.
    """
    parsed_values = [ast.literal_eval(raw) for raw in list_string.values.tolist()]
    return np.asarray(parsed_values)


In [7]:
def grand_average(df,ds):
    """Average the descriptor ratings over all rows of each CID and attach embeddings.

    Parameters
    ----------
    df : DataFrame with a 'CID' column, an 'embeddings' column and the
        descriptor columns of the chosen dataset.
    ds : one of "keller", "sagar" or "dravinsk"; selects the descriptor list.

    Returns
    -------
    DataFrame with one row per CID: the mean of every descriptor column, a 'y'
    column holding those means as a list, and the 'embeddings' of the first
    row seen for that CID.

    Raises
    ------
    ValueError if ``ds`` is not a supported dataset name.
    """
    if ds == "keller":
        descriptors = keller_descriptors
    elif ds == "sagar":
        descriptors = sagar_descriptors
    elif ds == "dravinsk":
        descriptors = dravinsk_descriptors
    else:
        raise ValueError("Invalid dataset")

    # Mean rating per molecule.
    averaged = df.groupby('CID')[descriptors].mean().reset_index()

    # Pack the descriptor means into one list-valued target column. The slice
    # starts at the column labelled '0.1' — presumably the first descriptor
    # column of every dataset; TODO confirm against constants.
    averaged['y'] = averaged.loc[:, '0.1':descriptors[-1]].values.tolist()

    # One embedding per molecule, taken from its first row.
    embeddings_per_cid = df.drop_duplicates(subset=['CID'])[['CID', 'embeddings']]
    return pd.merge(averaged, embeddings_per_cid, on='CID', how='left')

In [8]:
def average_over_subject(df,ds):
    """Average the descriptor ratings per (CID, subject) pair and attach embeddings.

    Parameters
    ----------
    df : DataFrame with 'CID', 'subject' and 'embeddings' columns plus the
        descriptor columns of the chosen dataset.
    ds : one of "keller", "sagar" or "dravinsk"; selects the descriptor list.

    Returns
    -------
    DataFrame with one row per (CID, subject): the descriptor means, a 'y'
    column holding those means as a list, and the 'embeddings' of the first
    row seen for that CID (the same embedding is repeated for every subject).

    Raises
    ------
    ValueError if ``ds`` is not a supported dataset name.
    """
    if ds == "keller":
        descriptors = keller_descriptors
    elif ds == "sagar":
        descriptors = sagar_descriptors
    elif ds == "dravinsk":
        descriptors = dravinsk_descriptors
    else:
        raise ValueError("Invalid dataset")

    # Mean rating per molecule and subject.
    per_subject = df.groupby(['CID', 'subject'])[descriptors].mean().reset_index()

    # List-valued target column; the slice starts at the column labelled '0.1'
    # (presumably the first descriptor column — TODO confirm).
    per_subject['y'] = per_subject.loc[:, '0.1':descriptors[-1]].values.tolist()

    # Embeddings are keyed by CID only, so the left merge repeats them per subject.
    embeddings_per_cid = df.drop_duplicates(subset=['CID'])[['CID', 'embeddings']]
    return pd.merge(per_subject, embeddings_per_cid, on='CID', how='left')

In [9]:
def metrics_per_descritor(X,y,linreg):
    """Score a fitted model on (X, y), one metric pair per target column.

    Parameters
    ----------
    X : features passed to ``linreg.predict``.
    y : true targets; either 1-D or 2-D with one column per descriptor.
    linreg : fitted estimator exposing ``predict``.

    Returns
    -------
    (predicted, mseerrors, correlations) where ``mseerrors`` is a list of
    mean squared errors and ``correlations`` a list of ``scipy.stats.pearsonr``
    results, one entry per column of ``y`` (a single entry when ``y`` is 1-D).
    """
    predicted = linreg.predict(X)

    # Build the (prediction, truth) pairs to score: one per column, or the
    # whole vectors when the target is 1-D.
    if len(y.shape) > 1:
        column_pairs = [(predicted[:, col], y[:, col]) for col in range(y.shape[1])]
    else:
        column_pairs = [(predicted, y)]

    mseerrors = []
    correlations = []
    for predicted_col, true_col in column_pairs:
        mseerrors.append(mean_squared_error(predicted_col, true_col))
        correlations.append(scipy.stats.pearsonr(predicted_col, true_col))

    return predicted, mseerrors, correlations
        
    
    

In [10]:
def post_process_results_df(mserrorrs_corssvalidated,correlations_corssvalidated):
    """Convert the per-run metric lists into arrays and split the correlations.

    The MSE input is converted to an array; when that array is 3-D its last
    (length-1) axis is dropped and the first two axes are swapped. The
    correlation input is converted likewise; when it is 4-D its first two
    axes are swapped and axis 2 (length 1) is dropped. The last axis of the
    correlation array is then split into index 0 (statistic) and index 1
    (p-value), matching the pair returned by ``scipy.stats.pearsonr``.

    Returns
    -------
    (mse_array, statistics_array, pvalues_array)
    """
    mse_array = np.asarray(mserrorrs_corssvalidated)
    if mse_array.ndim == 3:
        mse_array = np.swapaxes(np.squeeze(mse_array, axis=-1), 0, 1)

    corr_array = np.asarray(correlations_corssvalidated)
    if corr_array.ndim == 4:
        corr_array = np.squeeze(np.swapaxes(corr_array, 0, 1), axis=2)

    statistics_array = corr_array[:, :, 0]
    pvalues_array = corr_array[:, :, 1]
    return mse_array, statistics_array, pvalues_array
    

In [11]:
def train_and_eval(data_groupbyCID,times,n_components=None,y_i=None):
    """Fit and score the regression pipeline on ``times`` random 80/20 splits.

    Parameters
    ----------
    data_groupbyCID : DataFrame with 'embeddings', 'y' and 'CID' columns.
    times : number of random splits; split ``i`` uses ``random_state=seed+i``
        where ``seed`` is the notebook-level global.
    n_components : forwarded to ``pipeline_regression``.
    y_i : optional column index; when given only that target column is used
        (reshaped to a single-column 2-D array).

    Returns
    -------
    (CIDs, predicteds, y_tests, runs, mserrorrs_corssvalidated,
    correlations_corssvalidated): the first four are flat lists with one entry
    per test sample accumulated over all runs, the last two hold one entry
    per run as produced by ``metrics_per_descritor``.
    """
    features = np.asarray(data_groupbyCID.embeddings.values.tolist())
    targets = np.asarray(data_groupbyCID.y.values.tolist())
    if y_i is not None:
        targets = targets[:, y_i].reshape(-1, 1)

    CIDs, predicteds, y_tests, runs = [], [], [], []
    mserrorrs_corssvalidated, correlations_corssvalidated = [], []

    for run in range(times):
        X_train, X_test, y_train, y_test, _, CID_test = train_test_split(
            features, targets, data_groupbyCID.CID, test_size=0.2, random_state=seed+run)

        # pipeline_regression returns the fitted model together with the
        # test features to score it on (it may have transformed them).
        linreg, X_test = pipeline_regression(
            X_train, y_train, X_test, custom_linear_regression, seed, n_components=n_components)
        predicted, mseerrors, correlations = metrics_per_descritor(X_test, y_test, linreg)

        mserrorrs_corssvalidated.append(mseerrors)
        correlations_corssvalidated.append(correlations)
        predicteds.extend(predicted)
        y_tests.extend(y_test)
        runs.extend([run] * len(y_test))
        CIDs.extend(CID_test)

    return CIDs, predicteds, y_tests, runs, mserrorrs_corssvalidated, correlations_corssvalidated

In [12]:
def min_max_extraction(data_groupbyCID,times,y_i=None):
    """Collect the per-column min and max of the test targets for each random split.

    The splits use the same ``test_size`` and ``random_state=seed+i`` as
    ``train_and_eval`` (``seed`` is the notebook-level global).

    Parameters
    ----------
    data_groupbyCID : DataFrame with 'embeddings' and 'y' columns.
    times : number of random splits to inspect.
    y_i : optional column index; when given only that target column is used.

    Returns
    -------
    DataFrame indexed by ('Dataset', 'Type'), where 'Dataset' is the split
    number and 'Type' is 'Min' or 'Max'; one column per target column.
    """
    X = np.asarray(data_groupbyCID.embeddings.values.tolist())
    y = np.asarray(data_groupbyCID.y.values.tolist())
    if y_i is not None:
        y = y[:, y_i].reshape(-1, 1)

    min_max_dfs = []
    for i in range(times):
        # Only the test targets are needed; X is still passed so the call
        # mirrors the split made during training.
        _, _, _, y_test = train_test_split(X, y, test_size=0.2, random_state=seed+i)
        df = pd.DataFrame(y_test)

        # One labelled row for the column minima and one for the maxima.
        min_df = pd.DataFrame(df.min()).T
        min_df['Type'] = 'Min'

        max_df = pd.DataFrame(df.max()).T
        max_df['Type'] = 'Max'

        min_max_df = pd.concat([min_df, max_df])
        min_max_df['Dataset'] = i
        min_max_dfs.append(min_max_df)

    final_df = pd.concat(min_max_dfs)
    final_df = final_df.set_index(['Dataset', 'Type'])
    # The original built this frame but never returned it, so callers got None.
    return final_df
        


In [13]:
def pipeline(model_name,input_file,input_file_alva=None,times=30,n_components=None,ds="keller",count=False):
    """Load one representation, evaluate it over repeated splits, and tabulate the results.

    Parameters
    ----------
    model_name : label written to the 'model' column of every returned frame.
    input_file : CSV with embeddings and ratings, read with ``pd.read_csv``.
    input_file_alva : optional path handed to ``select_features``; when given,
        the features it returns replace the 'embeddings' column (joined on 'CID').
    times : number of random train/test splits.
    n_components : forwarded to ``train_and_eval``.
    ds : dataset name forwarded to ``grand_average``.
    count : when True, skip training and return ``min_max_extraction``'s result.

    Returns
    -------
    ``(df_predictions, df_df_mse, df_df_cor)``, or the result of
    ``min_max_extraction`` when ``count`` is True.
    """
    df = pd.read_csv(input_file)
    # NOTE(review): presumably parses the serialized 'embeddings' and 'y'
    # columns — confirm in utils.prepare_datasets.
    df = prepare_dataset(df,'embeddings','y')
    df_groupbyCID = grand_average(df,ds)
    # The per-subject average the original also computed here was never used,
    # so it is no longer calculated.

    if input_file_alva is not None:
        df_alva = select_features(input_file_alva)
        df_alva = df_alva.drop_duplicates(subset=['CID'])
        # Drop the embeddings loaded from input_file before joining the Alva table.
        del df_groupbyCID['embeddings']
        df_groupbyCID = pd.merge(df_alva,df_groupbyCID,on="CID")

    if count:
        return min_max_extraction(df_groupbyCID,times)

    CIDs, predicteds, y_tests, runs, mserrorrs_df_corssvalidated, correlations_df_corssvalidated = train_and_eval(
        df_groupbyCID,times=times,n_components=n_components)

    mserrorrs_corssvalidated_df, statistics_correlations_corssvalidated_df, pvalues_correlations_corssvalidated_df = post_process_results_df(
        mserrorrs_df_corssvalidated, correlations_df_corssvalidated)

    df_df_mse = pd.DataFrame(mserrorrs_corssvalidated_df)
    df_df_mse['model'] = model_name
    df_df_cor = pd.DataFrame(statistics_correlations_corssvalidated_df)
    df_df_cor['model'] = model_name

    cid_array = np.asarray(CIDs)
    predicted_array = np.asarray(predicteds)
    true_array = np.asarray(y_tests)
    run_array = np.asarray(runs)
    print(predicted_array.shape,true_array.shape, run_array.shape, cid_array.shape)

    # One row per test sample: CID | predictions | true values | run number.
    df_predictions = pd.DataFrame(np.concatenate(
        [cid_array.reshape(-1,1),predicted_array,true_array,run_array.reshape(-1,1)],axis=1))
    df_predictions['model'] = model_name

    # Take the number of target columns from the data itself. The original
    # used len(keller_tasks) for every dataset except "sagar", which mislabels
    # (or fails on) any other dataset with a different number of descriptors.
    tasks_length = true_array.shape[1]
    # The "_true" columns keep their original numbering, which continues after
    # the predicted ones; save_predictions renames both groups by position.
    df_predictions.columns = ['CID']+[str(i)+'_predicted' for i in range(tasks_length)]+[str(i)+'_true' for i in range(tasks_length,int(tasks_length*2))]+['run']+['model']

    return df_predictions,df_df_mse, df_df_cor

In [14]:
def compute_correlation(times,n_components,input_file_molformer,input_file_pom,input_file_alva,ds="keller"):
    """Run ``pipeline`` for the POM, Alva and per-layer MolFormer representations.

    Parameters
    ----------
    times, n_components, ds : forwarded unchanged to every ``pipeline`` call.
    input_file_molformer : filename prefix; each layer's file is
        ``prefix + str(layer) + '_Apr17.csv'``.
    input_file_pom : POM embeddings CSV; also used as the ratings source for
        the Alva run.
    input_file_alva : Alva feature file used for the 'alva' run.

    Returns
    -------
    Tuple of nine items: per-layer MolFormer correlation frames, per-layer
    MolFormer MSE frames, POM correlations, POM MSEs, Alva correlations,
    Alva MSEs, per-layer MolFormer prediction frames, POM predictions and
    Alva predictions.
    """
    shared_kwargs = dict(times=times, n_components=n_components, ds=ds)

    df_predictions_pom, mse_pom, cor_pom = pipeline('pom', input_file_pom, **shared_kwargs)
    df_predictions_alva, mse_alva, cor_alva = pipeline('alva', input_file_pom, input_file_alva, **shared_kwargs)

    corrs_molformer, mses_molformer, df_predictions_molformers = [], [], []
    # Layers 0-11 and 13 are evaluated; layer 12 is skipped (no reason is
    # given in the notebook).
    for layer in list(range(12)) + [13]:
        layer_file = input_file_molformer + str(layer) + '_Apr17.csv'
        layer_predictions, layer_mse, layer_cor = pipeline('molformer', layer_file, **shared_kwargs)
        layer_predictions['layer'] = layer

        corrs_molformer.append(layer_cor)
        mses_molformer.append(layer_mse)
        df_predictions_molformers.append(layer_predictions)

    # The two baseline representations are tagged with layer 13.
    df_predictions_pom['layer'] = 13
    df_predictions_alva['layer'] = 13

    return (
        corrs_molformer,
        mses_molformer,
        cor_pom,
        mse_pom,
        cor_alva,
        mse_alva,
        df_predictions_molformers,
        df_predictions_pom,
        df_predictions_alva,
    )

In [15]:
def count_df_x_keller(times ,per_descritor=False,ds="keller"):
    """Return the per-split min/max table for the layer-13 MolFormer embeddings.

    Parameters
    ----------
    times : number of random splits, forwarded to ``pipeline``.
    per_descritor : accepted for backward compatibility; not used.
    ds : "keller" or "sagar".

    Returns
    -------
    Whatever ``pipeline(..., count=True)`` returns for the selected dataset.

    Raises
    ------
    ValueError if ``ds`` is not "keller" or "sagar" (the original fell through
    and failed with an UnboundLocalError at the return statement).

    Notes
    -----
    ``n_components`` and ``base_path`` are read from notebook-level globals.
    """
    if ds not in ("keller", "sagar"):
        raise ValueError("Invalid dataset")

    # NOTE(review): other cells build paths as base_path+'curated_datasets/...',
    # while this prefix adds 'alignment_olfaction_datasets/data/' on top of a
    # base_path that already ends in 'alignment_olfaction_datasets/'. The
    # prefix is kept as written — confirm which layout is correct.
    input_file = base_path+'alignment_olfaction_datasets/data/curated_datasets/embeddings/molformer/'+ds+'_molformer_embeddings_'+str(13)+'_Apr17.csv'
    return pipeline('molformer',input_file,times=times,n_components=n_components,ds=ds,count=True)

In [16]:
def post_process_tocsv(corrs,tasks):
    """Stack the first 13 per-layer result frames into one labelled table.

    Each of ``corrs[0]`` .. ``corrs[12]`` gets a 'layer' column equal to its
    list position (the input frames are modified in place). The frames are
    concatenated, the 'model' column is dropped, the remaining columns are
    renamed to ``tasks + ["layer"]`` and a constant 'model' column with the
    value 'molformer' is appended.

    NOTE(review): the label is the list position 0..12, whereas
    compute_correlation fills the list from layers 0-11 and 13, so the last
    frame is written as layer 12 — confirm this is intended.
    """
    n_tables = 13
    for position in range(n_tables):
        corrs[position]["layer"] = position

    stacked = pd.concat([corrs[position] for position in range(n_tables)])
    del stacked['model']
    stacked.columns = tasks+["layer"]
    stacked['model']='molformer'
    return stacked

In [17]:
def save_data(ds,df_cor_pom,df_cor_alva,df_mse_pom,df_mse_alva,corrs_molfomer,mses_molformer):
    """Label the result tables with task names and write them to CSV files.

    Parameters
    ----------
    ds : "keller" or "sagar"; selects the task names and the file-name infix.
    df_cor_pom, df_mse_pom, df_cor_alva, df_mse_alva : baseline result frames;
        their columns are overwritten in place with ``tasks + ["model"]``.
    corrs_molfomer, mses_molformer : per-layer MolFormer frames, combined with
        ``post_process_tocsv`` before writing.

    Raises
    ------
    ValueError if ``ds`` is not "keller" or "sagar".

    Six files named ``df_<ds>_*.csv`` are written to the working directory.
    """
    if ds == "keller":
        tasks = keller_tasks
    elif ds == "sagar":
        tasks = sagar_tasks
    else:
        raise ValueError("Invalid dataset")

    # Baseline tables, in the order they are written.
    baseline_tables = (
        (df_cor_pom, 'df_'+ds+'_cor_pom.csv'),
        (df_mse_pom, 'df_'+ds+'_mse_pom.csv'),
        (df_cor_alva, 'df_'+ds+'_cor_alva.csv'),
        (df_mse_alva, 'df_'+ds+'_mse_alva.csv'),
    )
    for table, filename in baseline_tables:
        table.columns = tasks+["model"]
        table.to_csv(filename, index=False)

    # Per-layer MolFormer tables are stacked first, then written.
    molformer_tables = (
        (corrs_molfomer, 'df_'+ds+'_corrs_molfomer.csv'),
        (mses_molformer, 'df_'+ds+'_mses_molfomer.csv'),
    )
    for per_layer_frames, filename in molformer_tables:
        post_process_tocsv(per_layer_frames, tasks).to_csv(filename, index=False)

In [18]:
def save_predictions(df_predictions,ds):
    """Rename the prediction columns after the dataset's tasks and write a CSV.

    Columns at positions ``1 .. len(tasks)`` become ``<task>_predicted`` and
    the following ``len(tasks)`` columns become ``<task>_true``. The result is
    written to ``<ds>_predictions.csv`` in the working directory; the frame
    passed in is not modified.

    Raises
    ------
    ValueError if ``ds`` is not "keller" or "sagar".
    """
    if ds == "keller":
        tasks = keller_tasks
    elif ds == "sagar":
        tasks = sagar_tasks
    else:
        raise ValueError("Invalid dataset")

    n_tasks = len(tasks)

    # Predicted-value columns sit directly after the leading CID column.
    predicted_names = dict(zip(df_predictions.columns[1:n_tasks+1], [task+"_predicted" for task in tasks]))
    df_predictions = df_predictions.rename(columns=predicted_names)

    # True-value columns follow the predicted ones.
    true_names = dict(zip(df_predictions.columns[n_tasks+1:n_tasks*2+1], [task+"_true" for task in tasks]))
    df_predictions = df_predictions.rename(columns=true_names)

    df_predictions.to_csv(ds+'_predictions.csv', index=False)

In [19]:
def concat_dfs(df_predictions_molformers,df_predictions_pom,df_predictions_alva):
    """Stack all MolFormer per-layer prediction frames, then the POM and Alva frames.

    Parameters
    ----------
    df_predictions_molformers : list of DataFrames, one per evaluated layer.
    df_predictions_pom, df_predictions_alva : baseline prediction DataFrames.

    Returns
    -------
    A single DataFrame with the rows of every input, in the order given.

    The original indexed entries 0..12 explicitly, which dropped any further
    layers and raised IndexError for shorter lists; every entry is now used.
    """
    return pd.concat([*df_predictions_molformers, df_predictions_pom, df_predictions_alva])

# Extracting Representations

## Keller

### Representations

In [None]:
# Keller inputs: POM embeddings, Alva features, and the filename prefix of the
# per-layer MolFormer embeddings (compute_correlation appends the layer number
# and '_Apr17.csv').
input_file_keller_pom = base_path+'curated_datasets/embeddings/pom/keller_pom_embeddings_Apr17.csv'
input_file_keller_alva = base_path+'curated_datasets/alva/keller_molecules_alva_17Apr.csv'
input_file_keller_molformer = base_path+'curated_datasets/embeddings/molformer/keller_molformer_embeddings_'

(
    corrs_molfomer,
    mses_molformer,
    df_keller_cor_pom,
    df_keller_mse_pom,
    df_keller_cor_alva,
    df_keller_mse_alva,
    df_predictions_molformers,
    df_predictions_pom,
    df_predictions_alva,
) = compute_correlation(
    times,
    n_components,
    input_file_keller_molformer,
    input_file_keller_pom,
    input_file_keller_alva,
    ds="keller",
)

(2880, 22) (2880, 22) (2880,) (2880,)
(2880, 22) (2880, 22) (2880,) (2880,)
(2880, 22) (2880, 22) (2880,) (2880,)
(2880, 22) (2880, 22) (2880,) (2880,)
(2880, 22) (2880, 22) (2880,) (2880,)
(2880, 22) (2880, 22) (2880,) (2880,)
(2880, 22) (2880, 22) (2880,) (2880,)


In [None]:
# Stack the per-layer MolFormer predictions with the POM and Alva predictions,
# then write them to 'keller_predictions.csv' in the working directory.
df_predictions = concat_dfs(df_predictions_molformers,df_predictions_pom,df_predictions_alva)
save_predictions(df_predictions,ds="keller")

In [None]:
# pd.read_csv('kellerpredictions.csv')

In [None]:
# Write the Keller correlation and MSE tables (POM, Alva and per-layer MolFormer)
# to 'df_keller_*.csv' files in the working directory.
save_data("keller",df_keller_cor_pom,df_keller_cor_alva,df_keller_mse_pom,df_keller_mse_alva,corrs_molfomer,mses_molformer)

## Sagar

### Representations


In [None]:
# Sagar inputs: POM embeddings, Alva features, and the filename prefix of the
# per-layer MolFormer embeddings (compute_correlation appends the layer number
# and '_Apr17.csv').
input_file_sagar_pom = base_path+'curated_datasets/embeddings/pom/sagar_pom_embeddings_Apr17.csv'
input_file_sagar_alva = base_path+'curated_datasets/alva/sagar_molecules_alva_17Apr.csv'
input_file_sagar_molformer = base_path+'curated_datasets/embeddings/molformer/sagar_molformer_embeddings_'

(
    corrs_molfomer_sagar,
    mses_molformer_sagar,
    df_sagar_cor_pom,
    df_sagar_mse_pom,
    df_sagar_cor_alva,
    df_sagar_mse_alva,
    df_sagar_predictions_molformers,
    df_sagar_predictions_pom,
    df_sagar_predictions_alva,
) = compute_correlation(
    times,
    n_components,
    input_file_sagar_molformer,
    input_file_sagar_pom,
    input_file_sagar_alva,
    ds="sagar",
)

In [None]:
# Stack the per-layer MolFormer predictions with the POM and Alva predictions,
# then write them to 'sagar_predictions.csv' in the working directory.
df_predictions_sagar = concat_dfs(df_sagar_predictions_molformers,df_sagar_predictions_pom,df_sagar_predictions_alva)
save_predictions(df_predictions_sagar,ds="sagar")

In [None]:
# Write the Sagar correlation and MSE tables (POM, Alva and per-layer MolFormer)
# to 'df_sagar_*.csv' files in the working directory.
save_data("sagar",df_sagar_cor_pom,df_sagar_cor_alva,df_sagar_mse_pom,df_sagar_mse_alva,corrs_molfomer_sagar,mses_molformer_sagar)


In [None]:
# min_max_df =count_df_x_keller(times ,per_descritor=False)
# min_max_df.to_csv('keller_min_max.csv', index=True)   
# 
# min_max_df =count_df_x_keller(times ,per_descritor=False,ds="sagar")
# min_max_df.to_csv('sagar_min_max.csv', index=True)   

## Dravnieks

### Representations

In [None]:
# Dravnieks inputs: POM embeddings, Alva features, and the filename prefix of
# the per-layer MolFormer embeddings (compute_correlation appends the layer
# number and '_Apr17.csv').
input_file_dravinsk_pom = base_path+'curated_datasets/embeddings/pom/dravienks1985App1_pom_embeddings_Apr17.csv'
input_file_dravinsk_alva = base_path+'curated_datasets/alva/dravienks1985App1_molecules_alva_17Apr.csv'
input_file_dravinsk_molformer = base_path+'curated_datasets/embeddings/molformer/dravienks1985App1_molformer_embeddings_'

# compute_correlation returns nine values; the original unpacked only six,
# which fails with "too many values to unpack".
(
    corrs_molfomer_dravinsk,
    mses_molformer_dravinsk,
    df_dravinsk_cor_pom,
    df_dravinsk_mse_pom,
    df_dravinsk_cor_alva,
    df_dravinsk_mse_alva,
    df_dravinsk_predictions_molformers,
    df_dravinsk_predictions_pom,
    df_dravinsk_predictions_alva,
) = compute_correlation(
    times,
    n_components,
    input_file_dravinsk_molformer,
    input_file_dravinsk_pom,
    input_file_dravinsk_alva,
    ds="dravinsk",
)


In [None]:
# NOTE(review): save_data only accepts ds="keller" or ds="sagar"; with
# "dravienks1985App1" it raises ValueError("Invalid dataset") before writing
# anything. save_data needs a Dravnieks task list before this cell can run.
save_data("dravienks1985App1",df_dravinsk_cor_pom,df_dravinsk_cor_alva,df_dravinsk_mse_pom,df_dravinsk_mse_alva,corrs_molfomer_dravinsk,mses_molformer_dravinsk)

In [None]:
# NOTE(review): this rebinds `input_file_dravinsk_molformer` from the filename
# prefix passed to compute_correlation to the full path of the layer-13 CSV.
# Re-running the compute_correlation cell after this one would therefore build
# wrong file names.
input_file_dravinsk_molformer = base_path+'curated_datasets/embeddings/molformer/dravienks1985App1_molformer_embeddings_13_Apr17.csv'
# Read that CSV into `file`.
file = pd.read_csv(input_file_dravinsk_molformer)

In [None]:
# iffd