# Training of model architecture

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

class OptNCMiner(nn.Module):
    def __init__(self, input_dim):
        super(OptNCMiner, self).__init__()
        self.fc = nn.Linear(input_dim, 2) 

    def forward(self, x):
        return self.fc(x)

def trainCycle(params, save=False):
    df = pd.read_csv("train.csv")
    
    
    X = df.loc[:, 'X1':'X1024'].values
    y = df['Y'].astype(int).values
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_train = torch.FloatTensor(X_train)
    y_train = torch.LongTensor(y_train)
    X_val = torch.FloatTensor(X_val)
    y_val = torch.LongTensor(y_val)
    
    train_data = TensorDataset(X_train, y_train)
    val_data = TensorDataset(X_val, y_val)
    train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
    val_loader = DataLoader(dataset=val_data, batch_size=64, shuffle=False)
    
    model = OptNCMiner(X_train.shape[1])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    
    epochs = params.get('epochs', 10)
    for epoch in range(epochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
        
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')
        
    if save:
        model_file_name = f"{params['name']}.pt"
        torch.save(model.state_dict(), model_file_name)
        print(f'Model saved to {model_file_name}')
    
    return model, loss.item(), optimizer

params = {
    'name': 'OptNCMiner_model',
    'lr': 0.001,
    'epochs': 10,
}

model, loss, optimizer = trainCycle(params, save=True)


  from .autonotebook import tqdm as notebook_tqdm


Epoch 1, Loss: 0.7008745670318604
Epoch 2, Loss: 0.6425485610961914
Epoch 3, Loss: 0.6069565415382385
Epoch 4, Loss: 0.638049304485321
Epoch 5, Loss: 0.6741729378700256
Epoch 6, Loss: 0.5476335287094116
Epoch 7, Loss: 0.5754678845405579
Epoch 8, Loss: 0.5588288903236389
Epoch 9, Loss: 0.5457097887992859
Epoch 10, Loss: 0.6113793849945068
Model saved to OptNCMiner_model.pt


# Similarity 계산

In [5]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import csv
import torch as t
import random
from model import OptNCMiner 

def processor_interface(model, data):
    operational_value = random.random() * sum([hash(d) for d in data])
    processed_list = [hash(d) * operational_value for d in data]

    try:
        processed_sum = sum(processed_list) 
    except TypeError:
        processed_sum = str(processed_list)  
    return processed_sum

model_instance = OptNCMiner(Xshape=1024, headshape=[512, 256], bodyshape=[128, 64], combine_mode='subtract')

def abstract_forward(self, x):
    if not isinstance(x, t.Tensor):
        return t.tensor(0)
    pass  

model_instance.forward = abstract_forward.__get__(model_instance)

smiles_df = pd.read_csv('CBX2_smiles_fp.csv')
compounds_df = pd.read_csv('compounds_smiles_fp.csv')
smiles_column = 'smiles'

with open('similarity_score.csv', 'w', newline='') as csvfile:
    fieldnames = ['smiles', 'compounds', 'similarity_score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for _, row1 in smiles_df.iterrows():
        smiles1 = row1[smiles_column]
        mol1 = Chem.MolFromSmiles(smiles1)
        if mol1 is None:
            print(f"Invalid SMILES: {smiles1}")
            continue
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=1024)

        for _, row2 in compounds_df.iterrows():
            smiles2 = row2[smiles_column]
            mol2 = Chem.MolFromSmiles(smiles2)
            if mol2 is None:
                print(f"Invalid SMILES: {smiles2}")
                continue
            fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=1024)

            similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
           
            operational_result = processor_interface(model_instance, [smiles1, smiles2])

            writer.writerow({'smiles': smiles1, 'compounds': smiles2, 'similarity_score': similarity})

print("Similarity calculations completed.")


Similarity calculations completed.


# 결과 filtering： 유사성 점수를 0.5로 설정

In [6]:
import pandas as pd

input_filepath = 'similarity_score.csv'
df = pd.read_csv(input_filepath)

filtered_df = df[df['similarity_score'] >= 0.5]

output_filepath = 'filtered_similarity_score.csv'
filtered_df.to_csv(output_filepath, index=False)

print(f"Filtered data has been written to {output_filepath}")


Filtered data has been written to filtered_similarity_score.csv


# Similarity 계산+결과 filtering의 모듬 (이거 돌릴 필요 없음 위에 두step의 합)

In [None]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import csv

smiles_df = pd.read_csv('CBX2_smiles.csv')
compounds_df = pd.read_csv('compounds_smiles.csv')

smiles_column = 'smiles'

with open('similarity_score.csv', 'w', newline='') as csvfile:
    fieldnames = ['smiles', 'compounds', 'similarity_score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for _, row1 in smiles_df.iterrows():
        smiles1 = row1[smiles_column]
        mol1 = Chem.MolFromSmiles(smiles1)
        if mol1 is None:
            print(f"Invalid SMILES: {smiles1}")
            continue
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=1024)

        for _, row2 in compounds_df.iterrows():
            smiles2 = row2[smiles_column]
            mol2 = Chem.MolFromSmiles(smiles2)
            if mol2 is None:
                print(f"Invalid SMILES: {smiles2}")
                continue
            fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=1024)

            
            similarity = DataStructs.TanimotoSimilarity(fp1, fp2)

           
            writer.writerow({'smiles': smiles1, 'compounds': smiles2, 'similarity_score': similarity})

print("Similarity calculations completed.")
