In [1]:
import os
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sentence_transformers import SentenceTransformer
import torch

class SequenceEncoder:
    """Callable wrapper that embeds texts with a ``SentenceTransformer`` model.

    Args:
        model_name: Model identifier forwarded to ``SentenceTransformer``.
        device: Device forwarded to ``SentenceTransformer`` and to every
            ``encode`` call; ``None`` is passed through unchanged.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.model = SentenceTransformer(model_name, device=device)
        self.device = device

    def __call__(self, texts):
        """Embed ``texts`` and return the embeddings as a tensor moved to the CPU."""
        encode_options = {
            'show_progress_bar': True,
            'convert_to_tensor': True,
            'device': self.device,
        }
        # Inference only: gradients are disabled for the whole encode call.
        with torch.no_grad():
            on_device = self.model.encode(texts, **encode_options)
            return on_device.cpu()

class MolecularFormulaEncoder:
    """Encode a molecular formula as a fixed-length vector of per-element atom counts."""

    # Element symbols in output order. The position of a symbol in this tuple
    # is its index in the returned vector, so the order must not change.
    ELEMENTS = (
        'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na',
        'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Ti', 'V', 'Cr',
        'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
        'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru',
        'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs',
        'Ba', 'La', 'Ce', 'Nd', 'Sm', 'Gd', 'Ho', 'Lu', 'Hf', 'Ta',
        'W', 'Re', 'Os', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ra', 'Ac', 'Bk',
    )
    _ELEMENT_INDEX = {symbol: position for position, symbol in enumerate(ELEMENTS)}

    # One token is a full element symbol (capital letter plus an optional
    # lowercase letter) followed by an optional count. Matching whole symbols
    # keeps 'C' from matching inside 'Cl' and 'N' from matching inside 'Na'.
    _TOKEN = re.compile(r'([A-Z][a-z]?)(\d*)')

    @staticmethod
    def formula_to_vector(formula):
        """Return the atom counts of ``formula`` as a float vector.

        Args:
            formula: Flat molecular formula such as ``'C6H12O6'``. Anything
                that is not a ``str`` (e.g. a NaN from pandas) is treated as
                a missing formula.

        Returns:
            A 1-D ``numpy`` array with one entry per symbol in ``ELEMENTS``.
            A symbol without an explicit count contributes 1, repeated symbols
            are summed, and symbols not listed in ``ELEMENTS`` are ignored.
            Missing or empty formulas give an all-zero vector.

        Note:
            Parenthesised groups are not expanded: a multiplier after ``)``
            is not applied to the atoms inside the group.
        """
        vector = np.zeros(len(MolecularFormulaEncoder.ELEMENTS))
        if not isinstance(formula, str):
            return vector

        for symbol, digits in MolecularFormulaEncoder._TOKEN.findall(formula):
            position = MolecularFormulaEncoder._ELEMENT_INDEX.get(symbol)
            if position is None:
                continue
            vector[position] += int(digits) if digits else 1
        return vector

# Build the encoders; sentence embeddings run on CUDA when torch reports it available.
sequence_encoder = SequenceEncoder(
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
label_encoder = LabelEncoder()

df = pd.read_csv("cleaned_hetero.csv")

# Replace missing values with '' and force `str` in every text column.
for text_column in ('State', 'Groups', 'Categories', 'ATC Codes', 'Targets', 'Interactions'):
    df[text_column] = df[text_column].fillna('').astype(str)

# 'Interactions' holds '; '-separated values: store them as one list per row.
df['Interactions'] = df['Interactions'].str.split('; ')

# Encode columns
# 'DrugBank ID' gets its own LabelEncoder: refitting the shared `label_encoder`
# on 'State' afterwards would otherwise discard the fitted ID mapping, leaving
# no way to decode the integer IDs back to their labels.
drugbank_id_encoder = LabelEncoder()
encoded_drugbank_id = drugbank_id_encoder.fit_transform(df['DrugBank ID'])
# Hand the names over as a plain list of str so the encoder does not depend on
# the DataFrame index; a missing name is embedded as the empty string.
encoded_name = sequence_encoder(df['Name'].fillna('').astype(str).tolist())
encoded_state = label_encoder.fit_transform(df['State'])

# One MultiLabelBinarizer per multi-label column so each keeps its own classes_.
mlb_groups = MultiLabelBinarizer()
mlb_categories = MultiLabelBinarizer()
mlb_atc_codes = MultiLabelBinarizer()
mlb_targets = MultiLabelBinarizer()
mlb_interactions = MultiLabelBinarizer()

# Each cell is a '; '-separated string that is split into its labels here.
encoded_groups = mlb_groups.fit_transform(df['Groups'].str.split('; '))
encoded_categories = mlb_categories.fit_transform(df['Categories'].str.split('; '))
encoded_atc_codes = mlb_atc_codes.fit_transform(df['ATC Codes'].str.split('; '))
encoded_targets = mlb_targets.fit_transform(df['Targets'].str.split('; '))
# 'Interactions' was already split into lists when the data was loaded.
encoded_interactions = mlb_interactions.fit_transform(df['Interactions'])

# Encoding Molecular Formula
# A missing formula is encoded from the empty string (an all-zero vector)
# instead of crashing on a NaN value.
formulas = df['Molecular Formula'].fillna('').astype(str)
encoded_molecular_formula = np.array(
    [MolecularFormulaEncoder.formula_to_vector(formula) for formula in formulas]
)

# Encoding Doping
# Taken over as-is, without any transformation.
encoded_doping = df['Doping'].values

# Convert encoded data to DataFrames
# Multi-label frames use the binarizer's classes_ as column names; the
# embedding and formula frames keep default integer column labels.
encoded_name_df = pd.DataFrame(encoded_name.numpy())
encoded_drugbank_id_df = pd.DataFrame(encoded_drugbank_id, columns=['DrugBank ID'])
encoded_state_df = pd.DataFrame(encoded_state, columns=['State'])
encoded_groups_df = pd.DataFrame(encoded_groups, columns=mlb_groups.classes_)
encoded_categories_df = pd.DataFrame(encoded_categories, columns=mlb_categories.classes_)
encoded_atc_codes_df = pd.DataFrame(encoded_atc_codes, columns=mlb_atc_codes.classes_)
encoded_targets_df = pd.DataFrame(encoded_targets, columns=mlb_targets.classes_)
encoded_interactions_df = pd.DataFrame(encoded_interactions, columns=mlb_interactions.classes_)
encoded_molecular_formula_df = pd.DataFrame(encoded_molecular_formula)
encoded_doping_df = pd.DataFrame(encoded_doping, columns=['Doping'])

# Save DataFrames to CSV
output_dir = 'encoders_small'
# Create the target directory first so the export also works on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

frames_by_file = {
    'encoded_name': encoded_name_df,
    'encoded_drugbank_id': encoded_drugbank_id_df,
    'encoded_state': encoded_state_df,
    'encoded_groups': encoded_groups_df,
    'encoded_categories': encoded_categories_df,
    'encoded_atc_codes': encoded_atc_codes_df,
    'encoded_targets': encoded_targets_df,
    'encoded_interactions': encoded_interactions_df,
    'encoded_molecular_formula': encoded_molecular_formula_df,
    'encoded_doping': encoded_doping_df,
}
for file_stem, frame in frames_by_file.items():
    # index=False: the row index is not written to the file.
    frame.to_csv(f'{output_dir}/{file_stem}.csv', index=False)

print("Data saved to CSV files.")


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Data saved to CSV files.
