In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, MolSentence, sentences2vec
from rdkit import Chem
from rdkit.Chem import AllChem, Crippen
from rdkit.Chem import Descriptors, Lipinski, QED


# Default Morgan fingerprint settings; they are used as the default ``radius``
# and ``num_bits`` arguments of ``gen_morgan_fp`` below.
morgan_fp_radius = 2
morgan_fp_bits = 2048


def gen_morgan_fp(mol, radius=morgan_fp_radius, num_bits=morgan_fp_bits):
    """Return the Morgan fingerprint of ``mol`` as a 1-D integer array.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Radius passed to the Morgan fingerprint routine; defaults to
            ``morgan_fp_radius``.
        num_bits: Length of the bit vector (RDKit's ``nBits``); defaults to
            ``morgan_fp_bits``.

    Returns:
        ``numpy.ndarray`` with dtype ``int`` containing one entry per
        fingerprint bit, in bit order.
    """
    # Use the explicitly imported ``AllChem`` module instead of the
    # ``Chem.AllChem`` attribute: that attribute only exists when some other
    # import has already loaded ``rdkit.Chem.AllChem`` as a side effect.
    # NOTE(review): recent RDKit releases appear to deprecate this call in
    # favour of ``rdFingerprintGenerator`` - confirm before upgrading RDKit.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    # Expand the bit vector into a plain Python list first so NumPy receives
    # one integer per bit.
    return np.array(list(fp), dtype=int)

# ---------------------------------------------------------------------------
# 1. Load the raw dataset and parse every SMILES string into an RDKit molecule.
# ---------------------------------------------------------------------------
df = pd.read_parquet(
    "./data/raw/deepchem_lipophilicity_bench.parquet"
)
df["mols"] = df["smiles"].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles signals a parse failure by returning None rather than
# raising. Stop here with a readable message instead of letting a None reach
# the fingerprint / descriptor calls below, where the failure is opaque.
unparsed = df["mols"].isna()
if unparsed.any():
    bad_smiles = df.loc[unparsed, "smiles"].tolist()
    raise ValueError(
        f"RDKit could not parse {len(bad_smiles)} SMILES string(s); "
        f"first few: {bad_smiles[:5]}"
    )

# The parquet writers do not create missing parent directories, so make sure
# the output folder exists before the first write.
processed_dir = Path("./data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# 2. Morgan fingerprints, stored next to the "exp" column.
# ---------------------------------------------------------------------------
morgan_fps = df["mols"].apply(gen_morgan_fp)
fp_df = pd.DataFrame({
    "morgan_fp": morgan_fps.tolist(),
    "exp": df["exp"]
})
fp_df.to_parquet("./data/processed/deepchem_morgan_fp.parquet")

# ---------------------------------------------------------------------------
# 3. mol2vec embeddings from the saved Word2Vec model.
# ---------------------------------------------------------------------------
# NOTE(review): gensim's ``load`` unpickles the file - only point this at a
# model file you trust.
model = word2vec.Word2Vec.load("./model_300dim.pkl")
# Each molecule becomes a "sentence" of substructure identifiers (second
# argument 1 is the radius handed to mol2vec).
mol_sentences = [MolSentence(mol2alt_sentence(mol, 1)) for mol in df["mols"]]
# unseen="UNK" is the keyword mol2vec uses for identifiers missing from the
# model's vocabulary.
mol2vec_vectors = sentences2vec(mol_sentences, model, unseen="UNK")
df["mol2vec"] = mol2vec_vectors.tolist()

# ---------------------------------------------------------------------------
# 4. Additional per-molecule descriptors on a copy of the frame.
# ---------------------------------------------------------------------------
print("\nGenerating additional indicators")

extended_df = df.copy()

# Output column name -> RDKit function that takes a molecule. The dict order
# is the column order of the written file. Modules are referenced through
# their explicit imports (``Crippen``, ``Lipinski``, ``Descriptors``) rather
# than as attributes of ``Chem``, which only resolve if something else has
# already imported the submodule.
descriptor_funcs = {
    "molwt": Descriptors.MolWt,
    "clogp": Crippen.MolLogP,
    "hba": Lipinski.NumHAcceptors,
    "hbd": Lipinski.NumHDonors,
    "tpsa": Descriptors.TPSA,
    "num_rotatable_bonds": Descriptors.NumRotatableBonds,
    "num_rings": Descriptors.RingCount,
    "num_aromatic_rings": Descriptors.NumAromaticRings,
    "fraction_csp3": Lipinski.FractionCSP3,
    "num_heavy_atoms": Descriptors.HeavyAtomCount,
    "num_valence_electrons": Descriptors.NumValenceElectrons,
}
for column, func in descriptor_funcs.items():
    extended_df[column] = df["mols"].apply(func)

# ---------------------------------------------------------------------------
# 5. Persist both frames. The "mols" column holds RDKit objects and is dropped
#    before each frame is written.
# ---------------------------------------------------------------------------
extended_df = extended_df.drop("mols", axis=1)
extended_df.to_parquet("./data/processed/deepchem_extended_mol2vec_300.parquet")

df = df.drop("mols", axis=1)
df.to_parquet("./data/processed/deepchem_mol2vec_300.parquet")