# In [167]:
import numpy as np
import pandas as pd
import sys, os

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from models.mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from models.mol2vec.helpers import depict_identifier, mol_to_svg
from gensim.models import word2vec

# Path setup: the models/ and data/ directories are expected under the
# current working directory. The trailing slash is kept because later code
# appends file names with plain string concatenation.
path = os.getcwd()
model_path = path + '/models/'
data_path = path + '/data/'

# exist_ok=True creates a directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(model_path, exist_ok=True)
os.makedirs(data_path, exist_ok=True)
    
# Load the dataset.
data = pd.read_csv(data_path + 'dataset.csv')

# Keep only the first two columns (expected to be the ID and SMILES columns).
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of `data` (avoids pandas' SettingWithCopyWarning).
mol2vec_df = data.iloc[:, 0:2].copy()

# Extract the SMILES strings.
smile_list = data['SMILES'].values

# Convert each SMILES string to an RDKit molecule.
mol = [Chem.MolFromSmiles(x) for x in smile_list]

# Chem.MolFromSmiles returns None for a string it cannot parse. Fail here with
# the offending strings instead of letting a None molecule cause an opaque
# error later during sentence generation.
invalid_smiles = [s for s, m in zip(smile_list, mol) if m is None]
if invalid_smiles:
    raise ValueError(
        f"{len(invalid_smiles)} SMILES could not be parsed by RDKit, "
        f"e.g. {invalid_smiles[:5]!r}"
    )

# Visualization.
# NOTE(review): as a bare expression in the middle of a cell, the returned
# image is presumably not displayed - move it to the end of a cell or wrap it
# in display() if it should be shown.
Draw.MolsToGridImage(mol, molsPerRow=5, useSVG=False)
mol2vec_df['ROMol'] = mol

# Generate a sentence for each molecule.
def _row_to_sentence(row):
    """Return a MolSentence built from the molecule in the row's 'ROMol' column."""
    # The second argument (1) is presumably the substructure radius - confirm
    # against mol2alt_sentence's signature.
    return MolSentence(mol2alt_sentence(row['ROMol'], 1))


mol2vec_df['sentence'] = mol2vec_df.apply(_row_to_sentence, axis=1)

# Load the pretrained mol2vec model.
model = word2vec.Word2Vec.load(model_path + 'mol2vec/mol2vec_300dim.pkl')

# Generate the mol2vec embedding vectors: compute one vector per sentence,
# then wrap each one in a DfVec so it can be stored in a DataFrame column.
sentence_vectors = sentences2vec(mol2vec_df['sentence'], model, unseen='UNK')
mol2vec_df['mol2vec'] = [DfVec(vec) for vec in sentence_vectors]

# Save the mol2vec embedding vectors.
# Stack the .vec attribute of every DfVec into a single array
# (one row per molecule).
mol2vec_emb = np.array([x.vec for x in mol2vec_df['mol2vec']])

# Take the column count from the embedding itself rather than hard-coding
# 300, so a model with a different vector size still gets matching names.
col_list = [f'Mol2vec_{i}' for i in range(mol2vec_emb.shape[1])]

# Reuse mol2vec_df's index so the concat below aligns row by row instead of
# relying on both frames happening to share a default RangeIndex.
Mol2vec = pd.DataFrame(mol2vec_emb, columns=col_list, index=mol2vec_df.index)

# Prepend the first two columns of mol2vec_df (expected to be ID and SMILES)
# and write the result next to the input data.
Mol2vec = pd.concat([mol2vec_df.iloc[:, 0:2], Mol2vec], axis=1)
Mol2vec.to_csv(data_path + 'Mol2vec.csv', index=False)

