In [19]:
import os

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

RDKit可以把分子对象保存成Smiles, molBlock, mol, inchi, inchikey文件

# 1. 输出SMILES/SMARTS
**默认输出形式**
- 输出SMILES: MolToSmiles(mol, isomericSmiles, kekuleSmiles, canonical, ...)
- isomericSmiles: 默认True，区分同分异构体(`@`表示手性，`\`和`/`表示顺反异构)
- kekuleSmiles: 默认False，不使用kekule时，脂肪族碳用"C"表示(大写)，芳香族用"c"表示(小写)
- canonical: 默认True，输出标准SMILES

In [20]:
# Three ring systems: cyclooctatetraene (written back in Kekule form), benzene and furan.
ring_smiles = ("C1=CC=CC=CC=C1", "C1=CC=CC=C1", "C1=COC=C1")
mols = [Chem.MolFromSmiles(smi) for smi in ring_smiles]
m1, m2, m3 = mols

# MolToSmiles with default arguments returns the canonical SMILES of each molecule.
smiles_list = list(map(Chem.MolToSmiles, mols))
print(smiles_list)

['C1=CC=CC=CC=C1', 'c1ccccc1', 'c1ccoc1']


**设置立体参数**
- 不区分同分异构体，通过isomericSmiles控制

In [21]:
# A molecule with one defined stereocentre, used to compare the two output modes.
m4 = Chem.MolFromSmiles('C[C@H](O)c1ccccc1')

with_stereo = Chem.MolToSmiles(m4)  # default output keeps the '@' chirality mark
without_stereo = Chem.MolToSmiles(m4, isomericSmiles=False)  # stereo information dropped
print(with_stereo, without_stereo, sep='\n')

C[C@H](O)c1ccccc1
CC(O)c1ccccc1


# 2. 批量输出SMILES
- 批量输出SMILES: SmilesWriter(fileName, delimiter, nameHeader, includeHeader, isomericSmiles, kekuleSmiles)
  - fileName: 输出文件名
  - delimiter: 分隔符，默认为空格' '
  - includeHeader: 是否写入表头，默认为True
  - nameHeader: 分子名一列的列名，默认'Name'
  - isomericSmiles: 立体信息，默认True
  - kekuleSmiles: kekule形式，默认False
  - 返回一个writer对象

- 写入SMILES: SmilesWriter.write(mol, confId)
  - mol: mol对象
  - confId: 写入的第几个构象

In [22]:
# 创建SmilesWriter对象，将SMILES写入文件'data/batch.smi'，以制表符为分隔符
# Make sure the output directory exists so this cell also runs on a fresh checkout.
os.makedirs('data', exist_ok=True)

# Open a SMILES writer on 'data/batch.smi' with tab-separated columns.
writer = Chem.SmilesWriter('data/batch.smi', delimiter='\t')
try:
    # One row per molecule; no loop index is needed for a plain write.
    for mol in mols:
        writer.write(mol)
finally:
    # Close the writer even if a write fails so the file handle is released.
    writer.close()

In [23]:
# Page through the file written by the SmilesWriter cell above to inspect its rows.
%more data/batch.smi

SMILES	Name	
C1=CC=CC=CC=C1	0
c1ccccc1	1
c1ccoc1	2


# 3. 批量输出SMILES和属性
- 批量输出SMILES及属性，通过以下函数进行操作:
    - mol.GetPropNames()，查看分子属性列表
    - mol.GetProp()，获取相应属性
    - mol.SetProp(key, val)，新增属性名key、对应属性值val
    - writer.SetProps()，设置哪些属性要输出
- 以输出分子量和LogP为例
    - 使用Descriptors计算属性并添加

In [24]:
from rdkit.Chem import Descriptors

writer = Chem.SmilesWriter('data/batch_smiles2.smi')
# Only the properties named here are written as extra columns after SMILES and Name.
writer.SetProps(['LOGP', 'MW'])

try:
    for i, mol in enumerate(mols):
        # NOTE(review): ExactMolWt is stored under the label 'MW' here, while the
        # SDF cell later in this notebook uses Descriptors.MolWt - confirm which is intended.
        mw = Descriptors.ExactMolWt(mol)
        logp = Descriptors.MolLogP(mol)

        # Properties are stored as strings on the molecule itself.
        mol.SetProp('MW', f"{mw}")
        mol.SetProp('LOGP', f"{logp}")
        # '_Name' fills the Name column of the output file.
        mol.SetProp('_Name', f"No_{i}")
        writer.write(mol)
finally:
    # Close the writer even if a descriptor calculation or write fails.
    writer.close()

print('number of mols:', writer.NumMols())
# Inspect the last molecule explicitly rather than relying on the leaked loop variable.
print('mol properties:', list(mols[-1].GetPropNames()))

number of mols: 3
mol properties: ['MW', 'LOGP']


In [25]:
# Page through the SMILES file that also carries the LOGP and MW columns.
%more data/batch_smiles2.smi

SMILES Name LOGP MW
C1=CC=CC=CC=C1 No_0 2.2248 104.062600256
c1ccccc1 No_1 1.6866 78.046950192
c1ccoc1 No_2 1.2795999999999998 68.026214748


# 4. 输出SMARTS
- 输出SMARTS: MolToSmarts()

In [26]:
# SMARTS for the furan molecule m3; the last expression is displayed as the cell output.
furan_smarts = Chem.MolToSmarts(m3, isomericSmiles=True)
furan_smarts

'[#6]1:[#6]:[#8]:[#6]:[#6]:1'

# 5. 批量输出到sdf文件
- 批量输出到文件: SDWriter()
  - 使用方法类似于SMILES的批量输出，可以自定义属性信息并记录在.sdf文件中，返回writer对象
- 写入sdf: writer.write(mol, confId)
  - mol: mol对象
  - confId: 写入的第几个构象(不同构象坐标不一样)

In [28]:
writer = Chem.SDWriter('data/batch2.sdf')
# Only the properties named here are written as data fields of each SDF record.
writer.SetProps(['LOGP', 'MW'])

try:
    for i, mol in enumerate(mols):
        mw = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)

        # Properties are stored as strings on the molecule itself.
        mol.SetProp('MW', str(mw))
        mol.SetProp('LOGP', str(logp))
        # '_Name' becomes the title line of the record.
        mol.SetProp('_Name', f"No_{i}")
        writer.write(mol)
finally:
    # Close the writer even if a descriptor calculation or write fails.
    writer.close()

In [29]:
# Page through the SDF file written by the SDWriter cell above.
%more data/batch2.sdf

No_0
     RDKit          2D

  8  8  0  0  0  0  0  0  0  0999 V2000
    1.9598    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.3858   -1.3858    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000   -1.9598    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.3858   -1.3858    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.9598    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.3858    1.3858    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    1.9598    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.3858    1.3858    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  2  0
  4  5  1  0
  5  6  2  0
  6  7  1  0
  7  8  2  0
  8  1  1  0
M  END
>  <LOGP>  (1) 
2.2248

>  <MW>  (1) 
104.15199999999997

$$$$
No_1
     RDKit          2D

  6  6  0  0  0  0  0  0  0  0999 V2000
    1.5000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.7500   -1.2990    0.0000 C   0  0  0  0  0  0  0  0 

# 6. 输出mol
(1) 输出为连接表
- 直接输出: MolToMolBlock()

In [30]:
# Note: m1 is rebound to cyclobutane here; the earlier `mols` list still holds the old m1.
m1 = Chem.MolFromSmiles('C1CCC1')

# Render the molecule as a mol block (connection table) and show it.
mol_block = Chem.MolToMolBlock(m1)
print(mol_block)


     RDKit          2D

  4  4  0  0  0  0  0  0  0  0999 V2000
    1.0607    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000   -1.0607    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.0607    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    1.0607    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0
  2  3  1  0
  3  4  1  0
  4  1  1  0
M  END



(2) 输出到.mol
- 输出到文件: MolToMolFile(mol, filename, includeStereo, ...)
- mol: mol对象
- filename: 文件名
- includeStereo: 立体信息，默认True

In [31]:
# '_Name' becomes the title line of the written .mol file.
mol_path = 'data/output2.mol'
m1.SetProp('_Name', 'cyclobutane')
Chem.MolToMolFile(m1, mol_path)

In [32]:
# Page through the .mol file written by the MolToMolFile cell above.
%more data/output2.mol

cyclobutane
     RDKit          2D

  4  4  0  0  0  0  0  0  0  0999 V2000
    1.0607    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.0000   -1.0607    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.0607    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    1.0607    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0
  2  3  1  0
  3  4  1  0
  4  1  1  0
M  END


# 7. 输出其他格式: pdb, fasta, xyz ...

In [None]:
# .pdb
Chem.MolToPDBBlock()
Chem.MolToPDBFile()
Chem.PDBWriter()

# .fasta
Chem.MolToFASTA()

# .xyz
Chem.MolToXYZBlock()
Chem.MolToXYZFile()