In [1]:
#import necessary libraries
import pandas as pd
import os


In [2]:
# Mount Google Drive to access files if necessary
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define the folder name and file name
folder_name = "Ersilia"
file_name = "PubChem Data.csv"

# Define the full path to the file
file_path = os.path.join("/content/drive/MyDrive", folder_name, file_name)

In [4]:
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first 5 rows in DataFrame
print(df.head())

    cid         cmpdname                                        cmpdsynonym  \
0   206           Hexose  Hexopyranose|hexopyranoside|Hexose|6-(hydroxym...   
1   312     Chloride Ion  16887-00-6|chloride|chloride ion|Chloride anio...   
2   588       creatinine  60-27-5|AYI8EX34EU|creatinine|2-Imino-1-methyl...   
3   767    Carbonic Acid  463-79-6|carbonic acid|H2CO3|Koehlensaeure|hyd...   
4   769  Bicarbonate Ion  71-52-3|Bicarbonate (HCO3-)|Bicarbonate ion|Bi...   

        mw       mf  polararea  complexity  xlogp  heavycnt  hbonddonor  ...  \
0  180.160  C6H12O6      110.0       151.0   -2.6        12           5  ...   
1   35.450      Cl-        0.0         0.0    0.8         1           0  ...   
2  113.120  C4H7N3O       58.7       151.0   -1.8         8           1  ...   
3   62.025    CH2O3       57.5        26.3   -0.1         4           2  ...   
4   61.017    CHO3-       60.4        24.8    0.5         4           1  ...   

   gpfamilycnt  neighbortype   meshheadings 

Converting the canonicalsmile columns to standardize SMILES


In [5]:
!pip install standardiser
!pip install rdkit




In [6]:
from standardiser import standardise
from rdkit import Chem

def convert_to_standard_smiles(canonical_smiles):
    mol = Chem.MolFromSmiles(canonical_smiles)
    if mol is not None:
        try:
            mol = standardise.run(mol)
        except:
            mol = None

    if mol is not None:
        return Chem.MolToSmiles(mol)
    else:
        return None

# Apply the function to the 'canonicalsmiles' column of the DataFrame
df['standardized_smiles'] = df['canonicalsmiles'].apply(convert_to_standard_smiles)

# Print the DataFrame
print(df)



[05:06:13] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 6 9
[05:06:13] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 7 10
[05:06:14] Can't kekulize mol.  Unkekulized atoms: 3 6
[05:06:14] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 6 9
[05:06:14] Can't kekulize mol.  Unkekulized atoms: 0 1 4 6 7 9
[05:06:14] Explicit valence for atom # 3 Si, 8, is greater than permitted
[05:06:14] Explicit valence for atom # 1 Si, 8, is greater than permitted
[05:06:14] Explicit valence for atom # 1 Si, 8, is greater than permitted


            cid                                           cmpdname  \
0           206                                             Hexose   
1           312                                       Chloride Ion   
2           588                                         creatinine   
3           767                                      Carbonic Acid   
4           769                                    Bicarbonate Ion   
...         ...                                                ...   
2259   21924740                         Potassium;hydron;phosphate   
2260   21924748                          Disodium;hydron;phosphate   
2261   58592228                                       CID 58592228   
2262   90657278  (2S)-2-azaniumyl-3-[hydroxy(nitroso)amino]prop...   
2263  129632725                                      CID 129632725   

                                            cmpdsynonym       mw        mf  \
0     Hexopyranose|hexopyranoside|Hexose|6-(hydroxym...  180.160   C6H12O6   
1  



In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0    cid                    2264 non-null   int64  
 1   cmpdname                2264 non-null   object 
 2   cmpdsynonym             2253 non-null   object 
 3   mw                      2264 non-null   float64
 4   mf                      2264 non-null   object 
 5   polararea               2264 non-null   float64
 6   complexity              2264 non-null   float64
 7   xlogp                   1547 non-null   float64
 8   heavycnt                2264 non-null   int64  
 9   hbonddonor              2264 non-null   int64  
 10  hbondacc                2264 non-null   int64  
 11  rotbonds                2264 non-null   int64  
 12  inchi                   2264 non-null   object 
 13  isosmiles               2264 non-null   object 
 14  canonicalsmiles         2264 non-null   

In [8]:
df = df.rename(columns={'standardized_smiles': 'smiles'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0    cid                    2264 non-null   int64  
 1   cmpdname                2264 non-null   object 
 2   cmpdsynonym             2253 non-null   object 
 3   mw                      2264 non-null   float64
 4   mf                      2264 non-null   object 
 5   polararea               2264 non-null   float64
 6   complexity              2264 non-null   float64
 7   xlogp                   1547 non-null   float64
 8   heavycnt                2264 non-null   int64  
 9   hbonddonor              2264 non-null   int64  
 10  hbondacc                2264 non-null   int64  
 11  rotbonds                2264 non-null   int64  
 12  inchi                   2264 non-null   object 
 13  isosmiles               2264 non-null   object 
 14  canonicalsmiles         2264 non-null   

In [9]:
df.dropna(subset=['smiles'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1623 entries, 0 to 2263
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0    cid                    1623 non-null   int64  
 1   cmpdname                1623 non-null   object 
 2   cmpdsynonym             1619 non-null   object 
 3   mw                      1623 non-null   float64
 4   mf                      1623 non-null   object 
 5   polararea               1623 non-null   float64
 6   complexity              1623 non-null   float64
 7   xlogp                   1431 non-null   float64
 8   heavycnt                1623 non-null   int64  
 9   hbonddonor              1623 non-null   int64  
 10  hbondacc                1623 non-null   int64  
 11  rotbonds                1623 non-null   int64  
 12  inchi                   1623 non-null   object 
 13  isosmiles               1623 non-null   object 
 14  canonicalsmiles         1623 non-null   

In [11]:
# Select random 1000 rows
sample_data = df.sample(n=1000)

In [12]:
# Filter the DataFrame to include only three desired columns
filter_df = sample_data[['canonicalsmiles', 'smiles','inchikey', 'mw']]
filter_df

Unnamed: 0,canonicalsmiles,smiles,inchikey,mw
1342,CCC(C)SSC1=NC=CN1,CCC(C)SSc1ncc[nH]1,BPBPYQWMFCTCNG-UHFFFAOYSA-N,188.30
320,C1C=C2C(=CC(=O)O2)C(O1)O,O=C1C=C2C(=CCOC2O)O1,ZRWPUFFVAOMMNM-UHFFFAOYSA-N,154.12
679,CN(C)C(=N)N=C(N)N.Cl,CN(C)C(=N)N=C(N)N,OETHQSJEHLVLGH-UHFFFAOYSA-N,165.62
1614,CCCCOCC(C)OCC(C)O,CCCCOCC(C)OCC(C)O,CUVLMZNMSPJDON-UHFFFAOYSA-N,190.28
538,CCC(C)(C)C1=CC=C(C=C1)O,CCC(C)(C)c1ccc(O)cc1,NRZWYNLTFLDQQX-UHFFFAOYSA-N,164.24
...,...,...,...,...
1007,CC(C(C1=CC(=CC=C1)O)O)N,CC(N)C(O)c1cccc(O)c1,WXFIGDLSSYIKKV-RCOVLWMOSA-N,167.20
533,CN(C)C,CN(C)C,GETQZCLCWQTVFV-UHFFFAOYSA-N,59.11
502,C(C(=O)O)NCC(=O)O,O=C(O)CNCC(=O)O,NBZBKCUXIYYUSX-UHFFFAOYSA-N,133.10
1865,CC(=O)OC(C(=C)Cl)OC(=O)C,C=C(Cl)C(OC(C)=O)OC(C)=O,PPHOKWJFQIHKNR-UHFFFAOYSA-N,192.60


In [13]:
# Rename the 'mw' column to 'molecularweight'
filter_df.rename(columns={'mw': 'molecularweight'}, inplace=True)
filter_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df.rename(columns={'mw': 'molecularweight'}, inplace=True)


Unnamed: 0,canonicalsmiles,smiles,inchikey,molecularweight
1342,CCC(C)SSC1=NC=CN1,CCC(C)SSc1ncc[nH]1,BPBPYQWMFCTCNG-UHFFFAOYSA-N,188.30
320,C1C=C2C(=CC(=O)O2)C(O1)O,O=C1C=C2C(=CCOC2O)O1,ZRWPUFFVAOMMNM-UHFFFAOYSA-N,154.12
679,CN(C)C(=N)N=C(N)N.Cl,CN(C)C(=N)N=C(N)N,OETHQSJEHLVLGH-UHFFFAOYSA-N,165.62
1614,CCCCOCC(C)OCC(C)O,CCCCOCC(C)OCC(C)O,CUVLMZNMSPJDON-UHFFFAOYSA-N,190.28
538,CCC(C)(C)C1=CC=C(C=C1)O,CCC(C)(C)c1ccc(O)cc1,NRZWYNLTFLDQQX-UHFFFAOYSA-N,164.24
...,...,...,...,...
1007,CC(C(C1=CC(=CC=C1)O)O)N,CC(N)C(O)c1cccc(O)c1,WXFIGDLSSYIKKV-RCOVLWMOSA-N,167.20
533,CN(C)C,CN(C)C,GETQZCLCWQTVFV-UHFFFAOYSA-N,59.11
502,C(C(=O)O)NCC(=O)O,O=C(O)CNCC(=O)O,NBZBKCUXIYYUSX-UHFFFAOYSA-N,133.10
1865,CC(=O)OC(C(=C)Cl)OC(=O)C,C=C(Cl)C(OC(C)=O)OC(C)=O,PPHOKWJFQIHKNR-UHFFFAOYSA-N,192.60


In [14]:

filter_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1342 to 1202
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   canonicalsmiles  1000 non-null   object 
 1   smiles           1000 non-null   object 
 2   inchikey         1000 non-null   object 
 3   molecularweight  1000 non-null   float64
dtypes: float64(1), object(3)
memory usage: 39.1+ KB


In [15]:
# Defining the folder name
folder_name = "Ersilia"

# Defining the file name
file_name = "clean_data.csv"

# Defining the full path to the file
output_file_path = f"/content/drive/MyDrive/Ersilia/clean_data.csv"

# Save the cleaned DataFrame to a CSV file
filter_df.to_csv(output_file_path, index=False)

In [16]:
# To confirm that the file has been saved
print("Cleaned dataset saved to:", output_file_path)

Cleaned dataset saved to: /content/drive/MyDrive/Ersilia/clean_data.csv
