In [1]:
#import necessary libraries
import pandas as pd
import os


In [2]:
# Mount Google Drive to access files if necessary
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Define the folder name and file name
folder_name = "Ersilia"
file_name = "PubChem Data.csv"

# Define the full path to the file
file_path = os.path.join("/content/drive/MyDrive", folder_name, file_name)

In [4]:
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first 5 rows in DataFrame
print(df.head())

    cid         cmpdname                                        cmpdsynonym  \
0   206           Hexose  Hexopyranose|hexopyranoside|Hexose|6-(hydroxym...   
1   312     Chloride Ion  16887-00-6|chloride|chloride ion|Chloride anio...   
2   588       creatinine  60-27-5|AYI8EX34EU|creatinine|2-Imino-1-methyl...   
3   767    Carbonic Acid  463-79-6|carbonic acid|H2CO3|Koehlensaeure|hyd...   
4   769  Bicarbonate Ion  71-52-3|Bicarbonate (HCO3-)|Bicarbonate ion|Bi...   

        mw       mf  polararea  complexity  xlogp  heavycnt  hbonddonor  ...  \
0  180.160  C6H12O6      110.0       151.0   -2.6        12           5  ...   
1   35.450      Cl-        0.0         0.0    0.8         1           0  ...   
2  113.120  C4H7N3O       58.7       151.0   -1.8         8           1  ...   
3   62.025    CH2O3       57.5        26.3   -0.1         4           2  ...   
4   61.017    CHO3-       60.4        24.8    0.5         4           1  ...   

   gpfamilycnt  neighbortype   meshheadings 

Converting the canonicalsmile columns to standardize SMILES


In [5]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [6]:
from rdkit import Chem

# function to convert SMILES to standardized SMILES
def convert_to_standardized_smiles(smiles):
    """
    Convert a SMILES string to standardized SMILES notation.

    Args:
        smiles (str): The SMILES string to convert.

    Returns:
        str: The standardized SMILES notation.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        standardized_smiles = Chem.MolToSmiles(mol)
        return standardized_smiles
    except Exception as e:
        print(f"Error converting SMILES '{smiles}' to standardized SMILES: {e}")
        return None

# Apply the function to the 'canonicalsmiles' column of the DataFrame
df['standardized_smiles'] = df['canonicalsmiles'].apply(convert_to_standardized_smiles)

# Print the DataFrame to verify the results
print(df)


[06:43:13] Explicit valence for atom # 3 Si, 8, is greater than permitted
[06:43:13] Explicit valence for atom # 1 Si, 8, is greater than permitted
[06:43:13] Explicit valence for atom # 1 Si, 8, is greater than permitted


Error converting SMILES '[NH4+].[NH4+].F[Si-2](F)(F)(F)(F)F' to standardized SMILES: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)
Error converting SMILES 'F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]' to standardized SMILES: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)
Error converting SMILES 'F[Si-2](F)(F)(F)(F)F.[Ca+2]' to standardized SMILES: Python arg



In [7]:
# Rename the 'standardized_smiles' column to 'smiles'
df = df.rename(columns={'standardized_smiles': 'smiles'})

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0    cid                    2264 non-null   int64  
 1   cmpdname                2264 non-null   object 
 2   cmpdsynonym             2253 non-null   object 
 3   mw                      2264 non-null   float64
 4   mf                      2264 non-null   object 
 5   polararea               2264 non-null   float64
 6   complexity              2264 non-null   float64
 7   xlogp                   1547 non-null   float64
 8   heavycnt                2264 non-null   int64  
 9   hbonddonor              2264 non-null   int64  
 10  hbondacc                2264 non-null   int64  
 11  rotbonds                2264 non-null   int64  
 12  inchi                   2264 non-null   object 
 13  isosmiles               2264 non-null   object 
 14  canonicalsmiles         2264 non-null   

In [9]:
# Select random 1000 rows
sample_data = df.sample(n=1000)

In [10]:
# Filter the DataFrame to include only three desired columns
filter_df = sample_data[['canonicalsmiles', 'smiles','inchikey', 'mw']]
filter_df

Unnamed: 0,canonicalsmiles,smiles,inchikey,mw
533,CN(C)C,CN(C)C,GETQZCLCWQTVFV-UHFFFAOYSA-N,59.110
521,C1CCC(CC1)NS(=O)(=O)O,O=S(=O)(O)NC1CCCCC1,HCAJEUSONLESMK-UHFFFAOYSA-N,179.240
1206,CC(C1=CC(=CC=C1)O)N(C)C,CC(c1cccc(O)c1)N(C)C,GQZXRLWUYONVCP-QMMMGPOBSA-N,165.230
1729,C(C(CS(=O)(=O)O)S)S,O=S(=O)(O)CC(S)CS,JLVSRWOIZZXQAD-UHFFFAOYSA-N,188.300
1249,CC(C)CC(C(=O)O)O,CC(C)CC(O)C(=O)O,LVRFTAZAXQPQHI-YFKPBYRVSA-N,132.160
...,...,...,...,...
374,C1=CC=C(C(=C1)O)O,Oc1ccccc1O,YCIMNLLNPGFGHC-UHFFFAOYSA-N,110.110
1762,C[N+](C)(C)CC(=O)O,C[N+](C)(C)CC(=O)O,KWIUHFFTVRNATP-UHFFFAOYSA-O,118.150
1799,C[Se]CC(C(=O)O)N,C[Se]CC(N)C(=O)O,XDSSPSLGNGIIHP-UHFFFAOYSA-N,182.090
1779,CC(=O)O.CC(=O)O.[Zn],CC(=O)O.CC(=O)O.[Zn],ZOIORXHNWRGPMV-UHFFFAOYSA-N,185.500


In [11]:
# Rename the 'mw' column to 'molecularweight'
filter_df.rename(columns={'mw': 'molecularweight'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df.rename(columns={'mw': 'molecularweight'}, inplace=True)


In [12]:

filter_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 533 to 1334
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   canonicalsmiles  1000 non-null   object 
 1   smiles           999 non-null    object 
 2   inchikey         1000 non-null   object 
 3   molecularweight  1000 non-null   float64
dtypes: float64(1), object(3)
memory usage: 39.1+ KB


In [13]:
# Defining the folder name
folder_name = "Ersilia"

# Defining the file name
file_name = "clean_data.csv"

# Defining the full path to the file
output_file_path = f"/content/drive/MyDrive/Ersilia/clean_data.csv"

# Save the cleaned DataFrame to a CSV file
filter_df.to_csv(output_file_path, index=False)

In [14]:
# To confirm that the file has been saved
print("Cleaned dataset saved to:", output_file_path)

Cleaned dataset saved to: /content/drive/MyDrive/Ersilia/clean_data.csv
