In [1]:
import pandas as pd
df_reach = pd.read_csv('/localhome/cschiebroek/ShuZe/vp/data cleaning/reach_Vapour_4.csv', sep = "|", encoding='latin-1', header = 1)
df_reach.head()

Unnamed: 0,#name,cas,iupacName,P (upperQualifier),P (upperValue),P (lowerQualifier),P (lowerValue),P (unit),T (upperQualifier),T (upperValue),T (lowerQualifier),T (lowerValue),T (unit)
0,Ô_-alanine,107-95-9,3-Aminopropionic acid,,,,5.06e-08,mm Hg,,,,25.0,C
1,Ô_-alanine,107-95-9,3-Aminopropionic acid,,,,6.74e-06,Pa,,,,25.0,C
2,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,0.344,hPa,,,,20.0,C
3,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,0.4,hPa,,,,20.0,C
4,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,0.9,hPa,,,,20.0,C


# Consistency cecking
## General check
### identifying missing fields and checking the readability of the existing fields

In [2]:
#needs to have a value for P and T
print(len(df_reach))
df_reach = df_reach.dropna(subset=['P (lowerValue)'])
df_reach = df_reach.dropna(subset=['T (lowerValue)'])
#drop if no value for #name, cas AND iupacName
print(len(df_reach))
df_reach = df_reach.dropna(subset=['#name', 'cas', 'iupacName'], how='all')
print(len(df_reach))

6133
6025
6025


### perform a CASRN checksum check

In [3]:
def casrn_checksum(cas):
    #build in check: if NaN, return NaN
    if cas != cas:
        return cas
    #cas is a string seperated by -. get the first two parts
    cas = cas.split('-')
    if len(cas) != 3:
        return False
    cas_iterable = cas[0]+cas[1]
    cas_modulo = cas[2]
    #now go backwards through the string and multiply by increasing number, starting from one, and sum up
    #iterate through the string backwards
    cas_iterable = cas_iterable[::-1]
    checksum = 0
    for i in range(len(cas_iterable)):
        checksum += int(cas_iterable[i])*(i+1)
    #now take the modulo 10 of the checksum
    checksum = checksum%10
    #check if this is the same as the third part of the original cas
    if checksum == int(cas_modulo):
        return True
    else:
        return False
#perform this on the cas column, drop if false, NaN if no cas
print(len(df_reach))
df_reach = df_reach[df_reach['cas'].map(casrn_checksum)]
print(len(df_reach))


6025


ValueError: Cannot mask with non-boolean array containing NA / NaN values

### identifying duplicates in each field separately (i.e. MolBlock, SMILES, CASRN, name);

In [None]:
df_reach['identifier_duplicates'] = df_reach.duplicated(subset=['#name', 'cas', 'iupacName'], keep=False)

Unnamed: 0,#name,cas,iupacName,P (upperQualifier),P (upperValue),P (lowerQualifier),P (lowerValue),P (unit),T (upperQualifier),T (upperValue),T (lowerQualifier),T (lowerValue),T (unit),identifier_duplicates
0,Ô_-alanine,107-95-9,3-Aminopropionic acid,,,,5.060000e-08,mm Hg,,,,25.0,C,True
1,Ô_-alanine,107-95-9,3-Aminopropionic acid,,,,6.740000e-06,Pa,,,,25.0,C,True
2,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,3.440000e-01,hPa,,,,20.0,C,True
3,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,4.000000e-01,hPa,,,,20.0,C,True
4,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,9.000000e-01,hPa,,,,20.0,C,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6127,vinyl laurate,2146-71-6,Vinyl laurate,,,,2.800000e-01,Pa,,,,25.0,C,True
6128,vinyl laurate,2146-71-6,Vinyl laurate,,,,7.110000e-01,Pa,,,,25.0,C,True
6130,vinylene carbonate,872-36-6,"1,3-dioxol-2-one",,,,3.350000e+02,Pa,,,,25.0,C,False
6131,warfarin,81-81-2,(RS)-4-hydroxy-3-(3-oxo-1-phenylbutyl)coumarin,,,,3.470000e-03,Pa,,,,20.0,C,False


In [6]:
import cirpy
#make columns: name_smiles, cas_smiles, iupac_smiles. use cirpy to get smiles from name, cas, iupac
def get_smiles(row):
    #if there is a name, get the smiles from the name
    if row['#name'] == row['#name']:
        try:
            compound = cirpy.resolve(row['#name'], 'smiles')
            return compound
        except:
            return False
    #if there is a cas, get the smiles from the cas
    elif row['cas'] == row['cas']:
        try:
            compound = cirpy.resolve(row['cas'], 'smiles')
            return compound
        except:
            return False
    #if there is a iupac, get the smiles from the iupac
    elif row['iupacName'] == row['iupacName']:
        try:
            compound = cirpy.resolve(row['iupacName'], 'smiles')
            return compound
        except:
            return False
    else:
        return False
df_reach['name_smiles'] = df_reach.apply(get_smiles, axis=1)
df_reach['cas_smiles'] = df_reach.apply(get_smiles, axis=1)
df_reach['iupac_smiles'] = df_reach.apply(get_smiles, axis=1)
df_reach.head()

Unnamed: 0,#name,cas,iupacName,P (upperQualifier),P (upperValue),P (lowerQualifier),P (lowerValue),P (unit),T (upperQualifier),T (upperValue),T (lowerQualifier),T (lowerValue),T (unit),name_smiles,cas_smiles,iupac_smiles
0,Ô_-alanine,107-95-9,3-Aminopropionic acid,,,,5.06e-08,mm Hg,,,,25.0,C,,,
1,Ô_-alanine,107-95-9,3-Aminopropionic acid,,,,6.74e-06,Pa,,,,25.0,C,,,
2,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,0.344,hPa,,,,20.0,C,,,
3,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,0.4,hPa,,,,20.0,C,,,
4,Ô_-butyrolactone,96-48-0,oxolan-2-one,,,,0.9,hPa,,,,20.0,C,,,


In [7]:
df_reach.to_csv('df_reach_identifier_smiles.csv')

## Checks:
1. Stereochemistry
2. Hypervalency
3. Duplicate structures
4. Identifiers
5. Mismatches between structure representations 


