In [1]:
#!pip list #To obtain requirements.txt for Python 3 ipykernel (Python 3.7.12)

In [1]:
# Notebook Summary:

# V.History: 
# Date Last Modified: 14 May 2025

# This notebook extracts chemical names from a given text (Determinand Definition or Name) using the advanced NLP library 'Spacy’. 
# Named Entity Recognition is used to gain insight into the unstructured text - 'Determinand Definition'.
# Three columns - 'Element', 'Group', and 'Metal' - are added to the distinct chemical name CSV file.
# RDKIT, CHEMDATAEXTRACTOR, and CHEMLIB are the chemical libraries explored during development. 
# ChemDataExtractor (Product of Cambridge University) is used to extract chemical properties for a given 
# Determinand Definition. Other experiments were also conducted during development.
# This notebook uses a Python version earlier than 3.8, specifically an ipykernel with Python 3.7.12.

# Note: For the advanced version of chemical name extractions (Field name: chem2_name), refer to the notebook: 10_NB_Final_chem2.ipynb

#Pre-Requisite : 
    #Kernel Python 3 (ipykernel) is required to run this notebook 
    #Required python version - Python 3.7.12 and its compatible Numpy , ScikitLearn libraries

#Old Name: 08_NB_ChemNameExtraction.ipynb

In [2]:
!python -V
python_version=!(python --version 2>&1)
print (python_version)

Python 3.7.12
['Python 3.7.12']


In [3]:
#!pip install -U pip
#!pip install -U cryptography

!pip install scikit-learn rdkit | grep -v 'already satisfied'

#!pip install dawg
#!pip install -U dawg2

#!pip install dawg2  | grep -v 'already satisfied'
!pip install -U dawg2  | grep -v 'already satisfied'

!pip install chemdataextractor  | grep -v 'already satisfied'
!pip install -U chemdataextractor  | grep -v 'already satisfied'

#!pip install chemdataextractor2 | grep -v 'already satisfied'     #Commented on 10 May 2025, Due to python version compatibility issue 
#!pip install -U chemdataextractor2 | grep -v 'already satisfied'  #Commented on 10 May 2025, Due to python version compatibility issue

!conda install git+https://github.com/maddenfederico/ChemDataExtractor2

!pip install chemdataextractor --no-dependencies  | grep -v 'already satisfied'

!pip3 install chemdataextractor2 --no-dependencies  | grep -v 'already satisfied'

!pip install -U chemlib

#!pip3 install chemdataextractor2 --no-dependencies

!pip install git+https://github.com/maddenfederico/ChemDataExtractor  | grep -v 'already satisfied'


Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - //github.com/maddenfederico/chemdataextractor2

Current channels:

  - https://conda.anaconda.org/conda-forge

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.


Collecting git+https://github.com/maddenfederico/ChemDataExtractor
  Cloning https://github.com/maddenfederico/ChemDataExtractor to /var/tmp/pip-req-build-ah8eonyq
  Running command git clone --filter=blob:none --quiet https://github.com/maddenfederico/ChemDataExtractor /var/tmp/pip-req-build-ah8eonyq
  Resolved https://github.com/maddenfederico/ChemDataExtractor to commit f1a1b523b6a0862bae29c359a9728f97ec57b11e
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finishe

In [4]:
#%run "99_NB_CommonUtils.ipynb" #Installing other required libraries
import pandas as pd
import spacy
from chemlib import PeriodicTable
from chemlib import Element
import spacy
from chemlib import chemistry
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.cluster import AgglomerativeClustering

import spacy
from spacy.tokens import doc
nlp = spacy.load('en_core_web_sm')


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#User-Defined functions
def showtime():
    import time
    from datetime import datetime
    
    t = time.localtime()
    d = datetime.now()
    current_time = time.strftime("%H:%M:%S", t)
    current_date = d.strftime("%d %B %Y")
    print(current_date, current_time)
    return()


In [6]:
#Save dataset into GCP Cloud Storage
def savedata(tDF, tname, tpath = 'gcs://rdmai_data/') :
    #tpath = 'gcs://rdmai_data/'
    tclensed = 'cleansed/'
    tDF.to_csv(tpath+tclensed+tname)
    print("saved, Location: ", tpath+tclensed+tname)
    return("saved, Location: ", tpath+tclensed+tname)

In [7]:
#Read dataset from GCP Cloud Storage
def loaddata(fpath, fname, path = 'gcs://rdmai_data/') : #added a path parameter as optional paramter  Jan 2 2025 - Pasu
    #Constants declaration for the folder path for files stored under Google Cloud Storage 
    #path = 'gcs://rdmai_data/' #added a path parameter as optional paramter Jan 2 2025, hence commented here - Pasu

    #all_files = glob.glob(os.path.join(path + raw, "*.csv"))
    #wqpath = 'Water_Quality_EA/'
    ##wqpath = '' #Make this commented when reading from Google Cloud Storage

    print (path+fpath+fname)

    #Read full clensed Set where the data is from 2000 till 2004
    retdf = pd.DataFrame()
    #showtime()
    retdf = pd.read_csv(path+fpath+fname)
    #showtime()
           
    return (retdf)

In [8]:
# Create a custom entity matcher
def extract_chemical_elements(text):
    doc = nlp(text)  # Process the text using spaCy's NER

    # Loop through each token in the text
    fnd=False
    for token in doc:
        #print(token)
        # Check if the token matches any chemical element name
        #if token.text.capitalize() in element_names: #Old method
        #    return (pdchElements[(pdchElements.Element.isin([token.text.capitalize()]))])
        #else:
        #    return(pd.DataFrame({'Element': [doc], 'Group': [99], 'Metal': ['False']}))
        if token.text.capitalize() in element_names: #Old method
            return (pdchElements[(pdchElements.Element.isin([token.text.capitalize()]))])
    if ~fnd:
        return(pd.DataFrame({'Element': [doc], 'Group': [99], 'Metal': [False]}))

In [9]:
df = pd.DataFrame()
raw = 'raw/'
curated = "curated/"
cleansed = "cleansed/"

folderpath = raw
#filename = 'Water_Quality_EA2000_2024_ChemBERT.csv'                    #Commented on 10 May 2025
filename = '03_nb_get_unique_dminands.csv'                              #Added on 10 May 2025

showtime()
#df = loaddata(folderpath, filename, path = 'gcs://rdmai_dev_data/')    #Commented on 10 May 2025
df = loaddata(cleansed, filename, path = 'gcs://rdmai_dev_data/')       #Added on 10 May 2025
showtime()

df_org = pd.read_csv('gcs://rdmai_dev_data/raw/Water_Quality_EA2000_2024_ChemBERT.csv')
df = df_org

df.head(2) 

11 May 2025 21:37:56
gcs://rdmai_dev_data/cleansed/03_nb_get_unique_dminands.csv


  import cryptography.exceptions


11 May 2025 21:37:57


Unnamed: 0.1,Unnamed: 0,determinand_notation,determinand_name,determinand_definition,result,determinand_unit_name,samplingPoint_notation,samplingPoint_name
0,0,7444,D Site Insp,Descriptive Site Inspection : Pass/Fail 1/0,1.0,coded,AN-011262,STEANE PARK THE MANOR HOUSE STW
1,4,664,Oil & Grs Vs,"Visible oil or grease, significant trace: Pres...",0.0,pres/nf,AN-011262,STEANE PARK THE MANOR HOUSE STW


In [10]:
#tempdf = df[df['sampledMaterialType_name']=='RIVER / RUNNING SURFACE WATER']
#tempdf = df.copy()
#tempdf.head(2)
df['determinand_definition'].head(10)

0          Descriptive Site Inspection : Pass/Fail 1/0
1    Visible oil or grease, significant trace: Pres...
2                                      BOD : 5 Day ATU
3                             Ammoniacal Nitrogen as N
4                           Solids, Suspended at 105 C
5                                   No flow /No sample
6                      Chemical Oxygen Demand :- {COD}
7                                             Chloride
8         Weather : Unusual Condition Flag : Scale 0-5
9                                                   pH
Name: determinand_definition, dtype: object

In [11]:
smiles_list = df['determinand_definition'].to_list()
#smiles_list = df['determinand_name'].to_list()
smiles_list[0:10]

['Descriptive Site Inspection : Pass/Fail 1/0',
 'Visible oil or grease, significant trace: Present/Not found (1/0)',
 'BOD : 5 Day ATU',
 'Ammoniacal Nitrogen as N',
 'Solids, Suspended at 105 C',
 'No flow /No sample',
 'Chemical Oxygen Demand :- {COD}',
 'Chloride',
 'Weather : Unusual Condition Flag : Scale 0-5',
 'pH']

In [12]:
print (chemistry.pte.Element.shape)

(118,)


In [13]:
# Get a list of chemical element names from the periodic table using chemlib
element_list    = [element for element in chemistry.pte]
element_names   = [element for element in chemistry.pte.Element]
element_group   = [element for element in chemistry.pte.Group]
element_symbol  = [element for element in chemistry.pte.Symbol]
element_period  = [element for element in chemistry.pte.Period]
element_metal   = [element for element in chemistry.pte.Metal]
element_natural = [element for element in chemistry.pte.Natural]

combined_list = pd.DataFrame({'element_names'   : element_names, 
                              'element_group'   : element_group,
                              'element_symbol'  : element_symbol,
                              'element_period'  : element_period,
                              'element_metal'   : element_metal,
                              'element_natural' : element_natural}) 
#print(element_list, '|', element_names, '|', element_group, '|', element_symbol, '|', element_period, '|', element_metal, '|', element_natural)
#print(combined_list)


In [14]:
chemElements = pd.DataFrame(chemistry.pte)
chemElements[:2]

pdchElements = chemElements[['Element', 'Group', 'Metal']]

# Create new rows
new_row = pd.DataFrame({'Element': ['Flow', 'ph', 'Temperature', 'Turbidity', 'Nitrite', 'Nitrate', 'Orthophosphate', 'Chlorophyll', 'Cyanide', 'Hexachlorobenzene', 'Pentachlorophenol', 'Trichlorobenzene', 'Fenitrothion', 'Dieldrin', 'Trifluralin', 'Trichlorobenzene', 'Hexachlorobutadiene', 'Isodrin', 'Aldrin', 'Fluoride', 'Trichlorophenol', 'Phenol', 'Propyzamide', 'Chlorotoluron', 'Dicamba', 'Tecnazene', 'Pirimiphos-methyl', 'Methomyl', 'Oxamyl', 'Clopyralid', 'Chloridazon', 'Carbaryl', 'Endosulfan B', 'Dimethoate', 'Carbetamide', 'Desmetryn', 'Methabenzthiazuron', 'Azinphos-ethyl', 'Methiocarb', 'Tetrachloroethane'], 
            'Group': [99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99], 
            #40'Metal': ['Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others'
            'Metal': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False  
                     ]})
pdchElements = pd.concat([pdchElements, new_row], ignore_index=False)

new_row = pd.DataFrame({'Element': ['Solid', 'BOD', 'Orthophosphate', 'Chloride', 'Conductivity', 'Odour', 'Phenol', 'Sulphate', 'DDT', 'Endrin', 'Azinphos-methyl', 'DDE', 'Dichloroethane', 'TDE', 'HCH', 'Chloro', 'Dimethylphenol', 'PAH', 'PCB', 'Toxicity', 'Other-Grain Size', 'Algal', 'PHI', 'Density', 'BS EN'],
            'Group': [99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99], 
            #25'Metal': ['Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others','Others'
            'Metal': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False
                        ]})
pdchElements = pd.concat([pdchElements, new_row], ignore_index=False)

# Append rows
pdchElements = pd.concat([pdchElements, new_row], ignore_index=False)
element_names   = [element for element in pdchElements.Element]
pdchElements[:3]

Unnamed: 0,Element,Group,Metal
0,Hydrogen,1,False
1,Helium,18,False
2,Lithium,1,True


In [15]:
element_names[:3]

['Hydrogen', 'Helium', 'Lithium']

In [16]:
# Extract Chemical name for Determinand and its related group & Metal type?
extracted_elements = pd.DataFrame(columns=['Element','Group','Metal'])
i=0
for text in smiles_list:
    #print(text)
    extracted_elements = pd.concat([extracted_elements, extract_chemical_elements(text).head(1)], axis=0, ignore_index=True)
    #df = pd.concat([df, extracted_elements], ignore_index=False)
    #if (i<10):
    #    print (extracted_elements) 
    #    i=i+1
extracted_elements[:10]
#df.head(10)

Unnamed: 0,Element,Group,Metal
0,"(Descriptive, Site, Inspection, :, Pass, /, Fa...",99,False
1,"(Visible, oil, or, grease, ,, significant, tra...",99,False
2,"(BOD, :, 5, Day, ATU)",99,False
3,Nitrogen,15,False
4,"(Solids, ,, Suspended, at, 105, C)",99,False
5,Flow,99,False
6,Oxygen,16,False
7,Chloride,99,False
8,"(Weather, :, Unusual, Condition, Flag, :, Scal...",99,False
9,(pH),99,False


In [17]:
smiles_list[:10]

['Descriptive Site Inspection : Pass/Fail 1/0',
 'Visible oil or grease, significant trace: Present/Not found (1/0)',
 'BOD : 5 Day ATU',
 'Ammoniacal Nitrogen as N',
 'Solids, Suspended at 105 C',
 'No flow /No sample',
 'Chemical Oxygen Demand :- {COD}',
 'Chloride',
 'Weather : Unusual Condition Flag : Scale 0-5',
 'pH']

In [18]:
#extracted_elements.reindex()
df['Chemical_Names'] = extracted_elements['Element']
df['Chemical_Group'] = extracted_elements['Group']
df['Metal'] = extracted_elements['Metal']
df.head(5)

Unnamed: 0.1,Unnamed: 0,determinand_notation,determinand_name,determinand_definition,result,determinand_unit_name,samplingPoint_notation,samplingPoint_name,Chemical_Names,Chemical_Group,Metal
0,0,7444,D Site Insp,Descriptive Site Inspection : Pass/Fail 1/0,1.0,coded,AN-011262,STEANE PARK THE MANOR HOUSE STW,"(Descriptive, Site, Inspection, :, Pass, /, Fa...",99,False
1,4,664,Oil & Grs Vs,"Visible oil or grease, significant trace: Pres...",0.0,pres/nf,AN-011262,STEANE PARK THE MANOR HOUSE STW,"(Visible, oil, or, grease, ,, significant, tra...",99,False
2,38,85,BOD ATU,BOD : 5 Day ATU,7.5,mg/l,AN-011396,BUCKINGHAM GOLF CLUB TINGEWICK RD.,"(BOD, :, 5, Day, ATU)",99,False
3,39,111,Ammonia(N),Ammoniacal Nitrogen as N,10.4,mg/l,AN-011396,BUCKINGHAM GOLF CLUB TINGEWICK RD.,Nitrogen,15,False
4,40,135,Sld Sus@105C,"Solids, Suspended at 105 C",10.0,mg/l,AN-011396,BUCKINGHAM GOLF CLUB TINGEWICK RD.,"(Solids, ,, Suspended, at, 105, C)",99,False


In [19]:
# Save the clustered data
#df.to_csv('08_F_CSV_ChemNameExtraction.csv', index=True)
#savedata(df, "08_F_CSV_ChemNameExtraction.csv")                           #Commented on 10 May 2025
savedata(df, "04_nb_extract_chem_name.csv", tpath='gcs://rdmai_dev_data/') #Added on 10 May 2025

saved, Location:  gcs://rdmai_dev_data/cleansed/04_nb_extract_chem_name.csv


('saved, Location: ',
 'gcs://rdmai_dev_data/cleansed/04_nb_extract_chem_name.csv')

In [20]:
#dbutils.notebook.exit("End Workload - Scrip stopped")

In [21]:
#END CARD