In [None]:
'''
/// @file CAS_Validation_Notebook.ipynb
/// @author Austin Vandegriffe
/// @date 2020-04-20
/// @brief Search CAS number and return name and standardized number
/// @pre A CSV with CAS numbers and ingredient names is required
/// @style K&R, "one true brace style" (OTBS), and '_' variable naming
/////////////////////////////////////////////////////////////////////
/// @references
/// ## N/A
/////////////////////////////////////////////////////////////////////
/// @notes
/// ## 1. Lookups are deliberately throttled with random delays so that the
/// ####  CAS querier is not detected by anti-crawler defenses; with large
/// ####  datasets the run may therefore have to be left going overnight.
'''

In [1]:
import pandas as pd
import numpy as np

# For sleeping
import time

# Progressbar
from tqdm.notebook import tqdm

# Web crawler
# NOTE(review): cas_query is not a standard-library module; presumably a
# project-local helper that must be importable from the notebook's directory
# -- confirm.
import cas_query
# Single querier instance shared by the whole notebook; the validation loop
# calls querier.search(cas) on it once per unique CAS number.
querier = cas_query.CAS_Querier()

In [2]:
# df_val: rows whose CAS number is (still) considered VALid.
# It starts out as the complete input dataset.
df_val = pd.read_csv(filepath_or_buffer="mock_cas_dataset.csv")

# df_drop: rows with invalid CAS numbers, i.e. rows DROPped from df_val.
# It starts out empty, with the same column labels as df_val.
df_drop = pd.DataFrame(columns=df_val.columns)

In [3]:
# Preview the first five rows of df_val (head() defaults to n=5).
df_val.head()

Unnamed: 0,IngredientName,CASNumber
0,"Distillates, petroleum, hydrotreated light",64742-47-8
1,Potassium hydroxide,1310-58-3
2,Aluminum Oxide,1344-28-1
3,Listed Below,Listed Below
4,1-(Benzyl)quinolinium chloride,15619-48-4


In [4]:
# Validate all UNIQUE CAS numbers
## Each unique value is looked up once; every row sharing that value is then
## either standardized in place (stays in df_val) or moved to df_drop.
## tqdm renders a progress bar over the unique values.
unique_cas_numbers = df_val["CASNumber"].unique()

# Work on an explicit copy so the .loc assignments below never write into a
# slice of another frame (avoids SettingWithCopyWarning).
df_val = df_val.copy()

# Dropped row groups are collected here and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and a single concat avoids
# re-copying df_drop for every invalid number.
dropped_groups = []

for cas in tqdm(unique_cas_numbers):
    # Empty CSV cells are read by pandas as NaN (a float): len() raises
    # TypeError on it and `== cas` never matches it, so handle NaN explicitly.
    cas_is_missing = pd.isna(cas)
    if cas_is_missing:
        cas_rows = df_val["CASNumber"].isna()
    else:
        cas_rows = df_val["CASNumber"] == cas

    if cas_is_missing or len(str(cas).strip()) == 0:
        # No CAS number, drop from valid dataframe (no query is made).
        dropped_groups.append(df_val.loc[cas_rows])
        df_val = df_val.loc[~cas_rows].copy()
        continue

    try:
        # Search databases for the CAS number
        lookup = querier.search(cas)
    except (cas_query.InvalidCASError, IndexError):
        # CAS number is invalid, drop from valid dataframe
        dropped_groups.append(df_val.loc[cas_rows])
        df_val = df_val.loc[~cas_rows].copy()
    else:
        # The CAS number is valid: make ingredient name and number format
        # uniform. The code relies on lookup[0] being the name and lookup[1]
        # the standardized number.
        df_val.loc[cas_rows, "IngredientName"] = lookup[0]
        df_val.loc[cas_rows, "CASNumber"] = lookup[1]

    # Mimic a real user: wait a random 10-29 s after every attempted lookup.
    ## A rejected lookup may also have hit the remote site (cannot tell from
    ## here), so throttle after failures as well as successes.
    time.sleep(np.random.randint(10, 30))

# Merge everything dropped in this run into df_drop. Empty frames are skipped
# so that concat does not have to infer dtypes from empty inputs.
frames_to_merge = [frame for frame in [df_drop, *dropped_groups] if not frame.empty]
if frames_to_merge:
    df_drop = pd.concat(frames_to_merge)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [5]:
# Preview the first five rows still held in df_val (head() defaults to n=5).
df_val.head()

Unnamed: 0,IngredientName,CASNumber
0,C13-14 isoparaffin,0064742-47-8
1,Potassium hydroxide,0001310-58-3
2,aluminium oxide,0001344-28-1
4,"Quinolinium, 1-(phenylmethyl)-, chloride (1:1)",0015619-48-4
5,Water,7732-18-5


In [6]:
# Preview the first five rows held in df_drop (head() defaults to n=5).
df_drop.head()

Unnamed: 0,IngredientName,CASNumber
3,Listed Below,Listed Below
7,Listed Below,Listed Below
25,Listed Below,Listed Below
26,Listed Below,Listed Below
29,Listed Below,Listed Below
