In [None]:
'''
/// @file CAS_Validation_Notebook.ipynb
/// @author Austin Vandegriffe
/// @date 2020-04-20
/// @brief Search CAS number and return name and standardized number
/// @pre A CSV with CAS numbers and ingredient names is required
/// @style K&R, and "one true brace style" (OTBS), and '_' variable naming
/////////////////////////////////////////////////////////////////////
/// @references
/// ## N/A
/////////////////////////////////////////////////////////////////////
/// @notes
/// ## 1. This may have to run overnight with large datasets to prevent
/// ####  the CAS querier from being detected by anti-crawler defenses.
'''

In [None]:
import pandas as pd
import numpy as np

# For sleeping
import time

# Progressbar
from tqdm import tqdm

# Web crawler used to look up CAS numbers.
# One querier instance is created here and reused for every search in the
# validation loop.
import cas_query
querier = cas_query.CAS_Querier()

In [None]:
# Dataframe for VALid CAS numbers.
## Every row read from the CSV starts out here; rows are removed later
## if their CAS number turns out to be missing or invalid.
## The validation loop reads the "CASNumber" column and overwrites the
## "IngredientName" and "CASNumber" columns.
df_val = pd.read_csv("mock_cas_dataset.csv")

# Dataframe for invalid CAS numbers, i.e. rows DROPped from df_val.
## Starts empty, with the same column labels as df_val.
df_drop = pd.DataFrame(columns=df_val.columns)

In [None]:
# Validate all UNIQUE CAS numbers
## Progress is shown with a tqdm progressbar.
##
## For every distinct value in df_val["CASNumber"]:
##   * missing (NaN/None) or blank values are moved to df_drop without
##     querying anything;
##   * values rejected by the querier (InvalidCASError / IndexError) are
##     moved to df_drop;
##   * accepted values get "IngredientName" and "CASNumber" overwritten with
##     the first and second items of the search result, so name and number
##     format become uniform.


def _split_rows(frame, mask):
    """Return (rows of `frame` where `mask` is False, rows where it is True)."""
    # .copy() makes the kept part an independent frame, so later .loc
    # assignments on it do not trigger SettingWithCopyWarning.
    return frame.loc[~mask].copy(), frame.loc[mask]


unique_cas_numbers = df_val["CASNumber"].unique()

# Dropped rows are gathered here and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop is quadratic).
dropped_chunks = []
try:
    for cas in tqdm(unique_cas_numbers):
        # Empty CSV cells are read as NaN: NaN has no len() and never
        # compares equal to itself, so it needs isna() instead of ==.
        cas_is_missing = pd.isna(cas)
        if cas_is_missing:
            same_cas = df_val["CASNumber"].isna()
        else:
            same_cas = df_val["CASNumber"] == cas

        if cas_is_missing or (isinstance(cas, str) and not cas.strip()):
            # No CAS number, drop from valid dataframe
            df_val, removed = _split_rows(df_val, same_cas)
            dropped_chunks.append(removed)
            continue

        try:
            # Search databases for the CAS number
            query_result = querier.search(cas)
        except (cas_query.InvalidCASError, IndexError):
            # CAS number is invalid, drop from valid dataframe
            df_val, removed = _split_rows(df_val, same_cas)
            dropped_chunks.append(removed)
        else:
            # The CAS number is valid:
            ## make ingredient name and number format uniform.
            df_val.loc[same_cas, "IngredientName"] = query_result[0]
            df_val.loc[same_cas, "CASNumber"] = query_result[1]

        # Mimic real user, add a random delay of 10-29 seconds.
        ## Done after EVERY search attempt (rejected numbers included) so a
        ## run of invalid CAS numbers does not hit the websites back-to-back.
        time.sleep(np.random.randint(10, 30))
finally:
    # Runs even if the loop is interrupted, so rows already removed from
    # df_val are never lost. Empty frames are skipped so they cannot
    # influence the dtypes of the concatenated result.
    non_empty_chunks = [
        chunk for chunk in (df_drop, *dropped_chunks) if not chunk.empty
    ]
    if non_empty_chunks:
        df_drop = pd.concat(non_empty_chunks)