In [1]:
import pandas as pd

In [2]:
import io
import pandas as pd
import requests

def csv_from_get_request(url) -> pd.DataFrame:
    """
    Download CSV text with a GET request and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        URL of the extraction endpoint, including any query string.

    Returns
    -------
    df : pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing the error page as CSV.
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text), low_memory=False)


In [108]:
# Load the us-counties.csv file from the nytimes/covid-19-data GitHub repository.
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
df = csv_from_get_request(url)

In [109]:
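# Validation log layout, one row per test run (the 'massage' column holds the result message):
# date, validate_test, validate_flag, massage, error_level
# 2022-06-05, cases_vs_deaths, fail, Death counts cannot exceed case counts., WARNING

# 2022-06-05, cases_vs_deaths, pass, Successful, INFO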

In [111]:
def cases_vs_deaths(df):
    """Check that no row reports more deaths than cases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with comparable 'cases' and 'deaths' columns.

    Returns
    -------
    pandas.DataFrame
        One-row validation record with the columns 'date', 'validate_test',
        'validate_flag', 'massage' and 'error_level'.
    """
    failure_message = "Death counts cannot exceed case counts."
    result_columns = ['date', 'validate_test', 'validate_flag', 'massage', 'error_level']
    date_now = pd.to_datetime('today').date()

    # A row breaks the rule only when deaths are strictly greater than cases.
    # Comparisons against NaN evaluate to False, so a row with a missing value
    # is not counted as a violation by this test.
    deaths_exceed_cases = (df['deaths'] > df['cases']).any()

    if not deaths_exceed_cases:
        print('Data test cases_vs_deaths passed.')
        return pd.DataFrame([[date_now, 'cases_vs_deaths', 'pass', 'Successful', 'INFO']],
                            columns=result_columns)

    print(f'Data test cases_vs_deaths failed. {failure_message}')
    return pd.DataFrame([[date_now, 'cases_vs_deaths', 'fail', failure_message, 'WARNING']],
                        columns=result_columns)
    
def unique_records(df):
    """Verify that no ('date', 'fips') pair occurs on more than one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'date' and 'fips' columns.

    Returns
    -------
    pandas.DataFrame
        One-row validation record with the columns 'date', 'validate_test',
        'validate_flag', 'massage' and 'error_level'.
    """
    result_columns = ['date', 'validate_test', 'validate_flag', 'massage', 'error_level']
    today = pd.to_datetime('today').date()

    # Any row flagged as a repeat of an earlier (date, fips) pair means the
    # de-duplicated row count would be smaller than the full row count.
    has_repeats = df.duplicated(subset=['date', 'fips']).any()

    if has_repeats:
        problem = "Only one record per FIPs, per date allowed."
        print(f'Data test unique_records failed. {problem}')
        record = [today, 'unique_records', 'fail', problem, 'WARNING']
    else:
        print('Data test unique_records passed.')
        record = [today, 'unique_records', 'pass', 'Successful', 'INFO']

    return pd.DataFrame([record], columns=result_columns)
    

def no_nulls_test(df):
    """Verify that the frame contains no null value in any column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan for missing values.

    Returns
    -------
    pandas.DataFrame
        One-row validation record with the columns 'date', 'validate_test',
        'validate_flag', 'massage' and 'error_level'.
    """
    result_columns = ['date', 'validate_test', 'validate_flag', 'massage', 'error_level']
    today = pd.to_datetime('today').date()

    # True as soon as a single cell anywhere in the frame is null.
    found_null = df.isnull().to_numpy().any()

    if found_null:
        problem = "All values are expected to be non-null."
        print(f'Data test no_nulls_test failed. {problem}')
        record = [today, 'no_nulls_test', 'fail', problem, 'CRITICAL']
    else:
        print('Data test no_nulls_test passed.')
        record = [today, 'no_nulls_test', 'pass', 'Successful', 'INFO']

    return pd.DataFrame([record], columns=result_columns)
    

def range_test(series, min, max):
    """Tell whether every value of ``series`` lies in ``[min, max]``.

    Both bounds are inclusive. A value that compares False against either
    bound (for example NaN) makes the result False.
    """
    inside_bounds = series.between(min, max)
    return inside_bounds.all()

def cases_range_test(df):
    """Verify that every 'cases' value lies between 0 and 10M, inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'cases' column.

    Returns
    -------
    pandas.DataFrame
        One-row validation record with the columns 'date', 'validate_test',
        'validate_flag', 'massage' and 'error_level'.
    """
    result_columns = ['date', 'validate_test', 'validate_flag', 'massage', 'error_level']
    today = pd.to_datetime('today').date()

    in_range = range_test(df['cases'], 0, 10e6)

    if not in_range:
        problem = "Cases must be non-negative and <= 10M."
        print(f'Data test cases_range_test failed. {problem}')
        return pd.DataFrame([[today, 'cases_range_test', 'fail', problem, 'CRITICAL']],
                            columns=result_columns)

    print('Data test cases_range_test passed.')
    return pd.DataFrame([[today, 'cases_range_test', 'pass', 'Successful', 'INFO']],
                        columns=result_columns)

        
def deaths_range_test(df):
    """Check that every 'deaths' value lies between 0 and 100K, inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'deaths' column.

    Returns
    -------
    pandas.DataFrame
        One-row validation record with the columns 'date', 'validate_test',
        'validate_flag', 'massage' and 'error_level'.
    """
    failure_message = "Deaths must be non-negative and <= 100K."
    result_columns = ['date', 'validate_test', 'validate_flag', 'massage', 'error_level']
    date_now = pd.to_datetime('today').date()

    # Upper bound is 100K, as stated in the failure message; the previous
    # bound of 10e6 was the limit used for case counts.
    if range_test(df['deaths'], 0, 100_000):
        print('Data test deaths_range_test passed.')
        return pd.DataFrame([[date_now, 'deaths_range_test', 'pass', 'Successful', 'INFO']],
                            columns=result_columns)

    print(f'Data test deaths_range_test failed. {failure_message}')
    return pd.DataFrame([[date_now, 'deaths_range_test', 'fail', failure_message, 'CRITICAL']],
                        columns=result_columns)

        

def append_to_validate_dataframe(df):
    """Run every validation check on ``df`` and stack the one-row results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed unchanged to each check.

    Returns
    -------
    pandas.DataFrame
        The check results concatenated in execution order, with a fresh
        0..n-1 index.
    """
    # Order matters: it fixes both the printed log order and the row order.
    checks = (
        cases_vs_deaths,
        unique_records,
        no_nulls_test,
        cases_range_test,
        deaths_range_test,
    )
    result_frames = [check(df) for check in checks]
    return pd.concat(result_frames, ignore_index=True)


# Run all validation checks on the loaded data; the bare name on the last
# line displays the resulting table as the cell output.
print("Dataframe for validate info:")
validate_df= append_to_validate_dataframe(df)
validate_df

Dataframe for validate info:
Data test cases_vs_deaths failed. Death counts cannot exceed case counts.
Data test unique_records failed. Only one record per FIPs, per date allowed.
Data test no_nulls_test failed. All values are expected to be non-null.
Data test cases_range_test passed.
Data test deaths_range_test failed. Deaths must be non-negative and <= 100K.


Unnamed: 0,date,validate_test,validate_flag,massage,error_level
0,2023-06-05,cases_vs_deaths,fail,Death counts cannot exceed case counts.,WARNING
1,2023-06-05,unique_records,fail,"Only one record per FIPs, per date allowed.",WARNING
2,2023-06-05,no_nulls_test,fail,All values are expected to be non-null.,CRITICAL
3,2023-06-05,cases_range_test,pass,Successful,INFO
4,2023-06-05,deaths_range_test,fail,Deaths must be non-negative and <= 100K.,CRITICAL


In [121]:
def nyt_cases_counties_validate_check(validate_df):
    """Print each validation result and report whether every test passed.

    Parameters
    ----------
    validate_df : pandas.DataFrame
        Validation results, one row per test, with the columns
        'validate_test', 'validate_flag', 'massage' and 'error_level'.

    Returns
    -------
    bool
        True when the number of rows flagged 'pass' equals the total number
        of rows (so an empty frame yields True), False otherwise.
    """
    for _, row in validate_df.iterrows():
        if row['validate_flag'] == 'pass':
            print(f"Data test {row['validate_test']} passed.")
        elif row['validate_flag'] == 'fail' and row['error_level'] == 'CRITICAL':
            # Only CRITICAL failures are prefixed with their level.
            print(f"[{row['error_level']}]: Data test {row['validate_test']} failed. {row['massage']}")
        else:
            print(f"Data test {row['validate_test']} failed. {row['massage']}")

    # Computed once, after the loop: the totals do not depend on the current
    # row, and doing it here keeps them defined when the frame has no rows.
    pass_result = len(validate_df[validate_df['validate_flag'] == 'pass'])
    all_result = len(validate_df)

    print()
    print(f'{pass_result}/{all_result} passed.')
    return pass_result == all_result

# Print the per-test summary; the returned boolean is shown as the cell output.
nyt_cases_counties_validate_check(validate_df)

Data test cases_vs_deaths failed. Death counts cannot exceed case counts.
Data test unique_records failed. Only one record per FIPs, per date allowed.
[CRITICAL]: Data test no_nulls_test failed. All values are expected to be non-null.
Data test cases_range_test passed.
[CRITICAL]: Data test deaths_range_test failed. Deaths must be non-negative and <= 100K.

1/5 passed.


False

In [None]:
# DEBUG - Detailed information, typically of interest only when diagnosing problems.
# INFO - Confirmation that things are working as expected.
# WARNING - An indication that something unexpected happened, or indicative of some problem in the near future (e.g. ‘disk space low’). The software is still working as expected.
# ERROR - Due to a more serious problem, the software has not been able to perform some function.
# CRITICAL - A serious error, indicating that the program itself may be unable to continue running.