# Testing the Water Data for submission into 2024 Spring Water submission
This is a revised notebook of Water_20230406.ipynb for 2024 data call. It validates the related water data, and do the tests on Nebraska Water system. 

In [53]:
import pandas as pd
from libraries import general

In [54]:
counties = general.get_Counties_FIPS('NE')

## Required Libraries
These are the required libraries for validation. Will later be added into a separate library

In [55]:
# First attempt to creat Data class models
import pandas as pd
from datetime import date
from typing import Optional, List, Literal
from pydantic import BaseModel, ValidationError, Field, conint, confloat, constr, validator
import datetime

In [56]:
import pydantic

version = pydantic.__version__
print(version)


1.10.7


In [57]:
# ['RowIdentifier', 'PWSIDNumber', 'YearAssociatedTo', 'YearPulled',
#       'PWSName', 'PrincipalCountyServedFIPS', 'PrincipalCityFeatureID',
#       'TotalConnections', 'SystemPopulation', 'PrimarySourceCode', 'Latitude',
#       'Longitude', 'LocationDerivationCode']

class PWS_Inventory(BaseModel):
    RowIdentifier: int
    StateFIPSCode: int
    PWSIDNumber: constr(regex=r'^NE\d{7}') #Change NE to represent your state code
    
    YearAssociatedTo: conint(ge=1999, le=2024) 
    YearPulled: conint(ge=1999, le=2024)
    
    PWSName: str #Should it have distinction between Unknows and Not Submitted? or just be blank?
    
    PrincipalCountyServedFIPS: str

    @validator('PrincipalCountyServedFIPS')
    def check_PrincipalCountyServedFIPS(cls, v):
        allowed_values = counties['fips'].tolist()
        if v not in allowed_values:
            raise ValueError('PrincipalCountyServedFIPS must be a valid FIPS code')
        return v      

    PrincipalCityFeatureID: int # ????How to get it from the introduced source?

    TotalConnections: conint(ge=1, le=9999999)
    SystemPopulation: conint(ge=10, le=99999999)
    PrimarySourceCode: Literal['GU', 'GUP', 'GW', 'GWP', 'SW', 'SWP', 'U', 'NS']

    # For Nebraska in NAD83
    Latitude: confloat(ge= 39.999998, le=43.001702) 
    Longitude: confloat(ge= -104.053514, le=-95.308290)
    LocationDerivationCode: Literal['SA', 'MFL', 'PCS', 'GSH','O', '-999', '-888']

        
# ['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'DateSampled',
#        'AggregationType', 'NumSamplingLocations', 'SummaryTimePeriod',
#        'NumSamples', 'NumNonDetects', 'ConcentrationUnits', 'Concentration']
class Sampling_Summary(BaseModel):
    RowIdentifier: int
    PWSIDNumber: constr(regex=r'^NE\d{7}') #for Nebraska

    Year: conint(ge=1999, le=2024)
    
    
    AnalyteCode: Literal['1005', '2050', '2456', '2950', '2039', '1038', '1040', '2987', 
    '2984', '4010', '4006']
    ConcentrationUnits: Literal['ug/l', 'mg/l','pci/l'] # TODO: Apply the rules of what Analyte each applies to
    Concentration: float

    DateSampled: datetime.date #validate to be from 1/1/1999 to the latest complete year

    AggregationType: Literal['X', 'MX']
    NumSamplingLocations: conint(ge=1, le=9999) #TODO: '-888' for Not Submitted
    SummaryTimePeriod: str #TODO: look into its Data Dictionary
    NumSamples: int
    NumNonDetects: int



class Sampling(BaseModel):
    RowIdentifier: int
    PWSIDNumber: constr(regex=r'^NE\d{7}') #for Nebraska

    Year: conint(ge=1999, le=2024)
    
    
    AnalyteCode: Literal['1005', '2050', '2456', '2950', '2039','1038', '1040', '2987', 
    '2984', '4010', '4006']
    # check if ConcentrationUnits is one of the strings in this list, make the list case insensitive
    ConcentrationUnits: Literal['ug/l', 'mg/l','pci/l'] # TODO: Apply the rules of what Analyte each applies to

    # check if Concentration is a float and is greater than or equal to 0
    Concentration: confloat(ge=0.0)

    DateSampled: datetime.date #validate to be from 1/1/1999 to the latest complete year


## Checking the PWS_Inventory


In [58]:
inventory = pd.read_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSInventory2022_2023.xlsx')

In [59]:
inventory.PWSIDNumber.nunique()

593

In [60]:
# Check for duplicates
inventory[inventory.duplicated(subset=['PWSIDNumber', 'YearAssociatedTo'], keep=False)]

Unnamed: 0,StateFIPSCode,PWSIDNumber,YearAssociatedTo,YearPulled,PWSName,PrincipalCountyServedName,PrincipalCountyServed FIPS,PrincipalCityName,﻿PrincipalCityFeatureId,TotalConnections,SystemPopulation,PrimarySourceCode,Horiz_Ref_Datum,Latitude,Longitude,LocationDerivationCode


In [61]:
inventory.columns.tolist()

['StateFIPSCode',
 'PWSIDNumber',
 'YearAssociatedTo',
 'YearPulled',
 'PWSName',
 'PrincipalCountyServedName',
 'PrincipalCountyServed FIPS',
 'PrincipalCityName',
 '\ufeffPrincipalCityFeatureId',
 'TotalConnections',
 'SystemPopulation',
 'PrimarySourceCode',
 'Horiz_Ref_Datum',
 'Latitude',
 'Longitude',
 'LocationDerivationCode']

In [62]:
#inventory.columns = inventory.columns.str.replace('\ufeff', '')

#Change the names of several columns to match the names in the validator
inventory.rename(columns={'PrincipalCountyServed FIPS': 'PrincipalCountyServedFIPS', '\ufeffPrincipalCityFeatureId': 'PrincipalCityFeatureID'}, inplace=True)

In [64]:
# Adding a uique Identifier, as first column, for each row
inventory.insert(0, 'RowIdentifier', inventory.index)
#inventory['RowIdentifier'] = inventory.index

In [65]:
inventory.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSInventory_latest.csv', index=False)

In [67]:
rm_column = list(set(inventory.columns) - set((PWS_Inventory.__fields__.keys())))

In [68]:
set(PWS_Inventory.__fields__.keys()) - set(inventory.columns)

set()

In [69]:
inventory.Horiz_Ref_Datum.isna().sum()

28

In [70]:
inventory.drop(columns=rm_column, inplace=True)

In [71]:
# For 2024
inventory.columns

Index(['RowIdentifier', 'StateFIPSCode', 'PWSIDNumber', 'YearAssociatedTo',
       'YearPulled', 'PWSName', 'PrincipalCountyServedFIPS',
       'PrincipalCityFeatureID', 'TotalConnections', 'SystemPopulation',
       'PrimarySourceCode', 'Latitude', 'Longitude', 'LocationDerivationCode'],
      dtype='object')

In [72]:
inventory.shape

(593, 14)

In [73]:
# This one is the column names for 2023 data call
# inventory.columns

In [74]:
PWS_Inventory.__fields__.keys()

dict_keys(['RowIdentifier', 'StateFIPSCode', 'PWSIDNumber', 'YearAssociatedTo', 'YearPulled', 'PWSName', 'PrincipalCountyServedFIPS', 'PrincipalCityFeatureID', 'TotalConnections', 'SystemPopulation', 'PrimarySourceCode', 'Latitude', 'Longitude', 'LocationDerivationCode'])

In [75]:
# Doing the validation for each row as a PWS_Inventory object
valid_rows = []
# Creat a dictionary that contains the RowIdentifier of the invalid rows and the error message
invalid_rows = {}
for index, row in inventory.iterrows():
    
    try:
        PWS_Inventory(**row)
        # If passeed, add RowIdentifier into valid_rows list
        valid_rows.append(row['RowIdentifier'])

    except ValidationError as e:
        # If failed, add RowIdentifier and the error message into invalid_rows dictionary
        invalid_rows[row['RowIdentifier']] = e.errors

        print(e)

3 validation errors for PWS_Inventory
Latitude
  ensure this value is greater than or equal to 39.999998 (type=value_error.number.not_ge; limit_value=39.999998)
Longitude
  ensure this value is greater than or equal to -104.053514 (type=value_error.number.not_ge; limit_value=-104.053514)
LocationDerivationCode
  unexpected value; permitted: 'SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888' (type=value_error.const; given=nan; permitted=('SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888'))
3 validation errors for PWS_Inventory
Latitude
  ensure this value is greater than or equal to 39.999998 (type=value_error.number.not_ge; limit_value=39.999998)
Longitude
  ensure this value is greater than or equal to -104.053514 (type=value_error.number.not_ge; limit_value=-104.053514)
LocationDerivationCode
  unexpected value; permitted: 'SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888' (type=value_error.const; given=nan; permitted=('SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888'))
3 validation errors for PWS_

In [76]:
invalid_rows

{43: <bound method ValidationError.errors of ValidationError(model='PWS_Inventory', errors=[{'loc': ('Latitude',), 'msg': 'ensure this value is greater than or equal to 39.999998', 'type': 'value_error.number.not_ge', 'ctx': {'limit_value': 39.999998}}, {'loc': ('Longitude',), 'msg': 'ensure this value is greater than or equal to -104.053514', 'type': 'value_error.number.not_ge', 'ctx': {'limit_value': -104.053514}}, {'loc': ('LocationDerivationCode',), 'msg': "unexpected value; permitted: 'SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888'", 'type': 'value_error.const', 'ctx': {'given': nan, 'permitted': ('SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888')}}])>,
 48: <bound method ValidationError.errors of ValidationError(model='PWS_Inventory', errors=[{'loc': ('Latitude',), 'msg': 'ensure this value is greater than or equal to 39.999998', 'type': 'value_error.number.not_ge', 'ctx': {'limit_value': 39.999998}}, {'loc': ('Longitude',), 'msg': 'ensure this value is greater than or equal to -104.

In [77]:
# Take the errorous rows of the inventory dataframe from key values in invalid_rows dictionary
errorous_rows = inventory[inventory['RowIdentifier'].isin(invalid_rows.keys())]

In [78]:
errorous_rows

Unnamed: 0,RowIdentifier,StateFIPSCode,PWSIDNumber,YearAssociatedTo,YearPulled,PWSName,PrincipalCountyServedFIPS,PrincipalCityFeatureID,TotalConnections,SystemPopulation,PrimarySourceCode,Latitude,Longitude,LocationDerivationCode
43,43,31,NE3121486,2023,2024,BELLEVUE TERRACE MHC LLC,31153,827304,94,357,SWP,,,
48,48,31,NE3110910,2023,2024,"BENNET, VILLAGE OF",31109,-888,428,1084,GWP,,,
50,50,31,NE3121227,2023,2024,BIC JOINT WATER AGENCY,31065,827204,3,355,GW,,,
127,127,31,NE3121429,2023,2024,COTTONWOOD TERRACE,31111,831719,243,562,GWP,,,
138,138,31,NE3110704,2023,2024,"CROFTON, CITY OF",31107,828463,368,754,SWP,,,
139,139,31,NE3120824,2023,2024,CROOKED CREEK WATER SYSTEM,31109,837279,33,64,GW,,,
180,180,31,NE3121485,2023,2024,EAGLE WAY MHC LLC,31025,828917,53,99,GWP,,,
278,278,31,NE3121481,2023,2024,K & K MANUFACTURED HOME COMMUNITY,31001,829848,50,180,GWP,,,
310,310,31,NE3121368,2023,2024,LOWER BIG BLUE NRD - WYMORE,31067,834893,228,856,GWP,,,
336,336,31,NE3121363,2023,2024,MEADOWBROOK ESTATES WATER SYSTEM,31055,835483,266,675,SWP,,,


In [79]:
errorous_rows.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSInventory_2024_errors.csv', index=False)

## Checking the Sampling
This is the latest file (The unaggregated)

In [83]:
import pandas as pd
sampling = pd.read_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSSampleResults2022-2023.xlsx')

In [81]:
sampling.columns

Index(['PWSIDNumber', 'Year', 'AnalyteName', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled', 'SamplePointID',
       'DetectionLimit', 'DetectionLimitUom', 'NonDetectFlag'],
      dtype='object')

In [84]:
# check for duplicates, and add them into a separate dataframe

duplicates = sampling[sampling.duplicated(subset=['PWSIDNumber', 'Year', 'AnalyteCode', 'DateSampled', 'SamplePointID'], keep=False)]

In [32]:
duplicates.to_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/duplicates_samples.xlsx')

In [85]:
sampling.columns

Index(['PWSIDNumber', 'Year', 'AnalyteName', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled', 'SamplePointID',
       'DetectionLimit', 'DetectionLimitUom', 'NonDetectFlag'],
      dtype='object')

In [86]:
Sampling.__fields__.keys()

dict_keys(['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'ConcentrationUnits', 'Concentration', 'DateSampled'])

In [87]:
sampling.AnalyteCode.value_counts()

1038    4340
1005    1175
2050    1110
2039    1110
2987    1016
2984    1016
2950     696
2456     694
4010     433
4006     167
1041       4
Name: AnalyteCode, dtype: int64

In [88]:
# count the number of rows for each year, adding heading to the output
sampling.Year.value_counts().to_frame('Number of Rows')

Unnamed: 0,Number of Rows
2022,5968
2023,5793


In [89]:
# For AnalyteCode Replace all 1038 values with 1040
#sampling['AnalyteCode'] = sampling['AnalyteCode'].replace(1038, 1040)

# Delete all rows with 1041 as AnalyteCode, which are only NITRITE tests
sampling = sampling[sampling['AnalyteCode'] != 1041] #Contained only 17 rows

In [90]:
sampling['ConcentrationUnits'] = sampling['ConcentrationUnits'].str.lower()

In [91]:
del_cols = list(set(sampling.columns) - set(Sampling.__fields__.keys()))

# Remove the columns that are not in the Sampling validator
sampling_validation = sampling.drop(columns=del_cols)

sampling_validation.insert(0, 'RowIdentifier', sampling.index)

In [92]:
print(sampling_validation.columns)
print(Sampling.__fields__.keys())

Index(['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled'],
      dtype='object')
dict_keys(['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'ConcentrationUnits', 'Concentration', 'DateSampled'])


In [93]:
# change type of colum AnlyteCode to string
sampling_validation['AnalyteCode'] = sampling_validation['AnalyteCode'].astype(str)

In [94]:
# now validating the sampling dataframe
# Doing the validation for each row as a PWS_Inventory object
valid_rows_sampling = []
# Creat a dictionary that contains the RowIdentifier of the invalid rows and the error message
invalid_rows_sampling = {}
for index, row in sampling_validation.iterrows():
    
    try:
        Sampling(**row)
        # If passeed, add RowIdentifier into valid_rows list
        valid_rows_sampling.append(row['RowIdentifier'])

    except ValidationError as e:
        # If failed, add RowIdentifier and the error message into invalid_rows dictionary
        invalid_rows_sampling[row['RowIdentifier']] = e.errors

        #print(e)

In [95]:
invalid_rows_sampling

{}

In [96]:
sampling.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/Sample_results_2024.csv', index=False)

In [97]:
# Looks like there are more undefined AnalyteCodes in the sampling dataframe
# Let's see what they are
sampling.AnalyteCode.value_counts()

1038    4340
1005    1175
2050    1110
2039    1110
2987    1016
2984    1016
2950     696
2456     694
4010     433
4006     167
Name: AnalyteCode, dtype: int64

In [98]:
# What are allowable AnalyteCodes as defined in the Sampling class
Sampling.__fields__['AnalyteCode'].type_


typing.Literal['1005', '2050', '2456', '2950', '2039', '1038', '1040', '2987', '2984', '4010', '4006']

### How to Aggregate into Sampling Results

Each community water system:
* annual mean and maz concentration of:

--- arsenic, disinfection byproducts (HAA5 and TTHM), 

--- nitrates, 

--- atrazine, 

--- di(2-ethylhexyl) phthalate (DEHP), 

--- radium, 

--- tetrachloroethene (tetrachloroethylene) (PCE), 

--- trichloroethene (trichloroethylene) (TCE), and 

--- uranium


* Mean concentration per quarter 

--- Nitrate

---- Atrazine

In [101]:
set(inventory.PWSIDNumber.unique()) - set(sampling.PWSIDNumber.unique())

{'NE3102703',
 'NE3102707',
 'NE3104308',
 'NE3113901',
 'NE3117304',
 'NE3117307',
 'NE3117903',
 'NE3120031',
 'NE3120358'}

In [102]:
sampling.columns

Index(['PWSIDNumber', 'Year', 'AnalyteName', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled', 'SamplePointID',
       'DetectionLimit', 'DetectionLimitUom', 'NonDetectFlag'],
      dtype='object')

In [103]:
sampling.Year.value_counts()

2022    5964
2023    5793
Name: Year, dtype: int64

In [104]:
sampling.head()

Unnamed: 0,PWSIDNumber,Year,AnalyteName,AnalyteCode,ConcentrationUnits,Concentration,DateSampled,SamplePointID,DetectionLimit,DetectionLimitUom,NonDetectFlag
0,NE3115509,2022,TETRACHLOROETHYLENE,2987,ug/l,0.25,2022-01-04,001,0.5,UG/L,1
1,NE3115509,2022,TRICHLOROETHYLENE,2984,ug/l,0.25,2022-01-04,001,0.5,UG/L,1
2,NE3108101,2022,TETRACHLOROETHYLENE,2987,ug/l,0.25,2022-01-03,G-028307,0.5,UG/L,1
3,NE3103505,2022,TETRACHLOROETHYLENE,2987,ug/l,0.25,2022-01-03,G-073387,0.5,UG/L,1
4,NE3103505,2022,TETRACHLOROETHYLENE,2987,ug/l,0.25,2022-01-03,G-074035,0.5,UG/L,1


In [105]:
sampling.dtypes

PWSIDNumber                   object
Year                           int64
AnalyteName                   object
AnalyteCode                    int64
ConcentrationUnits            object
Concentration                float64
DateSampled           datetime64[ns]
SamplePointID                 object
DetectionLimit               float64
DetectionLimitUom             object
NonDetectFlag                  int64
dtype: object