# Testing the Water Data for submission into 2024 Spring Water submission
This is a revised notebook of Water_20240515.ipynb for 2025 data call. It validates the related water data, and do the tests on Nebraska Water system. 

In [30]:
import pandas as pd
from libraries import general
# importlib.reload(libraries.general)

In [32]:
counties = general.get_Counties_FIPS_with_requests('NE')

## Required Libraries
These are the required libraries for validation. Will later be added into a separate library

In [33]:
# First attempt to creat Data class models
import pandas as pd
from datetime import date
from typing import Optional, List, Literal
from pydantic import BaseModel, ValidationError, Field, conint, confloat, constr, validator
import datetime

In [34]:
import pydantic

version = pydantic.__version__
print(version)


1.10.7


In [55]:
# ['RowIdentifier', 'PWSIDNumber', 'YearAssociatedTo', 'YearPulled',
#       'PWSName', 'PrincipalCountyServedFIPS', 'PrincipalCityFeatureID',
#       'TotalConnections', 'SystemPopulation', 'PrimarySourceCode', 'Latitude',
#       'Longitude', 'LocationDerivationCode']

class PWS_Inventory(BaseModel):
    RowIdentifier: int
    StateFIPSCode: int
    PWSIDNumber: constr(regex=r'^NE\d{7}') #Change NE to represent your state code
    
    YearAssociatedTo: conint(ge=1999, le=2025) 
    YearPulled: conint(ge=1999, le=2025)
    
    PWSName: str #Should it have distinction between Unknows and Not Submitted? or just be blank?
    
    PrincipalCountyServedFIPS: str

    @validator('PrincipalCountyServedFIPS')
    def check_PrincipalCountyServedFIPS(cls, v):
        allowed_values = counties['fips'].tolist()
        if v not in allowed_values:
            raise ValueError('PrincipalCountyServedFIPS must be a valid FIPS code')
        return v      

    PrincipalCityFeatureID: int # ????How to get it from the introduced source?

    TotalConnections: conint(ge=1, le=9999999)
    SystemPopulation: conint(ge=10, le=99999999)
    PrimarySourceCode: Literal['GU', 'GUP', 'GW', 'GWP', 'SW', 'SWP', 'U', 'NS']

    # For Nebraska in NAD83
    Latitude: confloat(ge= 39.999998, le=43.001702) 
    Longitude: confloat(ge= -104.053514, le=-95.308290)
    LocationDerivationCode: Literal['SA', 'MFL', 'PCS', 'GSH','O', '-999', '-888']

        
# ['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'DateSampled',
#        'AggregationType', 'NumSamplingLocations', 'SummaryTimePeriod',
#        'NumSamples', 'NumNonDetects', 'ConcentrationUnits', 'Concentration']
class Sampling_Summary(BaseModel):
    RowIdentifier: int
    PWSIDNumber: constr(regex=r'^NE\d{7}') #for Nebraska

    Year: conint(ge=1999, le=2024)
    
    
    AnalyteCode: Literal['1005', '2050', '2456', '2950', '2039', '1038', '1040', '2987', 
    '2984', '4010', '4006']
    ConcentrationUnits: Literal['ug/l', 'mg/l','pci/l'] # TODO: Apply the rules of what Analyte each applies to
    Concentration: float

    DateSampled: datetime.date #validate to be from 1/1/1999 to the latest complete year

    AggregationType: Literal['X', 'MX']
    NumSamplingLocations: conint(ge=1, le=9999) #TODO: '-888' for Not Submitted
    SummaryTimePeriod: str #TODO: look into its Data Dictionary
    NumSamples: int
    NumNonDetects: int



class Sampling(BaseModel):
    RowIdentifier: int
    PWSIDNumber: constr(regex=r'^NE\d{7}') #for Nebraska

    Year: conint(ge=1999, le=2025)
    
    
    AnalyteCode: Literal['1005', '2050', '2456', '2950', '2039','1038', '1040', '2987', 
    '2984', '4010', '4006']
    # check if ConcentrationUnits is one of the strings in this list, make the list case insensitive
    ConcentrationUnits: Literal['ug/l', 'mg/l','pci/l'] # TODO: Apply the rules of what Analyte each applies to

    # check if Concentration is a float and is greater than or equal to 0
    Concentration: confloat(ge=0.0)

    DateSampled: datetime.date #validate to be from 1/1/1999 to the latest complete year


## Checking the PWS_Inventory


In [None]:
inventory = pd.read_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/PWSInventory_2024.xlsx')

In [73]:
inventory.PWSIDNumber.nunique()

593

In [74]:
# Check for duplicates
inventory[inventory.duplicated(subset=['PWSIDNumber', 'YearAssociatedTo'], keep=False)]

Unnamed: 0,StateFIPSCode,PWSIDNumber,YearAssociatedTo,YearPulled,PWSName,PrincipalCountyServedName,PrincipalCountyServed FIPS,PrincipalCityName,﻿PrincipalCityFeatureId,TotalConnections,SystemPopulation,PrimarySourceCode,Horiz_Ref_Datum,Latitude,Longitude,LocationDerivationCode


In [75]:
inventory.columns.tolist()

['StateFIPSCode',
 'PWSIDNumber',
 'YearAssociatedTo',
 'YearPulled',
 'PWSName',
 'PrincipalCountyServedName',
 'PrincipalCountyServed FIPS',
 'PrincipalCityName',
 '\ufeffPrincipalCityFeatureId',
 'TotalConnections',
 'SystemPopulation',
 'PrimarySourceCode',
 'Horiz_Ref_Datum',
 'Latitude',
 'Longitude',
 'LocationDerivationCode']

In [76]:
#inventory.columns = inventory.columns.str.replace('\ufeff', '')

#Change the names of several columns to match the names in the validator
inventory.rename(columns={'PrincipalCountyServed FIPS': 'PrincipalCountyServedFIPS', '\ufeffPrincipalCityFeatureId': 'PrincipalCityFeatureID'}, inplace=True)

In [77]:
# Adding a uique Identifier, as first column, for each row
inventory.insert(0, 'RowIdentifier', inventory.index)
#inventory['RowIdentifier'] = inventory.index

In [78]:
inventory.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/PWSInventory_2024.csv', index=False)

In [79]:
rm_column = list(set(inventory.columns) - set((PWS_Inventory.__fields__.keys())))

In [80]:
set(PWS_Inventory.__fields__.keys()) - set(inventory.columns)

set()

In [81]:
inventory.Horiz_Ref_Datum.isna().sum()

np.int64(27)

In [82]:
inventory.drop(columns=rm_column, inplace=True)

In [83]:
# For 2024
inventory.columns

Index(['RowIdentifier', 'StateFIPSCode', 'PWSIDNumber', 'YearAssociatedTo',
       'YearPulled', 'PWSName', 'PrincipalCountyServedFIPS',
       'PrincipalCityFeatureID', 'TotalConnections', 'SystemPopulation',
       'PrimarySourceCode', 'Latitude', 'Longitude', 'LocationDerivationCode'],
      dtype='object')

In [84]:
inventory.shape

(593, 14)

In [85]:
# This one is the column names for 2023 data call
# inventory.columns

In [86]:
PWS_Inventory.__fields__.keys()

dict_keys(['RowIdentifier', 'StateFIPSCode', 'PWSIDNumber', 'YearAssociatedTo', 'YearPulled', 'PWSName', 'PrincipalCountyServedFIPS', 'PrincipalCityFeatureID', 'TotalConnections', 'SystemPopulation', 'PrimarySourceCode', 'Latitude', 'Longitude', 'LocationDerivationCode'])

In [87]:
# Doing the validation for each row as a PWS_Inventory object
valid_rows = []
# Creat a dictionary that contains the RowIdentifier of the invalid rows and the error message
invalid_rows = {}
for index, row in inventory.iterrows():
    
    try:
        PWS_Inventory(**row)
        # If passeed, add RowIdentifier into valid_rows list
        valid_rows.append(row['RowIdentifier'])

    except ValidationError as e:
        # If failed, add RowIdentifier and the error message into invalid_rows dictionary
        invalid_rows[row['RowIdentifier']] = e.errors

        print(e)

3 validation errors for PWS_Inventory
Latitude
  ensure this value is greater than or equal to 39.999998 (type=value_error.number.not_ge; limit_value=39.999998)
Longitude
  ensure this value is greater than or equal to -104.053514 (type=value_error.number.not_ge; limit_value=-104.053514)
LocationDerivationCode
  unexpected value; permitted: 'SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888' (type=value_error.const; given=nan; permitted=('SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888'))
3 validation errors for PWS_Inventory
Latitude
  ensure this value is greater than or equal to 39.999998 (type=value_error.number.not_ge; limit_value=39.999998)
Longitude
  ensure this value is greater than or equal to -104.053514 (type=value_error.number.not_ge; limit_value=-104.053514)
LocationDerivationCode
  unexpected value; permitted: 'SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888' (type=value_error.const; given=nan; permitted=('SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888'))
3 validation errors for PWS_

In [88]:
invalid_rows

{43: <bound method ValidationError.errors of ValidationError(model='PWS_Inventory', errors=[{'loc': ('Latitude',), 'msg': 'ensure this value is greater than or equal to 39.999998', 'type': 'value_error.number.not_ge', 'ctx': {'limit_value': 39.999998}}, {'loc': ('Longitude',), 'msg': 'ensure this value is greater than or equal to -104.053514', 'type': 'value_error.number.not_ge', 'ctx': {'limit_value': -104.053514}}, {'loc': ('LocationDerivationCode',), 'msg': "unexpected value; permitted: 'SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888'", 'type': 'value_error.const', 'ctx': {'given': nan, 'permitted': ('SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888')}}])>,
 48: <bound method ValidationError.errors of ValidationError(model='PWS_Inventory', errors=[{'loc': ('Latitude',), 'msg': 'ensure this value is greater than or equal to 39.999998', 'type': 'value_error.number.not_ge', 'ctx': {'limit_value': 39.999998}}, {'loc': ('Longitude',), 'msg': 'ensure this value is greater than or equal to -104.

In [89]:
# Take the errorous rows of the inventory dataframe from key values in invalid_rows dictionary
errorous_rows = inventory[inventory['RowIdentifier'].isin(invalid_rows.keys())]

In [90]:
# ok. Let's take care of it to change those values to -999 following the HTG, then will run data validator again
inventory[inventory['RowIdentifier'].isin(invalid_rows.keys())]

Unnamed: 0,RowIdentifier,StateFIPSCode,PWSIDNumber,YearAssociatedTo,YearPulled,PWSName,PrincipalCountyServedFIPS,PrincipalCityFeatureID,TotalConnections,SystemPopulation,PrimarySourceCode,Latitude,Longitude,LocationDerivationCode
43,43,31,NE3121486,2024,2025,BELLEVUE TERRACE MHC LLC,31153,827304,94,357,SWP,,,
48,48,31,NE3110910,2024,2025,"BENNET, VILLAGE OF",31109,-888,428,1084,GWP,,,
50,50,31,NE3121227,2024,2025,BIC JOINT WATER AGENCY,31065,827204,3,355,GW,,,
138,138,31,NE3110704,2024,2025,"CROFTON, CITY OF",31107,828463,368,754,SWP,,,
139,139,31,NE3120824,2024,2025,CROOKED CREEK HOA,31109,837279,33,64,GW,,,
180,180,31,NE3121485,2024,2025,EAGLE WAY MHC LLC,31025,828917,53,99,GWP,,,
278,278,31,NE3121481,2024,2025,K & K MANUFACTURED HOME COMMUNITY,31001,829848,50,180,GWP,,,
310,310,31,NE3121368,2024,2025,LOWER BIG BLUE NRD - WYMORE,31067,834893,228,856,GWP,,,
336,336,31,NE3121363,2024,2025,MEADOWBROOK ESTATES WATER SYSTEM,31055,835483,266,675,SWP,,,
361,361,31,NE3121478,2024,2025,NEBRASKAN MANUFACTURED HOME COMMUNITY,31001,829848,61,108,GWP,,,


In [63]:
errorous_rows.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/PWSInventory_2025_errors.csv', index=False)

In [91]:
mask = inventory['RowIdentifier'].isin(invalid_rows.keys())

inventory.loc[mask, 'LocationDerivationCode'] = '-999'
inventory.loc[mask, 'Longitude'] = -999
inventory.loc[mask, 'Latitude'] = -99.99

In [95]:
inventory.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/toSubmit_2025/PWSInventory.csv')

## Checking the Sampling
This is the latest file (The unaggregated)

In [125]:
import pandas as pd
sampling = pd.read_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/PWSSampleResults_2024.xlsx')

In [126]:
sampling.columns

Index(['PWSIDNumber', 'Year', 'AnalyteName', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled', 'SamplePointID',
       'DetectionLimit', 'DetectionLimitUom', 'NonDetectFlag'],
      dtype='object')

In [127]:
# check for duplicates, and add them into a separate dataframe

duplicates = sampling[sampling.duplicated(subset=['PWSIDNumber', 'Year', 'AnalyteCode', 'DateSampled', 'SamplePointID'], keep=False)]

In [102]:
duplicates.to_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/toSubmit_2025/duplicates_samples.xlsx')

In [128]:
Sampling.__fields__.keys()

dict_keys(['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'ConcentrationUnits', 'Concentration', 'DateSampled'])

In [129]:
sampling.AnalyteCode.value_counts()

AnalyteCode
1038    4496
1005    1259
2050    1193
2039    1193
2984    1006
2987    1006
4010     747
2950     682
2456     682
4006     208
Name: count, dtype: int64

In [105]:
# count the number of rows for each year, adding heading to the output
sampling.Year.value_counts().to_frame('Number of Rows')

Unnamed: 0_level_0,Number of Rows
Year,Unnamed: 1_level_1
2024,6676
2023,5796


In [130]:
# For AnalyteCode Replace all 1038 values with 1040
# sampling['AnalyteCode'] = sampling['AnalyteCode'].replace(1038, 1040)

# Delete all rows with 1041 as AnalyteCode, which are only NITRITE tests
# sampling = sampling[sampling['AnalyteCode'] != 1041] #Contained only 17 rows

In [131]:
sampling['ConcentrationUnits'] = sampling['ConcentrationUnits'].str.lower()

In [132]:
del_cols = list(set(sampling.columns) - set(Sampling.__fields__.keys()))

# Remove the columns that are not in the Sampling validator
sampling_validation = sampling.drop(columns=del_cols)

sampling_validation.insert(0, 'RowIdentifier', sampling.index)

In [133]:
sampling

Unnamed: 0,PWSIDNumber,Year,AnalyteName,AnalyteCode,ConcentrationUnits,Concentration,DateSampled,SamplePointID,DetectionLimit,DetectionLimitUom,NonDetectFlag
0,NE3113903,2023,NITRATE-NITRITE,1038,mg/l,7.420,2023-01-24,G-113667,0.0,,0
1,NE3106105,2023,NITRATE-NITRITE,1038,mg/l,7.990,2023-01-25,010,0.0,,0
2,NE3108101,2023,NITRATE-NITRITE,1038,mg/l,2.320,2023-01-25,G-028307,0.0,,0
3,NE3108101,2023,NITRATE-NITRITE,1038,mg/l,8.650,2023-01-25,G-028309,0.0,,0
4,NE3108101,2023,NITRATE-NITRITE,1038,mg/l,7.350,2023-01-25,G-028310R,0.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
12467,NE3112702,2024,COMBINED RADIUM (-226 & -228),4010,pci/l,0.816,2024-12-17,011,,,0
12468,NE3112708,2024,COMBINED RADIUM (-226 & -228),4010,pci/l,0.749,2024-12-17,G-106126,,,0
12469,NE3112702,2024,COMBINED RADIUM (-226 & -228),4010,pci/l,0.749,2024-12-17,G-106126,,,0
12470,NE3112708,2024,COMBINED RADIUM (-226 & -228),4010,pci/l,1.100,2024-12-17,G-040958B,,,0


In [134]:
print(sampling_validation.columns)
print(Sampling.__fields__.keys())

Index(['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled'],
      dtype='object')
dict_keys(['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'ConcentrationUnits', 'Concentration', 'DateSampled'])


In [135]:
# change type of colum AnlyteCode to string
sampling_validation['AnalyteCode'] = sampling_validation['AnalyteCode'].astype(str)

In [136]:
# now validating the sampling dataframe
# Doing the validation for each row as a PWS_Inventory object
valid_rows_sampling = []
# Creat a dictionary that contains the RowIdentifier of the invalid rows and the error message
invalid_rows_sampling = {}
for index, row in sampling_validation.iterrows():
    
    try:
        Sampling(**row)
        # If passeed, add RowIdentifier into valid_rows list
        valid_rows_sampling.append(row['RowIdentifier'])

    except ValidationError as e:
        # If failed, add RowIdentifier and the error message into invalid_rows dictionary
        invalid_rows_sampling[row['RowIdentifier']] = e.errors

        #print(e)

In [137]:
invalid_rows_sampling

{}

In [138]:
sampling.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024_2025/toSubmit_2025/Sample_results_2024.csv', index=False)

In [139]:
# Looks like there are more undefined AnalyteCodes in the sampling dataframe
# Let's see what they are
sampling.AnalyteCode.value_counts().to_frame("Number of rows")

Unnamed: 0_level_0,Number of rows
AnalyteCode,Unnamed: 1_level_1
1038,4496
1005,1259
2050,1193
2039,1193
2984,1006
2987,1006
4010,747
2950,682
2456,682
4006,208


In [119]:
# What are allowable AnalyteCodes as defined in the Sampling class
Sampling.__fields__['AnalyteCode'].type_


typing.Literal['1005', '2050', '2456', '2950', '2039', '1038', '1040', '2987', '2984', '4010', '4006']

### How to Aggregate into Sampling Results

Each community water system:
* annual mean and max concentration of:

--- arsenic, disinfection byproducts (HAA5 and TTHM), 

--- nitrates, 

--- atrazine, 

--- di(2-ethylhexyl) phthalate (DEHP), 

--- radium, 

--- tetrachloroethene (tetrachloroethylene) (PCE), 

--- trichloroethene (trichloroethylene) (TCE), and 

--- uranium


* Mean concentration per quarter 

--- Nitrate

---- Atrazine

In [120]:
set(inventory.PWSIDNumber.unique()) - set(sampling.PWSIDNumber.unique())

{'NE3102302',
 'NE3102703',
 'NE3102707',
 'NE3104308',
 'NE3113901',
 'NE3117304',
 'NE3117307',
 'NE3117903',
 'NE3120031',
 'NE3120358',
 'NE3121445'}

In [121]:
sampling.columns

Index(['PWSIDNumber', 'Year', 'AnalyteName', 'AnalyteCode',
       'ConcentrationUnits', 'Concentration', 'DateSampled', 'SamplePointID',
       'DetectionLimit', 'DetectionLimitUom', 'NonDetectFlag'],
      dtype='object')

In [122]:
sampling.Year.value_counts()

Year
2024    6676
2023    5796
Name: count, dtype: int64

In [123]:
sampling.head()

Unnamed: 0,PWSIDNumber,Year,AnalyteName,AnalyteCode,ConcentrationUnits,Concentration,DateSampled,SamplePointID,DetectionLimit,DetectionLimitUom,NonDetectFlag
0,NE3113903,2023,NITRATE-NITRITE,1040,mg/l,7.42,2023-01-24,G-113667,0.0,,0
1,NE3106105,2023,NITRATE-NITRITE,1040,mg/l,7.99,2023-01-25,010,0.0,,0
2,NE3108101,2023,NITRATE-NITRITE,1040,mg/l,2.32,2023-01-25,G-028307,0.0,,0
3,NE3108101,2023,NITRATE-NITRITE,1040,mg/l,8.65,2023-01-25,G-028309,0.0,,0
4,NE3108101,2023,NITRATE-NITRITE,1040,mg/l,7.35,2023-01-25,G-028310R,0.0,,0


In [124]:
sampling.dtypes

PWSIDNumber                   object
Year                           int64
AnalyteName                   object
AnalyteCode                    int64
ConcentrationUnits            object
Concentration                float64
DateSampled           datetime64[ns]
SamplePointID                 object
DetectionLimit               float64
DetectionLimitUom             object
NonDetectFlag                  int64
dtype: object