# Testing the Water Data for the Spring 2024 Water Submission
This is a revised version of the notebook Water_20230406.ipynb for the 2024 data call. It validates the related water data and runs the tests on the Nebraska water system data.

In [None]:
import pandas as pd
from libraries import general

In [None]:
# Look up the county FIPS table for Nebraska ('NE'). The PWS_Inventory validator
# reads counties['fips'] to check PrincipalCountyServedFIPS.
# NOTE(review): presumably a DataFrame-like object with a 'fips' column — confirm in libraries.general.
counties = general.get_Counties_FIPS('NE')

## Required Libraries
These are the libraries required for validation. They will later be moved into a separate library.

In [None]:
# First attempt to create data class models
import pandas as pd
from datetime import date
from typing import Optional, List, Literal
from pydantic import BaseModel, ValidationError, Field, conint, confloat, constr, validator
import datetime

In [None]:
import pydantic

# Print the installed pydantic version.
# NOTE(review): the models in this notebook use constr(regex=...), @validator and
# __fields__, which look like the pydantic v1 API — confirm the installed major version is 1.x.
version = pydantic.__version__
print(version)


In [None]:
# ['RowIdentifier', 'PWSIDNumber', 'YearAssociatedTo', 'YearPulled',
#       'PWSName', 'PrincipalCountyServedFIPS', 'PrincipalCityFeatureID',
#       'TotalConnections', 'SystemPopulation', 'PrimarySourceCode', 'Latitude',
#       'Longitude', 'LocationDerivationCode']

class PWS_Inventory(BaseModel):
    """Validation schema for one row of the public water system inventory.

    The PWSID prefix and the latitude/longitude bounds are specific to
    Nebraska; adjust them when validating another state's data.
    """

    RowIdentifier: int
    StateFIPSCode: int
    # State prefix followed by exactly seven digits. The pattern is anchored at
    # the end as well: pydantic v1 applies it with re.match, which only anchors
    # the start, so without `$` an ID with extra trailing characters would pass.
    PWSIDNumber: constr(regex=r'^NE\d{7}$')  # Change NE to represent your state code

    YearAssociatedTo: conint(ge=1999, le=2024)
    YearPulled: conint(ge=1999, le=2024)

    # TODO: should Unknown and Not Submitted be distinguished, or just be blank?
    PWSName: str

    PrincipalCountyServedFIPS: str

    @validator('PrincipalCountyServedFIPS')
    def check_PrincipalCountyServedFIPS(cls, v):
        """Reject county codes that are not listed in the global `counties` lookup.

        Raises:
            ValueError: if `v` is not one of counties['fips'].
        """
        # NOTE(review): the field is declared as str, so counties['fips'] must
        # hold strings (not ints) for any value to match — confirm.
        allowed_values = counties['fips'].tolist()
        if v not in allowed_values:
            raise ValueError('PrincipalCountyServedFIPS must be a valid FIPS code')
        return v

    PrincipalCityFeatureID: int  # TODO: how to get it from the introduced source?

    TotalConnections: conint(ge=1, le=9999999)
    SystemPopulation: conint(ge=10, le=99999999)
    PrimarySourceCode: Literal['GU', 'GUP', 'GW', 'GWP', 'SW', 'SWP', 'U', 'NS']

    # For Nebraska in NAD83
    Latitude: confloat(ge=39.999998, le=43.001702)
    Longitude: confloat(ge=-104.053514, le=-95.308290)
    LocationDerivationCode: Literal['SA', 'MFL', 'PCS', 'GSH', 'O', '-999', '-888']

        
# ['RowIdentifier', 'PWSIDNumber', 'Year', 'AnalyteCode', 'DateSampled',
#        'AggregationType', 'NumSamplingLocations', 'SummaryTimePeriod',
#        'NumSamples', 'NumNonDetects', 'ConcentrationUnits', 'Concentration']
class Sampling_Summary(BaseModel):
    """Validation schema for one row of aggregated (summary) sampling results."""

    RowIdentifier: int
    # 'NE' followed by exactly seven digits. The end anchor is required because
    # pydantic v1 applies the pattern with re.match, which only anchors the
    # start; without `$` an ID with extra trailing characters would pass.
    PWSIDNumber: constr(regex=r'^NE\d{7}$')  # for Nebraska

    Year: conint(ge=1999, le=2024)

    AnalyteCode: Literal['1005', '2050', '2456', '2950', '2039', '1038', '1040', '2987',
                         '2984', '4010', '4006']
    # TODO: Apply the rules of what Analyte each unit applies to
    ConcentrationUnits: Literal['ug/l', 'mg/l', 'pci/l']
    Concentration: float

    # TODO: validate to be from 1/1/1999 to the latest complete year
    DateSampled: datetime.date

    AggregationType: Literal['X', 'MX']
    NumSamplingLocations: conint(ge=1, le=9999)  # TODO: '-888' for Not Submitted
    SummaryTimePeriod: str  # TODO: look into its Data Dictionary
    NumSamples: int
    NumNonDetects: int



class Sampling(BaseModel):
    """Validation schema for one row of unaggregated sample results."""

    RowIdentifier: int
    # 'NE' followed by exactly seven digits. The end anchor is required because
    # pydantic v1 applies the pattern with re.match, which only anchors the
    # start; without `$` an ID with extra trailing characters would pass.
    PWSIDNumber: constr(regex=r'^NE\d{7}$')  # for Nebraska

    Year: conint(ge=1999, le=2024)

    AnalyteCode: Literal['1005', '2050', '2456', '2950', '2039', '1038', '1040', '2987',
                         '2984', '4010', '4006']
    # Literal matching is case-sensitive, so units must already be lower-case
    # when they reach the model.
    # TODO: Apply the rules of what Analyte each unit applies to
    ConcentrationUnits: Literal['ug/l', 'mg/l', 'pci/l']

    # Must be a float greater than or equal to 0
    Concentration: confloat(ge=0.0)

    # TODO: validate to be from 1/1/1999 to the latest complete year
    DateSampled: datetime.date


## Checking the PWS_Inventory


In [None]:
# Load the PWS inventory workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific.
inventory = pd.read_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSInventory2022_2023.xlsx')

In [None]:
# Number of distinct PWSIDNumber values in the inventory
inventory.PWSIDNumber.nunique()

In [None]:
# Show every inventory row that shares its (PWSIDNumber, YearAssociatedTo) pair
# with another row; keep=False marks all members of a duplicate group.
inventory.loc[inventory.duplicated(subset=['PWSIDNumber', 'YearAssociatedTo'], keep=False)]

In [None]:
# List the column names as read from the workbook (before the rename in the next cell)
inventory.columns.tolist()

In [None]:
#inventory.columns = inventory.columns.str.replace('\ufeff', '')

# Change the names of several columns to match the field names in the validator.
# PrincipalCityFeatureId is mapped both with and without a leading byte-order
# mark ('\ufeff'): rename() silently ignores keys that are not present, so with
# only the BOM variant listed a clean header would be left un-renamed and then
# be missing from the model's fields.
inventory.rename(
    columns={
        'PrincipalCountyServed FIPS': 'PrincipalCountyServedFIPS',
        '\ufeffPrincipalCityFeatureId': 'PrincipalCityFeatureID',
        'PrincipalCityFeatureId': 'PrincipalCityFeatureID',
    },
    inplace=True,
)

In [None]:
# Add a unique identifier for each row as the first column, taken from the DataFrame index.
# NOTE(review): DataFrame.insert raises ValueError if 'RowIdentifier' already
# exists, so re-running this cell on the same frame fails.
inventory.insert(0, 'RowIdentifier', inventory.index)
#inventory['RowIdentifier'] = inventory.index

In [None]:
# Save the renamed inventory (with RowIdentifier) as CSV, without the index.
# This runs before the non-model columns are dropped, so all columns are written.
inventory.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSInventory_latest.csv', index=False)

In [None]:
# Inventory columns that are not declared as fields on the PWS_Inventory model
rm_column = list(set(inventory.columns) - set(PWS_Inventory.__fields__))

In [None]:
# Model fields that have no matching column in the inventory
set(PWS_Inventory.__fields__) - set(inventory.columns)

In [None]:
# Count missing Horiz_Ref_Datum values; this column is not a PWS_Inventory
# field, so it is among the columns dropped in the next cell.
inventory.Horiz_Ref_Datum.isna().sum()

In [None]:
# Drop, in place, every column that is not a field of PWS_Inventory
inventory.drop(columns=rm_column, inplace=True)

In [None]:
# For 2024: the columns that remain after dropping the non-model columns
inventory.columns

In [None]:
# (rows, columns) of the trimmed inventory
inventory.shape

In [None]:
# This one is the column names for 2023 data call
# inventory.columns

In [None]:
# Field names declared on the PWS_Inventory model
PWS_Inventory.__fields__.keys()

In [None]:
# Validate each inventory row by constructing a PWS_Inventory object from it
valid_rows = []
# Maps the RowIdentifier of each invalid row to pydantic's list of error details
invalid_rows = {}
for index, row in inventory.iterrows():

    try:
        PWS_Inventory(**row)
    except ValidationError as e:
        # errors is a method: call it so the error details are stored rather
        # than a bound-method object.
        invalid_rows[row['RowIdentifier']] = e.errors()

        print(e)
    else:
        # Passed: record the RowIdentifier in valid_rows
        valid_rows.append(row['RowIdentifier'])

In [None]:
# Inspect the invalid_rows dictionary built by the validation loop (keyed by RowIdentifier)
invalid_rows

In [None]:
# Select the inventory rows whose RowIdentifier is a key of the invalid_rows dictionary
errorous_rows = inventory.loc[inventory['RowIdentifier'].isin(list(invalid_rows))]

In [None]:
# Display the inventory rows that failed validation
errorous_rows

In [None]:
# Write the failing inventory rows to CSV, without the index
errorous_rows.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSInventory_2024_errors.csv', index=False)

## Checking the Sampling
This is the latest file (the unaggregated sample results).

In [None]:
import pandas as pd
# Load the sample-results workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific.
sampling = pd.read_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/PWSSampleResults2022-2023.xlsx')

In [None]:
# Inspect the column names as read from the workbook
sampling.columns

In [None]:
# Collect all sample rows that share the same PWS, year, analyte, sample date
# and sample point into a separate DataFrame. keep=False marks every member of
# a duplicate group, not only the repeats.
duplicates = sampling.loc[
    sampling.duplicated(
        subset=['PWSIDNumber', 'Year', 'AnalyteCode', 'DateSampled', 'SamplePointID'],
        keep=False,
    )
]

In [None]:
# Save the duplicated sample rows to Excel (the index is written too, since index=False is not passed)
duplicates.to_excel('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/duplicates_samples.xlsx')

In [None]:
# Column names of the sampling data, for comparison with the Sampling model fields in the next cell
sampling.columns

In [None]:
# Field names declared on the Sampling model
Sampling.__fields__.keys()

In [None]:
# Number of rows per AnalyteCode
sampling.AnalyteCode.value_counts()

In [None]:
# Tabulate how many rows each Year has, with a labelled count column
sampling['Year'].value_counts().to_frame(name='Number of Rows')

In [None]:
# For AnalyteCode Replace all 1038 values with 1040
#sampling['AnalyteCode'] = sampling['AnalyteCode'].replace(1038, 1040)

# Delete all rows with 1041 as AnalyteCode, which are only NITRITE tests
# (contained only 17 rows). .copy() makes the result an independent frame so
# the column assignment in the following cell does not raise pandas'
# SettingWithCopyWarning.
sampling = sampling[sampling['AnalyteCode'] != 1041].copy()

In [None]:
# Lower-case the units so they match the lower-case literals the Sampling model
# accepts ('ug/l', 'mg/l', 'pci/l').
sampling['ConcentrationUnits'] = sampling['ConcentrationUnits'].str.lower()

In [None]:
# Columns present in the sampling data but not declared on the Sampling model
del_cols = list(set(sampling.columns) - set(Sampling.__fields__))

# Build a separate frame for validation that keeps only the model's columns
sampling_validation = sampling.drop(del_cols, axis='columns')

# Prepend the sampling index as RowIdentifier; the validation loop keys its results on it
sampling_validation.insert(0, 'RowIdentifier', sampling.index)

In [None]:
# Print the validation frame's columns and the model's field names to check they line up
print(sampling_validation.columns)
print(Sampling.__fields__.keys())

In [None]:
# Change the type of column AnalyteCode to string, because the Sampling model
# declares AnalyteCode as a Literal of string codes.
sampling_validation['AnalyteCode'] = sampling_validation['AnalyteCode'].astype(str)

In [None]:
# Validate each row of the sampling frame by constructing a Sampling object from it
valid_rows_sampling = []
# Maps the RowIdentifier of each invalid row to pydantic's list of error details
invalid_rows_sampling = {}
for index, row in sampling_validation.iterrows():

    try:
        Sampling(**row)
    except ValidationError as e:
        # errors is a method: call it so the error details are stored rather
        # than a bound-method object.
        invalid_rows_sampling[row['RowIdentifier']] = e.errors()

        #print(e)
    else:
        # Passed: record the RowIdentifier in valid_rows_sampling
        valid_rows_sampling.append(row['RowIdentifier'])

In [None]:
# Inspect the invalid_rows_sampling dictionary built by the validation loop (keyed by RowIdentifier)
invalid_rows_sampling

In [None]:
# Save the sampling frame (AnalyteCode 1041 rows removed, units lower-cased) to CSV, without the index.
# Note that this writes `sampling`, not the trimmed `sampling_validation` frame.
sampling.to_csv('/Users/babak.jfard/projects/EPHTracking/Data/Water_Data_2024/Sample_results_2023.csv', index=False)

In [None]:
# The sampling data appears to contain AnalyteCodes the model does not define;
# tabulate the rows per code to see which ones they are.
sampling['AnalyteCode'].value_counts().to_frame(name="Number of rows")

In [None]:
# Show the allowable AnalyteCodes as defined in the Sampling class.
# NOTE(review): type_ is presumably the pydantic v1 ModelField attribute holding the declared type — confirm.
Sampling.__fields__['AnalyteCode'].type_


### How to Aggregate into Sampling Results

Each community water system:
* annual mean and max concentration of:

--- arsenic, disinfection byproducts (HAA5 and TTHM), 

--- nitrates, 

--- atrazine, 

--- di(2-ethylhexyl) phthalate (DEHP), 

--- radium, 

--- tetrachloroethene (tetrachloroethylene) (PCE), 

--- trichloroethene (trichloroethylene) (TCE), and 

--- uranium


* Mean concentration per quarter 

--- Nitrate

--- Atrazine

In [None]:
# PWSIDNumbers that appear in the inventory but not in the sampling data
set(inventory.PWSIDNumber.unique()) - set(sampling.PWSIDNumber.unique())

In [None]:
# Columns available in the sampling data
sampling.columns

In [None]:
# Number of sampling rows per Year
sampling.Year.value_counts()

In [None]:
# Preview the first rows of the sampling data
sampling.head()

In [None]:
# Column dtypes of the sampling data
sampling.dtypes