In [1]:

# Notebook Summary:

# V.History: 
# Date Last Modified: 14 May 2025

#--------------------------------------------------------------------------------------------------
'''
    This model transposes the uni-dimensional WQ dataset into a 2-dimensional dataset, including BNG 
    coordinates for spatio-temporal model construction.
    Date: March 25, 2025

    Following are the label encoders created:
        1. isComplianceSample            - encoder_isComplianceSample.pkl
        2. purpose_name                  - encoder_purpose_name.pkl
        3. determinand_unit_name         - encoder_determinand_unit_name.pkl

    This step enables the model to recognise compliance samples, their purposes, and the varied 
    unit names of Determinands.
'''
#--------------------------------------------------------------------------------------------------


#--------------------------------------------------------------------------------------------------
#Pre-Requisite : 
    #Kernel Python 3 (ipykernel) is required to run this notebook 
    #Required python version - Python 3.10.15 and its compatible Numpy , ScikitLearn libraries

#Old Name: 12_NB_10Transpose_2959_All2.ipynb
#--------------------------------------------------------------------------------------------------

#Proposed validation catchments:
#a. River Dee (Wales)
#b. Catchments in Ireland (potentially available through Teagasc)
#c. Skerne
#d. Browney
#e. Frome
#f. Wye

'''
    Intro Section
'''

'\n    Intro Section\n'

In [2]:
# Check Python version compatibility: 3.10 or above is required.
# `!python -V` prints the version of the `python` executable found by the shell.
!python -V
# Capture the same version string into a Python variable using IPython's shell-capture
# syntax (stderr is merged into stdout via 2>&1), then print the captured lines.
# NOTE(review): this reports the shell's `python`, which may differ from the kernel's
# interpreter - confirm against sys.version.
python_version=!(python --version 2>&1)
print (python_version)

Python 3.10.15
['Python 3.10.15']


In [3]:
# List the entries under the current user's Jupyter kernels directory.
ls ~/.local/share/jupyter/kernels

[0m[01;34mchem2[0m/


In [4]:
# Report the Python version of the interpreter running this notebook's kernel
# (3.10 or above is required).

# Troubleshooting notes (kept as comments, not executed):
#   edit a kernel spec:                       sudo nano  ~/.local/share/jupyter/kernels/python311/kernel.json
#   pandas version seen by the system shell:  !pip show pandas
#   pandas version seen by this kernel:       %pip show pandas

import sys
sys.version

'3.10.15 | packaged by conda-forge | (main, Oct 16 2024, 01:24:24) [GCC 13.3.0]'

In [5]:
####################################################################################################
#Begin CARD (Transpose data for Phosphate and Phosphorus Family for 24 Years of WQ data for England)
####################################################################################################

In [6]:
# Execute the shared utilities notebook inside this kernel's namespace.
# NOTE(review): later cells use pd, showtime and savedata without defining them here,
# so they are presumably provided by this notebook - confirm.
%run "..//99_Common_Utils/99_NB_CommonUtils.ipynb" #Library Declaration section - Installing or Initiating all required Python Libraries

Intalling required libraries and utilities.....
Uses Python 3 (ipykernel) (Local)
Python 3.10.15
['Python 3.10.15']

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m

2025-05-16 13:27:23.760685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747402043.785280  107651 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747402043.793091  107651 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747402043.813587  107651 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747402043.813607  107651 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747402043.813610  107651 computation_placer.cc:177] computation placer alr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
|| Completed intalling required libraries and utilities ||

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m]

In [7]:
'''
There are 52 individual units of measure in the WQ monitoring data and over 38 for the P-Family. Phosphate or Orthophosphate is 
measured in 'mg/l'. Considerations were made to convert these metrics into a single unit of 'mg/l' (aligned to P unit measure). 
Below are the 'Conversion functions' / code snippets for that. However, the conversion was later discarded in favour of relative weights for 
different metrics in spatio-temporal models. The code is kept here to document this experiment.

Notes about the conversion considerations:

1. Density Considerations: Some conversions, such as % w/w or mg/kg, assume the substance has a similar density to water (1 g/ml). 
If the density differs, adjust the conversion factor accordingly.

2. Context-Specific Units: Units like bq/l (radioactivity), cfu/0.1l (colony-forming units), us/cm (conductivity), 
and rfu (fluorescence) require additional context or calibration to convert meaningfully to mg/l.

3. Load Units: Units like kg/ann, t/d, and t/qtr are load measurements and depend on the total volume of water per time period for conversion.

The code provides a basic framework, but real-world applications may require more specific formulas and data, especially for 
context-specific units.


There are 38 unique units of measure for the Phosphate family:
mg/l , coded, ug/l , % , mg/kg, ug/kg, m, ppm, ngr, unitless, phi, ng/l, min, deg, no/g, kg/d, 
deccafix, pres/nf, mw.s/sq. , no. , no/100ml, abs/cm, nm, bq/l, text, % v/v, kt/qtr, t/qtr, kg/qtr,
t/d, % w/w, yes/no, l/kg/h, cfu/0.1l, us/cm, g/l, t/wk, ug
'''

#-------------------------------------------------------------------------------------
#SAMPLE CONVERSION :-
# Uni-directional (long-format) data sample
#Sl.No. | Determinand_name | Determinand_Definition | Result | Date
#  1      Oxygen             Dissolved oxygen          5.2     1-1-2000
#  2      Nitrogen           Nitrogen oxide            4.1     1-1-2000
#  3      Oxygen             Dissolved oxygen          5.8     2-1-2000
#  4      Nitrogen           Nitrogen oxide            4.9     2-1-2000
#  5      Oxygen             Dissolved oxygen          5.8     1-1-2000
#  6      Nitrogen           Nitrogen oxide            4.9     1-1-2000

# Converted 2-dimensional (wide-format) data sample: one row per date and one column per
# determinand, each cell holding the daily mean of that determinand's results
# (this matches the group-by mean followed by the pivot performed later in this notebook)
#Sl.No. | Oxygen | Nitrogen | Date
#  1      5.5        4.5      1-1-2000
#  2      5.8        4.9      2-1-2000
#-------------------------------------------------------------------------------------


'''
Metrics conversion
'''


'\nMetrics conversion\n'

In [8]:
# Conversion functions 29 Conversions
def first_convert_to_mg_per_l(value, unit, density=1.0):
    """
    Convert a measurement to mg/l when a fixed conversion factor exists.

    Units that cannot be converted without extra context (radioactivity, colony
    counts, conductivity, fluorescence, loads per time period, bare masses, ...)
    and unknown units are returned untouched together with their ORIGINAL unit,
    so the caller can always tell whether a conversion actually happened.

    Parameters:
    - value: The numerical value to convert.
    - unit: The unit of the given value as a string (matched case-sensitively).
    - density: Density of the substance in g/ml (default is 1.0, equivalent to water
      at room temperature). Only used for the mass-based units
      ("mg/kg", "ug/kg", "ng/kg", "% w/w"): mg/kg * kg/l = mg/l.

    Returns:
    - A tuple (converted_value, result_unit). result_unit is 'mg/l' when a
      conversion was applied, otherwise the unit that was passed in.
    """
    cUnit = 'mg/l'

    # Per-volume units: a constant factor is enough.
    # "%" and "% v/v" use 1% = 10,000 mg/l; "ppm" assumes water density (1 ppm = 1 mg/l).
    per_volume_factors = {
        "mg/l": 1,
        "%": 10000,
        "ug/l": 0.001,
        "% v/v": 10000,
        "g/l": 1000,
        "ng/l": 1e-6,
        "ppm": 1,
        "pg/l": 1e-9,
    }
    # Per-mass units: the factor below is valid for water (1 g/ml) and is scaled by
    # `density` for any other substance.
    per_mass_factors = {
        "mg/kg": 1,
        "% w/w": 10000,
        "ug/kg": 0.001,
        "ng/kg": 1e-6,
    }

    if unit in per_volume_factors:
        factor = per_volume_factors[unit]
    elif unit in per_mass_factors:
        factor = per_mass_factors[unit]
        # Only touch the factor for a non-default density so that the default call
        # returns exactly the same numbers (and numeric types) as before.
        if density != 1.0:
            factor = factor * density
    else:
        # Context-specific units (bq/l, cfu/0.1l, us/cm, rfu, no/g, no/100ml, pg/m3, ug),
        # load units (kg/ann, kg/d, kg/qtr, kt/qtr, t/ann, t/d, t/qtr, t/wk) and unknown
        # units have no fixed factor. "bq/l" was previously labelled 'mg/l' although the
        # value was not converted; it now keeps its own unit like the other
        # non-convertible units.
        return value, unit

    if factor == 1:
        # Identity conversion: hand the value back as-is.
        return value, cUnit
    return value * factor, cUnit

# Example usage: convert 5 % and print the converted value with its resulting unit
# (the function returns a (value, unit) tuple, unpacked here).
try:
    result, resultUnit = first_convert_to_mg_per_l(5, "%")
    print("Converted value: {} {}".format(result, resultUnit))
# NOTE(review): every `raise ValueError` in first_convert_to_mg_per_l is commented out,
# so this handler looks unreachable at present.
except ValueError as e:
    print("Error:", e)


Converted value: 50000 mg/l


In [9]:
# Multipliers from a (lower-cased) source unit to mg/L. None marks a unit that is known
# but cannot be expressed in mg/L. Built once instead of on every call.
# NOTE(review): the factors for bq/l, cfu/0.1l, no/100ml and the load units (kg/d, kg/qtr,
# kt/qtr, t/d, t/qtr, t/wk) rest on the arbitrary assumptions stated next to them.
_SECOND_MG_PER_L_FACTORS = {
    "mg/l": 1,            # Already in mg/L
    "%": 10_000,          # Assumes % w/v (1% = 10,000 mg/L)
    "ug/l": 1e-3,         # Micrograms per liter
    "mg/kg": 1,           # Assuming 1 kg/L density
    "% v/v": 10_000,      # Assumes % v/v in water
    "% w/w": 10_000,      # Assumes % w/w in water
    "g/l": 1_000,         # Grams per liter
    "ug/kg": 1e-3,        # Micrograms per kg (assuming 1 kg/L density)
    "ng/l": 1e-6,         # Nanograms per liter
    "ppm": 1,             # Equivalent to mg/L
    "bq/l": 1,            # Assumes radionuclide mass equivalent in mg/L
    "cfu/0.1l": 10,       # Colony-forming units per 0.1L to mg/L (arbitrary mass equivalence)
    "kg/d": 1 / 0.001,    # Assuming flow of 1 L/d
    "kg/qtr": 1 / (0.001 * 90),  # Quarterly conversion with 90 days assumed
    "kt/qtr": 1e6 / (0.001 * 90),# Kilotons per quarter
    "no/100ml": 10,       # Assuming 1 unit mass equivalent
    "us/cm": None,        # Cannot convert electrical conductivity to mg/L directly
    "no/g": None,         # Cannot convert counts to mg/L directly
    "t/d": 1e6 / 0.001,   # Tons per day assuming 1 L/d
    "t/qtr": 1e6 / (0.001 * 90),
    "t/wk": 1e6 / (0.001 * 7),
    "coded": None,        # Unsupported
    "m": None,            # Unsupported
    "ngr": None,          # Unsupported
    "unitless": None,     # Unsupported
    "phi": None,          # Unsupported
    "min": None,          # Unsupported
    "deg": None,          # Unsupported
    "deccafix": None,     # Unsupported
    "pres/nf": None,      # Unsupported
    "mw.s/sq.": None,     # Unsupported
    "no.": None,          # Unsupported
    "abs/cm": None,       # Unsupported
    "nm": None,           # Unsupported
    "text": None,         # Unsupported
    "yes/no": None,       # Unsupported
    "l/kg/h": None,       # Unsupported
    "ug": 1e-3,           # Micrograms to mg
}


def second_convert_to_mg_per_l(value, unit):
    """
    Convert a measurement to mg/L using a table of per-unit multipliers.

    The unit is matched case-insensitively (it is converted to a lower-case string
    first). No exception is raised for unsupported units: the value is handed back
    unchanged so that bulk processing can continue.

    Parameters:
    - value: The numerical value to convert.
    - unit: The unit of the given value.

    Returns:
    - A tuple (converted_value, result_unit). result_unit is 'mg/l' when a
      multiplier was applied; for unsupported or unknown units it is the
      lower-cased unit that was passed in and the value is unchanged.
    """
    unit = str(unit).lower()
    factor = _SECOND_MG_PER_L_FACTORS.get(unit)

    if factor is None:
        # Unknown unit, or a known unit with no meaningful mg/L equivalent.
        return value, unit

    # Report the unit the value is now expressed in; previously the source unit was
    # returned here, which mislabelled every converted value.
    return value * factor, "mg/l"


# Example usage
try:
    #User input
    #value = float(input("Enter the value: "))
    #unit = input("Enter the unit: ")

    #Hardcoded values (swap in unit = 'ug' to exercise a convertible unit)
    value = 1.1
    unit = 'text'
    # The converter returns a (value, unit) tuple: unpack it so the message shows the
    # number and the unit it is actually expressed in, instead of printing the raw
    # tuple followed by a hard-coded "mg/L".
    result, resultUnit = second_convert_to_mg_per_l(value, unit)
    print(f"{value} {unit} is equal to {result} {resultUnit}.")
except ValueError as e:
    print(e)


1.1 text is equal to (1.1, 'text') mg/L.


In [10]:
#STEP 1: Use the 24 years WQ data for England gathered for Phosphate & Phosphorus family in the above step
# To carry out the Transpose operation 
#==========================================================================================================

In [11]:
# Read the whole dataset.
# Earlier loader-based variant, kept for reference (not executed):
#folderpath = cleansed
#filename = '12_NB_10Transpose_2959_All1.csv'
#showtime()
#dfPSpatio_outer = loaddata(folderpath, filename, path = 'gcs://rdmai_dev_data/')
#showtime()


# showtime() is called before and after the read so the load duration is visible.
showtime()
# Active variant - bigger file [all types of sample purpose, INCLUDING reactive monitoring].
# Only the columns listed in usecols are loaded.
#dfPall_outer = pd.read_csv('gcs://rdmai_dev_data/cleansed/12_NB_10Transpose_2959_All1_With_R_Mon.csv' , #Commented 11 May 2025
dfPall_outer = pd.read_csv('gcs://rdmai_dev_data/cleansed/07_nb_transpose_2959_all1_with_r_mon.csv',     #Added 11 May 2025
                           usecols = ['sampleDateTime', 'samplingPoint_name',
                                      'determinand_notation', 'samplingPoint_notation',
                                      'samplingPoint_easting', 'samplingPoint_northing',
                                      #'sampledMaterialType_name', This is unique as of now, hence not required to load this column
                                      'isComplianceSample', 'purpose_name',
                                      'determinand_unit_name', 'result'])
# NOTE(review): file_to_write holds the same file name that was just read. In the visible
# part of the notebook it is only printed and compared against a literal in a later cell.
#file_to_write = '12_NB_10Transpose_2959_All2_With_R_Mon.csv'
file_to_write = '07_nb_transpose_2959_all1_with_r_mon.csv'

# The string literal below is the disabled alternative load (smaller file, WITHOUT reactive
# monitoring). It is never executed as code.
'''
#Smaller size [All types of Sample purpose Except Reactive monitoring]
#dfPall_outer = pd.read_csv('gcs://rdmai_dev_data/cleansed/12_NB_10Transpose_2959_All1_Without_R_Mon.csv' , #Commented 11 May 2025
dfPall_outer = pd.read_csv('gcs://rdmai_dev_data/cleansed/07_nb_transpose_2959_all1_without_r_mon.csv',     #Added 11 May 2025
                           usecols = ['sampleDateTime', 'samplingPoint_name',
                                      'determinand_notation', 'samplingPoint_notation',
                                      'samplingPoint_easting', 'samplingPoint_northing',
                                      #'sampledMaterialType_name', This is unique as of now, hence not required to load this column
                                      'isComplianceSample', 'purpose_name',
                                      'determinand_unit_name', 'result'])
#file_to_write = '12_NB_10Transpose_2959_All2_Without_R_Mon.csv'
file_to_write = '07_nb_transpose_2959_all1_without_r_mon.csv'
'''
showtime()
print(file_to_write)

# Cardinality check, in this order: determinands, unit names, compliance flags, purposes.
print(dfPall_outer["determinand_notation"].nunique(), dfPall_outer["determinand_unit_name"].nunique(), 
      dfPall_outer["isComplianceSample"].nunique(), dfPall_outer["purpose_name"].nunique())

# Reference values recorded for the "with reactive monitoring" file: 1891 68 2 26
print(len(dfPall_outer))


# Last expression of the cell: displays the row count per compliance flag value.
dfPall_outer["isComplianceSample"].value_counts()

16 May 2025 13:27:38
16 May 2025 13:31:25
07_nb_transpose_2959_all1_with_r_mon.csv
1891 68 2 26
33424159


isComplianceSample
False    33166122
True       258037
Name: count, dtype: int64

In [12]:
#Function to fit a label encoder per categorical column and save each encoder to a pickle file
def train_and_save_encoders(data, categorical_columns, save_path="..//03_prediction_model/pkls"):
    """
    Fit one LabelEncoder per categorical column, add the encoded column to `data`
    and save each fitted encoder to its own pickle file.

    Parameters:
    - data: DataFrame holding the categorical columns. It is modified IN PLACE: for
      every column `c` a new column `c_label` with the integer codes is added.
    - categorical_columns: Iterable of column names to encode.
    - save_path: Directory that receives the `encoder_<column>.pkl` files. The default
      is the location this function always wrote to; previously the argument was
      overwritten inside the function, so a caller-supplied directory was ignored.
      The directory is created when it does not exist.

    Returns:
    - (data, encoders): the same DataFrame object that was passed in, and a dict
      mapping each column name to the path of its saved encoder file.
    """
    import os
    import joblib

    # Make sure the target directory exists before the first dump.
    os.makedirs(save_path, exist_ok=True)

    encoders = {}
    for col in categorical_columns:
        # Create and fit a dedicated encoder for this column
        le = LabelEncoder()
        data[col + "_label"] = le.fit_transform(data[col])

        # Save the encoder to its own file and remember where it went
        encoder_path = f'{save_path}/encoder_{col}.pkl'
        joblib.dump(le, encoder_path)
        encoders[col] = encoder_path

    return data, encoders

In [13]:
from sklearn.preprocessing import LabelEncoder 
# NOTE(review): this shared instance is only referenced by the disabled code in the string
# literal below; the active path creates its own encoders inside train_and_save_encoders.
labelencoder= LabelEncoder() #initializing an object of class LabelEncoder

# Disabled on 15 May 2025: the string literal below holds the previous approach, which
# reused one encoder object for all three columns. It is never executed.
'''
Commented on 15 May 2025
dfPall_outer['isComplianceSample_label'] = labelencoder.fit_transform(dfPall_outer['isComplianceSample']) #fitting and transforming the desired categorical column.
dfPall_outer['purpose_name_label'] = labelencoder.fit_transform(dfPall_outer['purpose_name']) #fitting and transforming the desired categorical column.
dfPall_outer['determinand_unit_name_label'] = labelencoder.fit_transform(dfPall_outer['determinand_unit_name']) #fitting and transforming the desired categorical column.
'''

# Save the encoders into pickle files.
# List of categorical columns to encode
categorical_cols = ['isComplianceSample', 'purpose_name', 'determinand_unit_name']

# Encode the columns (adds a "<column>_label" column for each) and save one encoder per
# column; encoder_paths maps each column name to its saved encoder file.
dfPall_outer, encoder_paths = train_and_save_encoders(dfPall_outer, categorical_cols)

# The raw categorical columns are kept for now; the drop below is disabled.
#dfPall_outer = dfPall_outer.drop(columns=['isComplianceSample', 'purpose_name', 'determinand_unit_name'])

# Cardinality check on the encoded columns, in this order: determinands, unit labels,
# compliance labels, purpose labels.
print(dfPall_outer["determinand_notation"].nunique(), dfPall_outer["determinand_unit_name_label"].nunique(), 
      dfPall_outer["isComplianceSample_label"].nunique(), dfPall_outer["purpose_name_label"].nunique())

# Not the last expression of the cell, so this head() call displays nothing.
dfPall_outer.head(2)
print(len(dfPall_outer))

1891 68 2 26
33424159


In [14]:
# Build one small lookup table per encoded column: keep the first row seen for every
# distinct label and retain only the (raw value, label) pair of columns.
dfPall_outer_refCompSample = dfPall_outer.drop_duplicates(subset=['isComplianceSample_label'])[['isComplianceSample','isComplianceSample_label']]
dfPall_outer_refPurpose = dfPall_outer.drop_duplicates(subset=['purpose_name_label'])[['purpose_name','purpose_name_label']]
dfPall_outer_refUnitName = dfPall_outer.drop_duplicates(subset=['determinand_unit_name_label'])[['determinand_unit_name','determinand_unit_name_label']]
# Report how many distinct labels each lookup table holds.
print("\nTypes of complinace : ", len(dfPall_outer_refCompSample), 
      "\nPurpose Types       : ", len(dfPall_outer_refPurpose), 
      "\nVaried Unit names   : ", len(dfPall_outer_refUnitName))

#Types of complinace :  2 
#Purpose Types       :  26 
#Varied Unit names   :  68


Types of complinace :  2 
Purpose Types       :  26 
Varied Unit names   :  68


In [15]:
# Save the three lookup tables, but only when the "with reactive monitoring" file is the
# one selected through file_to_write.
#if (file_to_write == "12_NB_10Transpose_2959_All2_With_R_Mon.csv"):
if (file_to_write == "07_nb_transpose_2959_all1_with_r_mon.csv"):
    # The string literal below is the disabled variant that targeted the NW_Final folder.
    '''
    savedata(dfPall_outer_refCompSample, "08_nb_transpose_2959_all2_UniqComplianceSamples.csv", 'gcs://rdmai_dev_data/NW_Final/')
    savedata(dfPall_outer_refPurpose, "08_nb_transpose_2959_all2_UniqPurposeNames.csv", 'gcs://rdmai_dev_data/NW_Final/')
    savedata(dfPall_outer_refUnitName, "08_nb_transpose_2959_all2_UniqDetermndUnitNames.csv", 'gcs://rdmai_dev_data/NW_Final/')
    '''
    # NOTE(review): the recorded output of this cell reports a location under
    # .../NW_Final/cleansed/, which does not match the path passed here - confirm how
    # savedata builds the destination, or re-run the cell to refresh the output.
    savedata(dfPall_outer_refCompSample, "08_nb_transpose_2959_all2_UniqComplianceSamples.csv", 'gcs://rdmai_dev_data/cleansed/')
    savedata(dfPall_outer_refPurpose, "08_nb_transpose_2959_all2_UniqPurposeNames.csv", 'gcs://rdmai_dev_data/cleansed/')
    savedata(dfPall_outer_refUnitName, "08_nb_transpose_2959_all2_UniqDetermndUnitNames.csv", 'gcs://rdmai_dev_data/cleansed/')

#dfPall_outer_refCompSample.head(2)
#dfPall_outer_refPurpose.head(2)
#dfPall_outer_refUnitName.head(2)

saved, Location:  gcs://rdmai_dev_data/NW_Final/cleansed/08_nb_transpose_2959_all2_UniqComplianceSamples.csv
saved, Location:  gcs://rdmai_dev_data/NW_Final/cleansed/08_nb_transpose_2959_all2_UniqPurposeNames.csv
saved, Location:  gcs://rdmai_dev_data/NW_Final/cleansed/08_nb_transpose_2959_all2_UniqDetermndUnitNames.csv


In [16]:
# Drop the raw categorical columns; their "<column>_label" counterparts remain.
# Re-running this cell on its own raises a KeyError because the columns are already gone.
dfPall_outer = dfPall_outer.drop(columns=['isComplianceSample', 'purpose_name', 'determinand_unit_name'])

In [17]:
# Derive a date-only column from the sample timestamp (parse to datetime, then keep only
# the calendar date) and use it as the frame's index.
dfPall_outer['sampleDateOnly'] = pd.to_datetime(dfPall_outer['sampleDateTime'])
dfPall_outer['sampleDateOnly'] = dfPall_outer['sampleDateOnly'].dt.date
dfPall_outer = dfPall_outer.set_index('sampleDateOnly')

# The full timestamp is no longer needed once the date-only index exists.
dfPall_outer = dfPall_outer.drop(columns=['sampleDateTime'])

dfPall_outer.head(2)

Unnamed: 0_level_0,samplingPoint_notation,samplingPoint_name,determinand_notation,result,samplingPoint_easting,samplingPoint_northing,isComplianceSample_label,purpose_name_label,determinand_unit_name_label
sampleDateOnly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-24,AN-01M04,R.OUSE A422 RD.BR.BRACKLEY,180,0.1,459427,236819,0,3,28
2000-01-24,AN-01M04,R.OUSE A422 RD.BR.BRACKLEY,6051,160.0,459427,236819,0,3,64


In [18]:
# Row count after encoding and re-indexing.
print("Total rows: ", len(dfPall_outer))
# Recorded value - Total rows: 33424159

Total rows:  33424159


In [19]:
# Disabled on March 26 2025, when it was decided to drop the standardisation of
# units (metrics). The commented code below grouped on the converted unit/result columns.

#dfPall_outer_temp1 = dfPall_outer_temp[dfPall_outer_temp['determinand_unit_nameC'] == 'mg/l']
#print(len(dfPall_outer_temp), len(dfPall_outer_temp1))
#
#dfPall_result = dfPall_outer_temp.groupby(['sampleDateOnly', 'samplingPoint_notation',
#                                           'samplingPoint_easting', 'samplingPoint_northing',
#                                           'determinand_notation', 'determinand_unit_nameC']
#                                         ).agg(daily_avg=('resultC', 'mean')
#                                              ).reset_index()
#
# Active path: collapse rows that share the same date, sampling point, coordinates,
# compliance/purpose/unit labels and determinand into a single row whose daily_avg is the
# mean of their results. 'sampleDateOnly' is the index of dfPall_outer at this point and
# is used as a grouping key by name; reset_index() turns every key back into a column.
print("\nTotal rows before Grouping by : ", len(dfPall_outer))
dfPall_result = dfPall_outer.groupby(['sampleDateOnly', 'samplingPoint_notation',
                                      'samplingPoint_easting', 'samplingPoint_northing',
                                      'isComplianceSample_label',
                                      'purpose_name_label',
                                      'determinand_unit_name_label', 
                                      'determinand_notation']
                                    ).agg(daily_avg=('result', 'mean')
                                         ).reset_index()
# Not the last expression of the cell, so this head() call displays nothing.
dfPall_result.head(2)
print("\nTotal rows after  Grouping by : ", len(dfPall_result))

'''
    Total rows before Grouping by :  33424159
    Total rows after  Grouping by :  32426089
'''


Total rows before Grouping by :  33424159

Total rows after  Grouping by :  32426089


'\n    Total rows before Grouping by :  33424159\n    Total rows after  Grouping by :  32426089\n'

In [20]:
'''
dfPall_result['sampleDateOnly'] = pd.to_datetime(dfPall_result['sampleDateOnly'])
dfPall_result['sampleDateOnly'] = dfPall_result['sampleDateOnly'].dt.date
dfPall_result = dfPall_result.set_index('sampleDateOnly')

dfPall_result.head(2)
'''

"\ndfPall_result['sampleDateOnly'] = pd.to_datetime(dfPall_result['sampleDateOnly'])\ndfPall_result['sampleDateOnly'] = dfPall_result['sampleDateOnly'].dt.date\ndfPall_result = dfPall_result.set_index('sampleDateOnly')\n\ndfPall_result.head(2)\n"

In [21]:
# Summary of the grouped frame: row count and number of distinct determinands.
print ("Total rows after  Grouping by : ", len(dfPall_result), 
       "\nTotal Determinands          : ", len(dfPall_result['determinand_notation'].unique()))

# Distinct easting and northing values (counted per column, not as coordinate pairs),
# followed by the number of distinct sampling point notations.
print(dfPall_result[['samplingPoint_easting', 'samplingPoint_northing']].nunique(),
      dfPall_result['samplingPoint_notation'].nunique())

#With R_Mon - Inclusive -> 
     #32426089 1891
     #samplingPoint_easting     19378
     #samplingPoint_northing    19966

#Without R_Mon - Inclusive -> 
     #31885281 1637
     #samplingPoint_easting     18694
     #samplingPoint_northing    19226
     #samplingPoint_notation.   21867

Total rows after  Grouping by :  32426089 
Total Determinands          :  1891
samplingPoint_easting     19378
samplingPoint_northing    19966
dtype: int64 23083


In [22]:
# Delete the in-memory source frame before the 2-D conversion below, which has failed with:
#      'MemoryError: Unable to allocate 308. GiB for an array with shape (13196325, 3131) and data type float64
# After this cell dfPall_outer no longer exists, so earlier cells that read it cannot be
# re-run without reloading the data.

del(dfPall_outer)


In [23]:
'''
print(type(dfPall_result.index))
dfPall_result.index = pd.to_datetime(dfPall_result.index)
print(type(dfPall_result.index))
dfPall_result.head(2)
'''

'\nprint(type(dfPall_result.index))\ndfPall_result.index = pd.to_datetime(dfPall_result.index)\nprint(type(dfPall_result.index))\ndfPall_result.head(2)\n'

In [24]:
# Based on Nicolai & Lila's input: remove the rows of every determinand that has fewer than
# 100 rows in dfPall_result over the 24 years. Counts are taken on the grouped frame, i.e.
# after the daily averaging, not on the raw samples.
print("Before Removing rows that has <100 observations recorded per Determinand : ", len(dfPall_result))

colName = "determinand_notation"
holdGT = 100  # minimum number of rows (inclusive) a determinand needs to be kept
# Count rows per determinand once, then keep only the determinands that reach the threshold.
# colName is used for both the counting and the filtering so the two cannot drift apart.
vCounts = dfPall_result[colName].value_counts()
valid_Determinands_lst = vCounts[vCounts >= holdGT].index
dfPall_result = dfPall_result[dfPall_result[colName].isin(valid_Determinands_lst)]

print("After Removal............................................................: ", len(dfPall_result))

dfPall_result.head(2)

#With R_Mon - Inclusive -> 
       #Before removal: 32426089
       #After removal:  32411236
       #Difference:        14853

#Without R_Mon - Inclusive -> 
       #Before removal: 31885281
       #After removal:  31871766
       #Difference:        13515

Before Removing rows that has <100 observations recorded per Determinand :  32426089
After Removal............................................................:  32411236


Unnamed: 0,sampleDateOnly,samplingPoint_notation,samplingPoint_easting,samplingPoint_northing,isComplianceSample_label,purpose_name_label,determinand_unit_name_label,determinand_notation,daily_avg
0,2000-01-01,AN-70561033,489852,289360,0,24,28,99,8.7
1,2000-01-01,AN-70561033,489852,289360,0,24,28,111,13.0


In [25]:
# Transpose to two dimensions: one row per (date, sampling point, coordinates, unit label,
# purpose label, compliance label) and one column per determinand_notation, holding
# daily_avg. The index keys plus the column key are exactly the keys of the earlier
# group-by, so each combination occurs once, which pivot() requires.
# Cells with no value for a determinand are filled with 0, so after this step a missing
# value cannot be told apart from a daily_avg of 0.
result_transposed_Full = dfPall_result.pivot(index=['sampleDateOnly', 'samplingPoint_notation',
                                                    'samplingPoint_easting', 'samplingPoint_northing',
                                                    'determinand_unit_name_label', 
                                                    'purpose_name_label', 
                                                    'isComplianceSample_label',
#                                                    'determinand_unit_name'
                                                   ],
                                             columns=['determinand_notation'],
                                             values='daily_avg').fillna(0)

# Row counts before (long format) and after (wide format) the transpose.
print("Total Records Before Transpose:  ", len(dfPall_result), 
      "After.........................:  ", len(result_transposed_Full))
result_transposed_Full.head(2)

#With R_Mon - Inclusive -> 
       #Total Records Before Transpose:  32411236
       #Total Records After Transpose:    9278441, 1773958

#Without R_Mon - Inclusive -> 
       #Total Records Before Transpose:  31871766
       #Total Records After Transpose:    9106238

Total Records Before Transpose:   32411236 After.........................:   9278441


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,determinand_notation,3,4,6,28,30,31,37,39,50,52,61,62,64,66,67,68,69,72,73,76,77,81,82,83,85,88,97,99,103,105,106,108,111,113,114,116,117,118,119,134,135,138,143,144,152,153,158,162,163,164,167,171,172,174,175,177,179,180,182,183,186,191,192,205,207,209,211,235,237,239,241,254,270,301,346,348,460,461,462,463,483,487,491,495,499,503,507,511,527,531,535,539,543,547,551,555,559,562,569,570,573,575,576,577,578,579,581,612,664,666,714,723,729,731,733,736,738,739,746,749,753,758,772,831,911,912,947,950,979,1002,1049,1066,1085,1109,1115,1118,1119,1120,1153,1158,1173,1181,1183,1193,1198,1200,2331,2348,2551,2896,2921,2922,2923,2924,2925,2926,2942,2943,2944,2953,2955,2957,2958,2959,2960,2961,2962,2963,2964,2965,2967,2968,2969,2970,2972,2973,2975,2977,2981,2982,2983,2984,2987,2988,2989,2990,2991,2993,2995,3000,3001,3002,3004,3007,3009,3018,3019,3021,3023,3024,...,8940,8942,8943,8944,8994,8995,8997,8998,8999,9000,9002,9003,9005,9007,9009,9011,9013,9015,9017,9019,9021,9035,9039,9040,9050,9051,9052,9066,9068,9073,9091,9094,9097,9142,9143,9145,9156,9160,9161,9166,9190,9196,9197,9198,9199,9238,9239,9248,9258,9260,9267,9274,9276,9285,9338,9339,9340,9341,9342,9343,9344,9345,9348,9350,9364,9366,9371,9446,9447,9451,9453,9454,9456,9457,9458,9460,9461,9462,9466,9467,9468,9472,9474,9475,9477,9479,9494,9496,9519,9522,9552,9573,9584,9586,9603,9604,9606,9618,9619,9634,9638,9639,9669,9671,9684,9686,9688,9691,9695,9696,9703,9715,9716,9724,9725,9726,9727,9728,9729,9730,9731,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741,9742,9743,9744,9745,9746,9747,9748,9749,9750,9751,9756,9768,9769,9770,9771,9772,9773,9774,9807,9809,9811,9814,9815,9816,9817,9818,9819,9821,9822,9823,9836,9839,9841,9844,9845,9851,9853,9856,9857,9860,9861,9862,9863,9880,9883,9885,9888,9889,9891,9892,9898,9899,9901,9911,9921,9924,9925,9933,9935,9943,9951,9952,9959,9978,9979,9989,9990,9992,9993
sampleDateOnly,samplingPoint_notation,samplingPoint_easting,samplingPoint_northing,determinand_unit_name_label,purpose_name_label,isComplianceSample_label,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 
98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 
193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 
288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1,Unnamed: 341_level_1,Unnamed: 342_level_1,Unnamed: 343_level_1,Unnamed: 344_level_1,Unnamed: 345_level_1,Unnamed: 346_level_1,Unnamed: 347_level_1,Unnamed: 348_level_1,Unnamed: 349_level_1,Unnamed: 350_level_1,Unnamed: 351_level_1,Unnamed: 352_level_1,Unnamed: 353_level_1,Unnamed: 354_level_1,Unnamed: 355_level_1,Unnamed: 356_level_1,Unnamed: 357_level_1,Unnamed: 358_level_1,Unnamed: 359_level_1,Unnamed: 360_level_1,Unnamed: 361_level_1,Unnamed: 362_level_1,Unnamed: 363_level_1,Unnamed: 364_level_1,Unnamed: 365_level_1,Unnamed: 366_level_1,Unnamed: 367_level_1,Unnamed: 368_level_1,Unnamed: 369_level_1,Unnamed: 370_level_1,Unnamed: 371_level_1,Unnamed: 372_level_1,Unnamed: 373_level_1,Unnamed: 374_level_1,Unnamed: 375_level_1,Unnamed: 376_level_1,Unnamed: 377_level_1,Unnamed: 378_level_1,Unnamed: 379_level_1,Unnamed: 380_level_1,Unnamed: 381_level_1,Unnamed: 382_level_1,Unnamed: 
383_level_1,Unnamed: 384_level_1,Unnamed: 385_level_1,Unnamed: 386_level_1,Unnamed: 387_level_1,Unnamed: 388_level_1,Unnamed: 389_level_1,Unnamed: 390_level_1,Unnamed: 391_level_1,Unnamed: 392_level_1,Unnamed: 393_level_1,Unnamed: 394_level_1,Unnamed: 395_level_1,Unnamed: 396_level_1,Unnamed: 397_level_1,Unnamed: 398_level_1,Unnamed: 399_level_1,Unnamed: 400_level_1,Unnamed: 401_level_1,Unnamed: 402_level_1,Unnamed: 403_level_1,Unnamed: 404_level_1,Unnamed: 405_level_1,Unnamed: 406_level_1,Unnamed: 407_level_1
2000-01-01,AN-70561033,489852,289360,28,24,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.7,0.0,0.0,0.0,0.0,13.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.0,0.0,0.0,0.0,0.0,0.0,0.0,740.0,0.0,0.0,0.0,0.0,260.0,0.0,21.0,0.0,54.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,260.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-01-01,AN-70561033,489852,289360,54,24,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Unbind the long-format frame now that the transposed frame has been built from it.
del dfPall_result

In [27]:

# This cell performs no work: it contains three bare string literals.
# The first is a background note, the second holds disabled sampling code kept
# for reference, and the third is a label that is echoed as the cell output.
'''
    Initially, the kernel struggled with handling large volumes of 9+ million rows with thousands of columns, 
    resulting in errors. To address this, a stratified sample of 40% (approximately 3.7 million rows) was 
    tested, which worked well. After consulting with a GCP infrastructure expert, additional compute resources 
    were added to handle such volume processing, and the below code was commented out.
    Pasupathi N, Lead Data Scientist, Cognizant - UK.
'''

# NOTE(review): the disabled code below calls DataFrame.sample(frac=0.4, random_state=1)
# with no grouping, i.e. a plain random 40% draw of rows; the word "stratified"
# in the notes of this cell looks inaccurate - confirm before re-enabling.
'''
print (len(result_transposed_Full))
result_transposed_Full = result_transposed_Full.sample(frac=0.4,random_state=1).reset_index(drop=False)
print (len(result_transposed_Full))
result_transposed_Full.head(2)

#With R_Mon - Inclusive -> 
    #Stratified sample of 3711376 records taken out of the total 9278441 records, 
'''

'''
    Stratified sampling process to handle large volume transactions.
'''

'\n    Stratified sampling process to handle large volume transactions.\n'

In [28]:
# Show the current value of file_to_write; the save cell further down compares
# this value against a fixed name to choose the output file name.
print(file_to_write)

07_nb_transpose_2959_all1_with_r_mon.csv


In [29]:
# This cell performs no work: it contains three bare string literals.
# The second one holds a disabled interim write of result_transposed_Full via
# savedata(...) to 'gcs://rdmai_dev_data/NW_Final/'; only the last literal is
# echoed as the cell output.
'''
    Due to a GCP error in handling large volume files, the dataset was written into GCP for interim purposes. 
    Otherwise, the following steps resulted in an error, requiring a kernel restart and causing loss of 
    datasets and memory variables, necessitating the execution of steps from the start cell.
'''

'''
showtime()
# Save the Transposed data - 24 Years of data Grouped and Transposed
savedata(result_transposed_Full, file_to_write, 'gcs://rdmai_dev_data/NW_Final/')
print("Saved file at: gcs://rdmai_dev_data/NW_Final/", file_to_write)
showtime()

#At this step - Pasu 7th Apr 2025
'''

'''
    Interim Write
'''

'\n    Interim Write\n'

In [None]:
showtime()
# Save the transposed data - 24 years of data, grouped and transposed.
# Legacy local write, kept for reference:
#result_transposed_Full.to_csv('../NW_DataPP/12_NB_10Transpose_2959_All2_With_R_Mon.csv', index=True)
# (The older 12_NB_10Transpose_2959_All2_*_R_MonLess_UL_C.csv output names are
#  superseded by the 08_* names below.)

# Output names written by this notebook.
transposed_name_with_r_mon = '08_nb_transpose_2959_all2_with_r_mon.csv'
transposed_name_without_r_mon = '08_nb_transpose_2959_all2_without_r_mon.csv'

# Choose the output name from the incoming file name.
# file_to_write is rebound here, so this cell can see its own result when it is
# re-run in the same kernel. The with-R_Mon output name is therefore accepted
# as well: without that, a second run would fall through to the else branch and
# save the with-R_Mon data under the "without_r_mon" name.
# Every other value maps exactly as before (to the without-R_Mon name).
if file_to_write in ('07_nb_transpose_2959_all1_with_r_mon.csv', transposed_name_with_r_mon):
    file_to_write = transposed_name_with_r_mon
else:
    file_to_write = transposed_name_without_r_mon

print ("Total Rows in Transposed dataset: ", len(result_transposed_Full))

# savedata is defined elsewhere in this notebook; it receives the frame, the
# chosen file name and the bucket root.
savedata(result_transposed_Full, file_to_write, 'gcs://rdmai_dev_data/')


showtime()

16 May 2025 13:34:43
Total Rows in Transposed dataset:  9278441
saved, Location:  gcs://rdmai_dev_data/cleansed/08_nb_transpose_2959_all2_with_r_mon.csv
16 May 2025 14:48:25


()

In [None]:
# Report how many determinand columns the transposed frame has, then list them.
transposed_columns = result_transposed_Full.columns
print(len(transposed_columns))
print(transposed_columns)

966
Index([   3,    4,    6,   28,   30,   31,   37,   39,   50,   52,
       ...
       9943, 9951, 9952, 9959, 9978, 9979, 9989, 9990, 9992, 9993],
      dtype='int64', name='determinand_notation', length=966)


In [None]:
# Hand memory back to the GCP compute: unbind the transposed frame, then run a
# garbage-collection pass (its return value is shown as the cell output).
# An earlier revision also deleted dfPall_outer, dfPall_outer_temp and
# dfPall_outer_temp1 here.
del result_transposed_Full
gc.collect()

21

In [None]:
#STEP 1: Use the 24 years WQ data for England gathered for Phosphate & Phosphorus family in the above step
# To carry out the Transpose operation - End
#==========================================================================================================

In [None]:
#dbutils.notebook.exit("End Workload - Scrip stopped")

In [37]:
#End CARD
#In line comments completed 10-May-2025

In [38]:
'''
    Theme: BB1C - Predicting Phosphate / Orthophosphate in UK Catchments Using AIML Models
    Pasupathi N, Lead Data Scientist, Cognizant - UK.
'''

'\n    Theme: BB1C - Predicting Phosphate / Orthophosphate in UK Catchments Using AIML Models\n    Pasupathi N, Lead Data Scientist, Cognizant - UK.\n'