# Install Required Packages if needed

In [1]:
#!pip install dataretrieval

# Import Libraries

In [172]:
import pandas as pd #for creating dataframe
import pyodbc #working with ODBC databases
import numpy as np # for locating values in dataframes
from datetime import datetime, timedelta # for obtaining today's date and converting time
import os #for working with directories
import dataretrieval.nwis as nwis # retrieving water level data

# Manage directories

In [2]:
# my default directory is c:\Users\bmilinic\OneDrive - DOI\Documents\Python\bemidji
defaultdirectory = os.getcwd()
print(defaultdirectory)

c:\Users\bmilinic\OneDrive - DOI\Documents\Python\bemidji


In [37]:
# connect to the shared drive which holds the databases and files (must be connected to VPN)
os.chdir('P:/0083/analysis/DataCompilation/DataCompilationPy/create_master_oil_levels')
os.getcwd()

'P:\\0083\\analysis\\DataCompilation\\DataCompilationPy\\create_master_oil_levels'

# Import data

In [38]:
# retrieve water levels from NWIS

# OLD WAY is to web scrape using notebook 01_inputfiles_prep.ipynb but it was only getting the newest value
# dfwl = pd.read_csv(r'../create_master_oil_levels/data_inputs/NWISwaterLevel_fromPy.csv')

# NEW WAY, use the nwis library and list of sites from access
# Getting site list straight from the database using PYODBC:
# FRONT END  
Gfe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_fe.accdb;')
c_fe = Gfe_db.cursor()
# BACK END
Gbe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_be.accdb;')
c_be = Gbe_db.cursor()

# DATA
# df_sites and site list
c_fe.execute('select USGS_siteno from tblSites')
df = pd.read_sql('select USGS_siteno from tblSites', Gfe_db)
# Drop none values, create a list out of the column, and create a string out of the column
df_sites = df.dropna()
list_sites = df_sites['USGS_siteno'].tolist()
# tables
tblOilLevels = pd.read_sql('select * from tblOilLevels', Gfe_db)
tblSites = pd.read_sql('select * from tblSites', Gfe_db)
tblWells = pd.read_sql('select * from tblWells', Gfe_db)
# cd tables
tblcd_WaterLevelStatusCode = pd.read_sql("select * from tblcd_WaterLevelStatusCode", Gfe_db)  
tblcd_FluidLevelMethodCode = pd.read_sql("select * from tblcd_FluidLevelMethodCode", Gbe_db) 
# oil tape corrections
tbl_OilTapeCorrections = pd.read_sql('select * from tbl_OilTapeCorrections', Gbe_db)

# From nwis
data = nwis.get_gwlevels(sites=list_sites, datetime_index=False) # creates a tuple of dataframe and associated metadata
dfwl = data[0].copy()

#From old gwsi bmj
bmj3 = pd.read_csv(r'../create_site_info_files/data_inputs/gwsi_old/bmj3_fromPy.csv')
bmjmp = pd.read_csv(r'../create_site_info_files/data_inputs/gwsi_old/bmj_mpnt_fromPy.csv')
bmjrmk = pd.read_csv(r'../create_site_info_files/data_inputs/gwsi_old/bmj_rmk_fromPy.csv')



In [39]:
# close the cursors and connections to the databases 
c_fe.close()
Gfe_db.close()
c_be.close()
Gbe_db.close()

In [53]:
# test
chk = tblOilLevels.loc[tblOilLevels['LocalSiteName'] =='0501']
chk

Unnamed: 0,OilLevelID,LocalSiteName,PersonMakingMeasurement,OilLevelDate,OilLevelTime,TimeDatum,OilLevel_ftBMP,OilLevelMethod,Oil_Tape_Serial_No,Comments_OilLevel,...,ARCHIVE_Hold_ft,ARCHIVE_Cut_ft,ARCHIVE_NWIS_WaterLevelMethod,ARCHIVE_MP_Elev_meters,ARCHIVE_WaterLevelElevation_meters,ARCHIVE_OilLevelElevation_ft,ARCHIVE_WaterLevelDepth_metersBLS,ARCHIVE_OilLevelDepth_metersBLS,ARCHIVE_BrentComment,ARCHIVE_Comments_OilLevel
50,20110,501,Nacaya Brown (UNO student) & Barbara Bekins,2018-06-25,14:30,CDT,22.72,Z,2939.0,Likely trapped oil in this old aquifer test pu...,...,,,Z,,,,,,,Likely trapped oil in this old aquifer test pu...
152,20172,501,"A. Berg, Josiah Trost",2019-06-18,14:44,CDT,21.5,Z,2939.0,Likely trapped oil in this old aquifer test pu...,...,,,Z,,,,,,,Likely trapped oil in this old aquifer test pu...
271,20049,501,A. Berg & J. Lund,2017-06-14,17:04,CDT,22.71,Z,2939.0,Screen not connected to true oil layer. Extrem...,...,,,Z,,,,,,,Screen not connected to true oil layer. Extrem...
19370,19314,501,,2005-06-27,12:01,,,Z,,,...,25.0,2.86,,,,,,,,
19571,19572,501,Z. Hillman & E. Berquist,2012-06-20,14:46,CDT,22.55,Z,2939.0,"The oil in this well is trapped, and not scree...",...,,,Z,,,,,,,"The oil in this well is trapped, and not scree..."
19742,19744,501,A. Berg & J. Julik,2013-07-15,11:38,CDT,22.41,Z,2939.0,"Black oil, watery",...,,,Z,,,,,,,"Black oil, watery;"
19809,19811,501,A. Berg & B. Mason,2014-07-15,11:14,CDT,21.78,Z,2939.0,"Oil is likely trapped in well, and not connect...",...,,,Z,,,,,,,"Oil is likely trapped in well, and not connect..."
19899,19988,501,A. Berg & A. Witt,2016-07-25,16:09,CDT,22.83,Z,2939.0,Screen not connected to true oil layer. Very d...,...,,,Z,,,,,,,Screen not connected to true oil layer. Very d...
19967,19930,501,"A. Berg, E. Coenen",2015-08-05,10:04,CDT,23.06,Z,2939.0,Oil is likely cut off from screen. Extremly da...,...,,,Z,,,,,,,Oil is likely cut off from screen. Extremly da...


In [54]:
# Inspect dfwl
display(dfwl.loc[dfwl['lev_status_cd'] == 'P'].head(2))
display(dfwl.lev_status_cd.value_counts())
dfwl.to_csv("outputs/tests/{}_test_NWIS_output.csv".format(str(datetime.now().date())), index=False)
print('Copy of NWIS output saved to', os.getcwd())

Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,lev_agency_cd,lev_dt_acy_cd,lev_acy_cd,lev_src_cd,lev_meth_cd,lev_age_cd
7141,USGS,473424095052889,GW,2012-05-10,15:47,UTC,21.82,,,P,USGS,m,2,S,V,A
7984,USGS,473425095052010,GW,2014-07-25,20:13,UTC,26.41,,,P,USGS,m,2,S,V,A


1    24395
8      336
D      321
P       12
6        4
F        1
Name: lev_status_cd, dtype: int64

Copy of NWIS output saved to P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels


In [58]:
dfwl.loc[dfwl['site_no']=='473425095052603']

Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,lev_agency_cd,lev_dt_acy_cd,lev_acy_cd,lev_src_cd,lev_meth_cd,lev_age_cd
10283,USGS,473425095052603,GW,2012-06-20,19:46,UTC,24.0,,,1,USGS,m,2,S,V,A
10284,USGS,473425095052603,GW,2013-07-15,16:38,UTC,23.84,,,1,USGS,m,2,S,V,A
10285,USGS,473425095052603,GW,2014-07-15,16:14,UTC,23.14,,,1,USGS,m,2,S,O,A
10286,USGS,473425095052603,GW,2015-08-05,15:04,UTC,24.43,,,1,USGS,m,2,S,V,A
10287,USGS,473425095052603,GW,2017-06-14,22:04,UTC,24.04,,,1,,m,2,,O,A
10288,USGS,473425095052603,GW,2018-06-25,19:30,UTC,24.04,,,8,,m,2,,O,A
10289,USGS,473425095052603,GW,2019-06-18,19:44,UTC,22.82,,,8,USGS,m,2,S,O,A


In [59]:
dfwl.dtypes

agency_cd         object
site_no           object
site_tp_cd        object
lev_dt            object
lev_tm            object
lev_tz_cd         object
lev_va           float64
sl_lev_va        float64
sl_datum_cd      float64
lev_status_cd     object
lev_agency_cd     object
lev_dt_acy_cd     object
lev_acy_cd        object
lev_src_cd        object
lev_meth_cd       object
lev_age_cd        object
dtype: object

In [60]:
# test delete
print(tblcd_WaterLevelStatusCode.value_counts())
print(dfwl.lev_status_cd.value_counts()) # has letters and numbers
print(tblcd_WaterLevelStatusCode.value_counts())
print(tblcd_WaterLevelStatusCode.lev_status_cd.value_counts()) # should take letters

lev_status_cd_ID  lev_status_cd  Comments_WaterLevelStatus                                                                    
1                 A              Water level was affected by atmospheric pressure.                                                1
13                M              Well was plugged and not in hydraulic contact with formation.                                    1
22                X              The water level was affected by stage in nearby surface-water site.                              1
21                W              The well was destroyed (no water level was recorded).                                            1
20                V              A foreign substance was present on the surface of the water.                                     1
19                T              A nearby site that taps the same aquifer had been pumped recently.                               1
18                S              A nearby site that taps the same aquifer was bei

In [61]:
# r line 34 : edit tblOilLevels. First inspect data
display(tblOilLevels.keys())
display(tblOilLevels.ExcludeFromPublishing.sample(5))
display(tblOilLevels.shape)

Index(['OilLevelID', 'LocalSiteName', 'PersonMakingMeasurement',
       'OilLevelDate', 'OilLevelTime', 'TimeDatum', 'OilLevel_ftBMP',
       'OilLevelMethod', 'Oil_Tape_Serial_No', 'Comments_OilLevel',
       'OilLevelDataSource', 'ExcludeFromPublishing', 'ExclusionReason',
       'ZeroThicknessCalcExclude', 'ARCHIVE_WaterLevel_metersBMP',
       'ARCHIVE_WaterLevelStatus', 'ARCHIVE_OilLevel_mBMP',
       'ARCHIVE_WaterLevel_ftBMP', 'ARCHIVE_OilLevel_ftBMP',
       'DELETE_Test Column Water level - Oil level', 'ARCHIVE_OilThickness_ft',
       'ARCHIVE_OilThickness_m', 'ARCHIVE_Hold_ft', 'ARCHIVE_Cut_ft',
       'ARCHIVE_NWIS_WaterLevelMethod', 'ARCHIVE_MP_Elev_meters',
       'ARCHIVE_WaterLevelElevation_meters', 'ARCHIVE_OilLevelElevation_ft',
       'ARCHIVE_WaterLevelDepth_metersBLS', 'ARCHIVE_OilLevelDepth_metersBLS',
       'ARCHIVE_BrentComment', 'ARCHIVE_Comments_OilLevel'],
      dtype='object')

1406    NaN
19977   NaN
12070   NaN
5102    NaN
5403    NaN
Name: ExcludeFromPublishing, dtype: float64

(19997, 32)

# Modify bmj.mpnt to get date ranges for MP heights

In [62]:
# r line 148
bmjmp2 = bmjmp.copy()
# r line 152 - convert columns to datetimes
bmjmp2['GWSI_MP_BeginDate'] = pd.to_datetime(bmjmp2['GWSI_MP_BeginDate'], format = '%Y%m%d')
bmjmp2['GWSI_MP_EndDate'] = pd.to_datetime(bmjmp2['GWSI_MP_EndDate'], format = '%Y%m%d')
# r line 160 - replace NaT EndDate values with today's date
today = pd.Timestamp.today().strftime('%Y-%m-%d')
bmjmp2['GWSI_MP_EndDate'] = bmjmp2['GWSI_MP_EndDate'].fillna(today)
# convert data type to make it mergable with tblOilLevels and 
# make bmjmp column mergeable 
bmjmp2['GWSI_USGS_siteno'] = bmjmp2['GWSI_USGS_siteno'].astype(str) 

In [63]:
# used to compare original df with new df
display(bmjmp.iloc[0:4, 0:4])
display(bmjmp2.iloc[0:4, 0:4])
bmjmp2.iloc[:, 0:4].dtypes

Unnamed: 0,GWSI_AgencyCode,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate
0,USGS,473429095051006,20160726.0,
1,USGS,473424095052912,20160802.0,
2,USGS,473424095052906,20160809.0,
3,USGS,473423095052902,20180625.0,


Unnamed: 0,GWSI_AgencyCode,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate
0,USGS,473429095051006,2016-07-26,2024-07-16
1,USGS,473424095052912,2016-08-02,2024-07-16
2,USGS,473424095052906,2016-08-09,2024-07-16
3,USGS,473423095052902,2018-06-25,2024-07-16


GWSI_AgencyCode              object
GWSI_USGS_siteno             object
GWSI_MP_BeginDate    datetime64[ns]
GWSI_MP_EndDate      datetime64[ns]
dtype: object

# Water level data from NWIS

In [223]:
# repetative but safe
dfwl = data[0].copy()
# inspect nan and other data before change
display(dfwl['lev_tm'].iloc[[0, 1, 7483, 7125]])

# r line 202
# replace blank times with 1701 which will be converted to 1201 pm or 1201 (matches field parameter approach)
dfwl['lev_tm'] = dfwl['lev_tm'].fillna('17:01')

# new: correct level time to match oil data (and data release) 
dfwl['lev_tm'] = pd.to_datetime(dfwl['lev_tm'], format='%H:%M') # convert str to dt
dfwl['lev_tm'] = dfwl['lev_tm'] - timedelta(hours=5) # sub 5 hours
dfwl['lev_tm'] = dfwl['lev_tm'].dt.strftime('%H:%M') # convert back to string

0       14:56
1       18:00
7483      NaN
7125    20:05
Name: lev_tm, dtype: object

In [225]:
# inspect data after change
display(dfwl['lev_tm'].iloc[[0, 1, 7483, 7125]])

0       09:56
1       13:00
7483    12:01
7125    15:05
Name: lev_tm, dtype: object

In [226]:
print(dfwl.lev_status_cd.dtypes)
print(dfwl.lev_status_cd.value_counts())
print(tblcd_WaterLevelStatusCode.lev_status_cd_ID.dtypes)
display(tblcd_WaterLevelStatusCode[['lev_status_cd_ID','Comments_WaterLevelStatus']])

object
1    24395
8      336
D      321
P       12
6        4
F        1
Name: lev_status_cd, dtype: int64
int64


Unnamed: 0,lev_status_cd_ID,Comments_WaterLevelStatus
0,1,Water level was affected by atmospheric pressure.
1,2,Water level was affected by tide stage.
2,3,Water level was affected by ice.
3,4,The site was dry (no water level is recorded).
4,5,The site was flowing recently.
5,6,The site was flowing. Water level or head coul...
6,7,A nearby site that taps the same aquifer was f...
7,8,A nearby site that taps the same aquifer had b...
8,9,Recharge water was being injected into the aqu...
9,10,A nearby site that taps the same aquifer was i...


In [227]:
# check for merge
print(dfwl['site_no'].dtype)
print(bmjmp2.GWSI_USGS_siteno.dtype)

object
object


In [228]:
dfwl2 = dfwl.copy()
# Convert column to make the NANs mergable
#dfwl2.lev_status_cd = dfwl2.lev_status_cd.astype('int64') # issue...can't convert NaN values...need new method
dfwl2.lev_status_cd = pd.to_numeric(dfwl2.lev_status_cd, errors='coerce')

# r line 210, merge dfs to attach descriptors
dfwl2 = pd.merge(dfwl2, tblcd_WaterLevelStatusCode.loc[:, ['lev_status_cd_ID', 'Comments_WaterLevelStatus']], how='left', left_on= 'lev_status_cd', right_on= 'lev_status_cd_ID')
dfwl2 = pd.merge(dfwl2, tblcd_FluidLevelMethodCode.loc[:, ['lev_meth_cd', 'Comments_FluidLevelMethod']], how='left', on= 'lev_meth_cd')
print(dfwl2.shape)

# r line 220 merge with mpnt table to get current mp height
dfwl2['site_no'] = dfwl2['site_no'].astype(str)  # convert column to integer to make it mergible
dfwl2 = pd.merge(dfwl2, bmjmp2.loc[:, ["GWSI_USGS_siteno","GWSI_MP_BeginDate","GWSI_MP_EndDate","GWSI_MP_height_ft"]], how = 'left', left_on='site_no', right_on='GWSI_USGS_siteno')
print(dfwl2.shape)


(25108, 19)
(35776, 23)


In [229]:
dfwl

Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,lev_agency_cd,lev_dt_acy_cd,lev_acy_cd,lev_src_cd,lev_meth_cd,lev_age_cd
0,USGS,473356095043701,ST,2019-06-28,09:56,UTC,0.00,,,,USGS,m,1,S,D,A
1,USGS,473356095043701,ST,2020-08-06,13:00,UTC,-1.60,,,,USGS,m,1,S,D,A
2,USGS,473356095043701,ST,2021-08-19,11:00,UTC,-0.16,,,,USGS,m,0,S,D,A
3,USGS,473356095043701,ST,2022-08-25,09:08,UTC,-1.06,,,,USGS,m,1,S,D,A
4,USGS,473356095043701,ST,2023-06-22,10:30,UTC,-1.18,,,,USGS,m,0,S,D,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25103,USGS,473440095063001,GW,1992-06-16,12:01,UTC,1.60,,,1,USGS,D,2,S,V,A
25104,USGS,473503095044501,WE,2020-08-06,13:00,UTC,0.00,,,,USGS,m,1,S,D,A
25105,USGS,473503095044501,WE,2021-08-20,09:58,UTC,1.83,,,,USGS,m,1,S,D,A
25106,USGS,473503095044501,WE,2022-08-25,10:42,UTC,-0.03,,,,USGS,m,1,S,D,P


In [230]:
# INSPECT DATA: lev_dt should be between begin and end dates
dfwl2.loc[:, ['GWSI_MP_BeginDate', 'lev_dt', 'GWSI_MP_EndDate']]

Unnamed: 0,GWSI_MP_BeginDate,lev_dt,GWSI_MP_EndDate
0,2019-06-28,2019-06-28,2024-07-16
1,2019-06-28,2020-08-06,2024-07-16
2,2019-06-28,2021-08-19,2024-07-16
3,2019-06-28,2022-08-25,2024-07-16
4,2019-06-28,2023-06-22,2024-07-16
...,...,...,...
35771,1984-10-17,1992-06-16,2024-07-16
35772,NaT,2020-08-06,NaT
35773,NaT,2021-08-20,NaT
35774,NaT,2022-08-25,NaT


In [231]:
# r script 227 
# INSPECT DATA: check to see where rows for MP height are not valid based on date
chk = dfwl2[
    ~(dfwl2['lev_dt'] >= dfwl2['GWSI_MP_BeginDate']) |
    ~(dfwl2['lev_dt'] <= dfwl2['GWSI_MP_EndDate'])
    ]

chk.loc[:, ['site_no', 'GWSI_MP_BeginDate', 'lev_dt', 'GWSI_MP_EndDate']]

print('The number of rows lost is...', chk.shape[0])

The number of rows lost is... 10713


Only select rows where the MP height is valid for the date the level was taken

In [232]:
# r script 227 too
# select only the rows where the MP height is valid for the date the level was taken (make sure this is after)
print(dfwl2.shape)
dfwl2 = dfwl2[
    (dfwl2['lev_dt'] >= dfwl2['GWSI_MP_BeginDate']) &
    (dfwl2['lev_dt'] <= dfwl2['GWSI_MP_EndDate'])
    ]
print(dfwl2.shape)

(35776, 23)
(25063, 23)


In [233]:
# INSPECT the updated data
dfwl2.loc[:, ['GWSI_MP_BeginDate', 'lev_dt', 'GWSI_MP_EndDate']]

Unnamed: 0,GWSI_MP_BeginDate,lev_dt,GWSI_MP_EndDate
0,2019-06-28,2019-06-28,2024-07-16
1,2019-06-28,2020-08-06,2024-07-16
2,2019-06-28,2021-08-19,2024-07-16
3,2019-06-28,2022-08-25,2024-07-16
4,2019-06-28,2023-06-22,2024-07-16
...,...,...,...
35763,2004-12-01,2024-05-04,2024-07-16
35768,1984-10-17,1987-05-05,2024-07-16
35769,1984-10-17,1989-06-01,2024-07-16
35770,1984-10-17,1989-10-29,2024-07-16


In [234]:
# r script 244
# create average water level by site data frame for determining water table wells
# Lev.avg used later
wl_avg = dfwl2.groupby('site_no')['lev_va'].mean().reset_index(name='mean_lev_va') # note, .reset_index gives the series an index and turns it into a data frame

In [235]:
# r line 250
# INSPECT: check for NaN
# this could be due to a well being dry or affected by ice
display(wl_avg[wl_avg.mean_lev_va.isna()])
# check these rows
dfwl2.loc[(dfwl2['site_no'] == '473416095052601')|
          (dfwl2['site_no'] == '473420095051601')|
          (dfwl2['site_no'] == '473420095052501')]

#Note what makes this .loc unique...based on individual values inside a cell rather than columns

Unnamed: 0,site_no,mean_lev_va
12,473416095052601,


Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,...,lev_src_cd,lev_meth_cd,lev_age_cd,lev_status_cd_ID,Comments_WaterLevelStatus,Comments_FluidLevelMethod,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft
480,USGS,473416095052601,GW,1990-10-10,12:01,UTC,,,,,...,S,V,A,,,Calibrated electric tape – accuracy of instrum...,473416095052601,1990-06-24,2024-07-16,3.13
481,USGS,473416095052601,GW,1990-10-26,12:01,UTC,,,,,...,S,V,A,,,Calibrated electric tape – accuracy of instrum...,473416095052601,1990-06-24,2024-07-16,3.13
482,USGS,473416095052601,GW,1991-03-09,12:01,UTC,,,,,...,S,V,A,,,Calibrated electric tape – accuracy of instrum...,473416095052601,1990-06-24,2024-07-16,3.13
483,USGS,473416095052601,GW,1992-06-16,12:01,UTC,,,,,...,S,V,A,,,Calibrated electric tape – accuracy of instrum...,473416095052601,1990-06-24,2024-07-16,3.13
3110,USGS,473420095051601,GW,1983-05-25,12:01,UTC,24.21,,,1.0,...,S,S,A,1.0,Water level was affected by atmospheric pressure.,Steel-tape,473420095051601,1983-05-24,2010-03-08,3.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4461,USGS,473420095052501,GW,1994-12-08,12:01,UTC,5.13,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473420095052501,1989-10-26,2024-07-16,1.21
4463,USGS,473420095052501,GW,1995-01-03,12:01,UTC,5.52,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473420095052501,1989-10-26,2024-07-16,1.21
4465,USGS,473420095052501,GW,1995-03-17,12:01,UTC,5.08,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473420095052501,1989-10-26,2024-07-16,1.21
4467,USGS,473420095052501,GW,1995-04-28,12:01,UTC,4.94,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473420095052501,1989-10-26,2024-07-16,1.21


Checking negative mean levels and looking at their sites to determine if they are in a lake or near a wetlands so that negative levels are ok

In [236]:
# INSPECTION
# r line 255 checking negative mean levels
wl_neg = dfwl2.loc[dfwl2['lev_va'] < 0].drop_duplicates(subset = 'site_no')

# r line 260
tblSites.loc[tblSites['USGS_siteno'].isin(wl_neg['site_no']), 'LocalSiteName'] # returns only rows where condition is true and the LocalSiteName column

239               WL02
240               WL03
241               WL04
242     Grant Creek 01
243             WL01-2
244               LK01
925                313
981                426
983               501A
984               501B
985               501C
1124              9003
1132              9011
Name: LocalSiteName, dtype: object

Using bmj3 data

In [237]:
# r line 323. Determine if wells are water table wells (ie screened interval within 1 m of average water level)
# merge bmj and dfwl2
print(bmj3.GWSI_USGS_siteno.dtypes, wl_avg.site_no.dtypes) # appears the dtypes are dif 
bmj3.GWSI_USGS_siteno = bmj3.GWSI_USGS_siteno.astype(str)
print(bmj3.GWSI_USGS_siteno.dtypes, wl_avg.site_no.dtypes)
bmj3_2 = pd.merge(bmj3, wl_avg, how='left', left_on ='GWSI_USGS_siteno', right_on = 'site_no')

object object
object object


In [238]:
# INSPECT
# check how many rows don't have screen info
print(bmj3_2.loc[bmj3_2['GWSI_TopOfScreenDepth_ftBLS'].isna()].shape[0])
print(bmj3_2.loc[bmj3_2['GWSI_BottomOfScreenDepth_ftBLS'].isna()].shape[0])

104
104


In [240]:
# INSPECT (for Andrew)
# r 340 make a list of sites that didn't come out in a gwsi retrieval for Andrew to check:
# make boolean mask to indicate where tblSites is not na for two columns and merge that with bmj3_2
GWSI_not_retrieved = pd.merge(tblSites.loc[tblSites['USGS_siteno'].notna(), ['USGS_siteno', 'LocalSiteName']], bmj3_2, left_on='USGS_siteno', right_on='GWSI_USGS_siteno', how='outer', indicator=True)
# create df where only data in left exists (shows where data on right is missing)
GWSI_not_retrieved = GWSI_not_retrieved.loc[GWSI_not_retrieved['_merge'] == 'left_only', ['USGS_siteno', 'LocalSiteName']]
# drops old index and replaces it with a new default one (meaning 0,1,2...)
GWSI_not_retrieved = GWSI_not_retrieved.reset_index(drop=True)
today = str(datetime.now().date())
GWSI_not_retrieved.to_csv(f'DataChecks/{today}_GWSI_not_retrieved.csv', index = False) 
print('File location: ', os.getcwd()+'\DataChecks')

File location:  P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels\DataChecks


In [241]:
# create a new column and set it to NA
bmj3_2['WaterTableWell'] = pd.NA

# assign 'Y' to wells that meet the criteria. Select rows where criteria is met...and change WaterTableWell column to 'Y' if it does
# 3.2808... is the screened interval value in ft aka 1 m
bmj3_2.loc[(bmj3_2['mean_lev_va'] < bmj3_2['GWSI_BottomOfScreenDepth_ftBLS']) &   # check that mean elevation falls within the screened interval
            (bmj3_2['mean_lev_va'] > bmj3_2['GWSI_TopOfScreenDepth_ftBLS']) | 
            (bmj3_2['mean_lev_va'] > bmj3_2['GWSI_TopOfScreenDepth_ftBLS'] - 3.2808399) & # check that mean elevation is within 1 m above the top of screened interval
            (bmj3_2['mean_lev_va'] < bmj3_2['GWSI_TopOfScreenDepth_ftBLS']) | 
            (bmj3_2['mean_lev_va'] < bmj3_2['GWSI_BottomOfScreenDepth_ftBLS'] + 3.2808399) & # check taht mean elevation is within 1 m below the bottom of the screened interval
            (bmj3_2['mean_lev_va'] > bmj3_2['GWSI_BottomOfScreenDepth_ftBLS']), 
            'WaterTableWell'] = 'Y' # if any set of that conditional statements are true, na is set to 'Y'

In [242]:
# INSPECT
# r 353 check number of water table wells
# .size() checks the number of times a value appears and resetting the index just sets it back to zero and creates a column called count
bmj3_2.groupby('WaterTableWell').size().reset_index(name='count')

Unnamed: 0,WaterTableWell,count
0,Y,149


In [243]:
# INSPECT
# r 357 check the logic for assigning water table status
# 0 means logic works ok

# Filter for wells with "Y" status to check logic (should be zero)
filter1 = (bmj3_2["WaterTableWell"]=="Y") & (bmj3_2["GWSI_TopOfScreenDepth_ftBLS"]-bmj3_2["mean_lev_va"]>3.2808)
print(bmj3_2[filter1].shape[0])

# Assign "N" status where logic fails
filter2 = bmj3_2["GWSI_TopOfScreenDepth_ftBLS"]-bmj3_2["mean_lev_va"]>3.2808
bmj3_2.loc[filter2, "WaterTableWell"] = "N" # Assign 'N' to WaterTableWell if Filter is true

# Group and count number of wells with each status
chk = bmj3_2.groupby('WaterTableWell').agg('count')
chk[['GWSI_USGS_siteno', 'site_no']]


0


Unnamed: 0_level_0,GWSI_USGS_siteno,site_no
WaterTableWell,Unnamed: 1_level_1,Unnamed: 2_level_1
N,126,126
Y,149,149


In [244]:
# r 367 check other site types
print(bmj3_2['GWSI_GWSISiteType'].unique())
# number of rows in the data where column is equal to WE or LK or ST
print(bmj3_2[(bmj3_2['GWSI_GWSISiteType'] == 'WE') | 
             (bmj3_2['GWSI_GWSISiteType'] == 'LK') | 
             (bmj3_2['GWSI_GWSISiteType'] == 'ST')].shape[0]) 

#all of these site types are surface-water and so appropriate for 
#water table mapping, assign "Y" in water table well column
bmj3_2.loc[(bmj3_2['GWSI_GWSISiteType'] == 'WE') | 
           (bmj3_2['GWSI_GWSISiteType'] == 'LK') | 
           (bmj3_2['GWSI_GWSISiteType'] == 'ST'), # for rows that equal WE, LK, or ST
           'WaterTableWell'] = 'Y'                # and only the WaterTableWell column...set it equal to 'Y'

bmj3_2.groupby('WaterTableWell').size().reset_index(name='counts') # another way of creating this same output as before

['GW' 'SB-UZ' 'WE' 'ST' 'LK' 'GW-TH']
9


Unnamed: 0,WaterTableWell,counts
0,N,126
1,Y,158


In [245]:
# r 380 create new columns with mid values (new method)
bmj3_tmp = bmj3_2.copy()
bmj3_tmp['MidOfScreenDepth_ft'] = round(0.5*(bmj3_tmp['GWSI_TopOfScreenDepth_ftBLS']+bmj3_tmp['GWSI_BottomOfScreenDepth_ftBLS']), 2)
bmj3_tmp['MidOfScreenDepth_m'] = round(bmj3_tmp['MidOfScreenDepth_ft']*0.3048, 3)
bmj3_tmp['MidOfScreenElevation_ftASL_NAVD88'] = bmj3_tmp['GWSI_LandSurfaceAltitude_ftASL_NAVD88']-bmj3_tmp['MidOfScreenDepth_ft']
bmj3_tmp['MidOfScreenElevation_mASL_NAVD88'] = round(bmj3_tmp['MidOfScreenElevation_ftASL_NAVD88']*0.3048, 3)
bmj3_tmp = bmj3_tmp.loc[:,['GWSI_USGS_siteno', 
                           'GWSI_LandSurfaceAltitude_ftASL_NAVD88', 
                           'GWSI_LandSurfaceAltitude_mASL_NAVD88', 
                           'MidOfScreenDepth_ft', 
                           'MidOfScreenDepth_m', 
                           'MidOfScreenElevation_ftASL_NAVD88', 
                           'MidOfScreenElevation_mASL_NAVD88',
                           'WaterTableWell']]
print(bmj3_tmp.keys())

Index(['GWSI_USGS_siteno', 'GWSI_LandSurfaceAltitude_ftASL_NAVD88',
       'GWSI_LandSurfaceAltitude_mASL_NAVD88', 'MidOfScreenDepth_ft',
       'MidOfScreenDepth_m', 'MidOfScreenElevation_ftASL_NAVD88',
       'MidOfScreenElevation_mASL_NAVD88', 'WaterTableWell'],
      dtype='object')


Moving on to tbl_OilTapeCorrection

In [246]:
# INSPECT
# r line 73 : edit tbl_OilTapeCorrections (tape.core)
print(tbl_OilTapeCorrections.shape)
display(tbl_OilTapeCorrections.StartDate.sample(3))
print(tbl_OilTapeCorrections.EndDate)

(10, 11)


5   2019-10-30
6   2019-10-30
8   2018-06-26
Name: StartDate, dtype: datetime64[ns]

0   2016-11-17
1   2018-06-24
2   2018-06-24
3          NaT
4          NaT
5          NaT
6          NaT
7   2018-06-25
8   2019-10-19
9   2019-10-19
Name: EndDate, dtype: datetime64[ns]


In [247]:
tbl_OilTapeCorrections2 = tbl_OilTapeCorrections.copy()
tbl_OilTapeCorrections2['EndDate'] = tbl_OilTapeCorrections2['EndDate'].fillna(pd.Timestamp.today()) # replace unknown end dates with today's date. 
tbl_OilTapeCorrections2['EndDate'] = tbl_OilTapeCorrections2['EndDate'].dt.date # changes datetime obj to string (I think) to remove the hr, min, sec 
tbl_OilTapeCorrections2.EndDate

0    2016-11-17
1    2018-06-24
2    2018-06-24
3    2024-07-17
4    2024-07-17
5    2024-07-17
6    2024-07-17
7    2018-06-25
8    2019-10-19
9    2019-10-19
Name: EndDate, dtype: object

## tbloillevels

In [248]:
tblOilLevels2 = tblOilLevels.copy()
tblOilLevels2['OilLevelDate'] = pd.to_datetime(tblOilLevels['OilLevelDate'], format = '%Y-%m-%d') # make sure this col is datatime
tblOilLevels2 = tblOilLevels2[tblOilLevels2['ExcludeFromPublishing'] != 1] # only keep rows that are not excluded from publishing
tblOilLevels2 = tblOilLevels2.loc[:, ~tblOilLevels2.columns.str.startswith('Excl')]
tblOilLevels2 = tblOilLevels2.loc[:, ~tblOilLevels2.columns.str.startswith('ARCHIVE')]
tblOilLevels2 = tblOilLevels2.loc[:, ~tblOilLevels2.columns.str.startswith('DELETE')]

In [319]:
# Inspect DELETE
# No rounding issue
display(tblOilLevels[['LocalSiteName','OilLevelTime', 'OilLevel_ftBMP']].loc[tblOilLevels['LocalSiteName']=='0501'])

Unnamed: 0,LocalSiteName,OilLevelTime,OilLevel_ftBMP
50,501,14:30,22.72
152,501,14:44,21.5
271,501,17:04,22.71
19370,501,12:01,
19571,501,14:46,22.55
19742,501,11:38,22.41
19809,501,11:14,21.78
19899,501,16:09,22.83
19967,501,10:04,23.06


In [249]:
# INSPECT
print("Old number of rows then columns:", tblOilLevels.shape, "New number of rows then columns:", tblOilLevels2.shape)
# check if all rows have a valid date value
display(tblOilLevels2.loc[tblOilLevels2['OilLevelDate'].isna(), :])
# check if all rows have a valid date value
display(tblOilLevels2.loc[tblOilLevels2['OilLevelTime'].isna(), :])

Old number of rows then columns: (19997, 32) New number of rows then columns: (19525, 12)


Unnamed: 0,OilLevelID,LocalSiteName,PersonMakingMeasurement,OilLevelDate,OilLevelTime,TimeDatum,OilLevel_ftBMP,OilLevelMethod,Oil_Tape_Serial_No,Comments_OilLevel,OilLevelDataSource,ZeroThicknessCalcExclude
2572,20272,,,NaT,,,0.0,,,,,


Unnamed: 0,OilLevelID,LocalSiteName,PersonMakingMeasurement,OilLevelDate,OilLevelTime,TimeDatum,OilLevel_ftBMP,OilLevelMethod,Oil_Tape_Serial_No,Comments_OilLevel,OilLevelDataSource,ZeroThicknessCalcExclude
2572,20272,,,NaT,,,0.0,,,,,
2618,20318,9017.0,,2021-08-11,,,0.0,,,,,


In [250]:
# INSPECT
# Compare to tblsites
display(tblOilLevels2.LocalSiteName)
display(tblSites.USGS_siteno)

0         315
1         317
2         319
3         411
4        420A
         ... 
19992     319
19993     411
19994    421B
19995    301A
19996     315
Name: LocalSiteName, Length: 19525, dtype: object

0                  None
1                  None
2                  None
3                  None
4                  None
             ...       
1724               None
1725    473424095053001
1726               None
1727               None
1728               None
Name: USGS_siteno, Length: 1729, dtype: object

In [251]:
#merge tblOilLevels with tblSites to get USGS_siteno r line 80
tblOilLevels3 = pd.merge(tblOilLevels2, tblSites.loc[:,['LocalSiteName', 'USGS_siteno',"AgencyCode","XcoordUTMNAD83_m","YcoordUTMNAD83_m","OnNWT","NWTPosition_m"]], how = 'left')
print("Old number rows/columns:", tblOilLevels2.shape, "New number rows/columns:", tblOilLevels3.shape)
tblOilLevels3 = pd.merge(tblOilLevels3, tblWells.loc[:,['LocalSiteName', 'LocalUseCode']], how = 'left')
print("Newest number rows/columns:", tblOilLevels3.shape)

Old number rows/columns: (19525, 12) New number rows/columns: (19525, 18)
Newest number rows/columns: (19525, 19)


In [252]:
# INSPECT
# check which records don't have USGS_siteno
display(tblOilLevels3.loc[tblOilLevels3['USGS_siteno'].isna(), ['LocalSiteName', 'USGS_siteno']])
display(tblOilLevels3.loc[tblOilLevels3['USGS_siteno'].notna(), ['LocalSiteName', 'USGS_siteno']])

# check which records don't have LocaluseCode
display(tblOilLevels3.loc[tblOilLevels3['LocalUseCode'].isna(), ['LocalSiteName', 'LocalUseCode']])
display(tblOilLevels3.loc[tblOilLevels3['LocalUseCode'].notna(), ['LocalSiteName', 'LocalUseCode']])

Unnamed: 0,LocalSiteName,USGS_siteno
2568,,


Unnamed: 0,LocalSiteName,USGS_siteno
0,315,473426095052530
1,317,473425095052009
2,319,473425095052504
3,411,473425095052605
4,420A,473426095052419
...,...,...
19520,319,473425095052504
19521,411,473425095052605
19522,421B,473425095052611
19523,301A,473426095052526


Unnamed: 0,LocalSiteName,LocalUseCode
2568,,
4269,LG,
4270,LG,
4271,LG,
4272,LG,
4273,LG,
4274,LG,
4275,LG,
4276,LG,
4277,LG,


Unnamed: 0,LocalSiteName,LocalUseCode
0,315,wo
1,317,wo
2,319,wo
3,411,wo
4,420A,wo
...,...,...
19520,319,wo
19521,411,wo
19522,421B,wo
19523,301A,wo


In [253]:
# INSPECT
print("Old number of rows then columns:", tblOilLevels2.shape, "New number of rows then columns:", tblOilLevels3.shape)

Old number of rows then columns: (19525, 12) New number of rows then columns: (19525, 19)


DUH merge ONLY localSiteName and USGS_siteno from tblsites rather than merge ON those columns from the two dfs.

In [254]:
# r line 93 select only the oil wells (LocalUseCode=="wo") and wells with a USGS site ID
tblOilLevels4 = tblOilLevels3[(tblOilLevels3['USGS_siteno'].notna()) & (tblOilLevels3['LocalUseCode']=='wo')]
print(tblOilLevels4.shape)

#rearrange order of columns, move last 2 columns from the join to the front
cols = tblOilLevels4.columns.tolist()      # create list of column names
new_cols = cols[-2:] + cols[:-2]           # change order by moving last 2 to the front
tblOilLevels4 = tblOilLevels4[new_cols]    # apply change to dataframe (create new one technically)

(4715, 19)


## Apply Tape Corrections to OilLevel_ftBMP

In [255]:
tblOilLevels5 = pd.merge(tblOilLevels4, tbl_OilTapeCorrections2, how = 'left', on = 'Oil_Tape_Serial_No')
tblOilLevels5.keys()

Index(['NWTPosition_m', 'LocalUseCode', 'OilLevelID', 'LocalSiteName',
       'PersonMakingMeasurement', 'OilLevelDate', 'OilLevelTime', 'TimeDatum',
       'OilLevel_ftBMP', 'OilLevelMethod', 'Oil_Tape_Serial_No',
       'Comments_OilLevel', 'OilLevelDataSource', 'ZeroThicknessCalcExclude',
       'USGS_siteno', 'AgencyCode', 'XcoordUTMNAD83_m', 'YcoordUTMNAD83_m',
       'OnNWT', 'TapeCorrID', 'HIF_ID', 'StartDate', 'EndDate',
       'StartDistance_ft', 'EndDistance_ft', 'GWSI_Tape_Correction_ft',
       'OilTapeModelMakeType', 'Source', 'Comments_TapeCorrection'],
      dtype='object')

In [256]:
# r line 127
tblOilLevels6 = tblOilLevels5[
    (tblOilLevels5['Oil_Tape_Serial_No'].isna()) | 
    (tblOilLevels5['OilLevel_ftBMP'].isna()) | 
    (tblOilLevels5['OilLevelDate'] >= tblOilLevels5['StartDate']) &
    (tblOilLevels5['OilLevelDate'] <= tblOilLevels5['EndDate']) &
    (tblOilLevels5['OilLevel_ftBMP'] >= tblOilLevels5['StartDistance_ft']) &
    (tblOilLevels5['OilLevel_ftBMP'] <= tblOilLevels5['EndDistance_ft'])
    ].drop_duplicates(subset=['OilLevelID'], keep = 'first')
# r line 131 
tblOilLevels6 = tblOilLevels6.drop(columns=[
    'TapeCorrID',
    'StartDate',
    'EndDate',
    'StartDistance_ft',
    'EndDistance_ft',
    'OilTapeModelMakeType',
    'Source',
    'Comments_TapeCorrection'    
])
# r line 132
tblOilLevels6['GWSI_Tape_Correction_ft'] = tblOilLevels6['GWSI_Tape_Correction_ft'].fillna(0)
# r line 133
tblOilLevels6['OilLevel_ftBMP_corr'] = tblOilLevels6['OilLevel_ftBMP'] + tblOilLevels6['GWSI_Tape_Correction_ft']

## Join oillevels with mpnt table

In [257]:
# r line 163 join OilLevels with MPNT table
# join
tblOilLevels7 = pd.merge(tblOilLevels6, bmjmp2.loc[:, ['GWSI_USGS_siteno', 'GWSI_MP_BeginDate', 'GWSI_MP_EndDate', 'GWSI_MP_height_ft']], how='left', left_on='USGS_siteno', right_on='GWSI_USGS_siteno')
# filter out rows where MP height is invalid for the date the level was taken
tblOilLevels7 = tblOilLevels7[
    (tblOilLevels7['OilLevelDate'] >= tblOilLevels7['GWSI_MP_BeginDate']) &
    (tblOilLevels7['OilLevelDate'] <= tblOilLevels7['GWSI_MP_EndDate'])
    ]

In [258]:
# INSPECT
# check row count
print("Number of rows before:", tblOilLevels6.shape[0])
print("Number of rows after:",tblOilLevels7.shape[0])
# another check that is similar
chk = tblOilLevels6.merge(tblOilLevels7, on='OilLevelID', how='outer', indicator=True) # merge all data with indicator that should if merge included both or some only 1 dataset
chk1 = chk[chk['_merge'] == 'left_only']
chk2 = chk[chk['_merge'] == 'right_only']  
print('Data from left only:', chk1.shape[0], 'Data from right only:', chk2.shape[0])

Number of rows before: 4715
Number of rows after: 4715
Data from left only: 0 Data from right only: 0


# Insert MP heights into tblOilLevels and calc oil level

In [259]:
# r code 180
tblOilLevels8 = tblOilLevels7.copy()
# calculate oil level
tblOilLevels8['OilLevel_ftBLS'] = tblOilLevels8['OilLevel_ftBMP_corr'] - tblOilLevels8['GWSI_MP_height_ft']
# r line 183 merge fluid level method
tblOilLevels8 = tblOilLevels8.merge(tblcd_FluidLevelMethodCode.loc[:, ['lev_meth_cd', 'Comments_FluidLevelMethod']], left_on='OilLevelMethod', right_on='lev_meth_cd', how='left') # forgot the how = left
# needed for the merge around r line 425
tblOilLevels8['OilLevelDate'] = tblOilLevels8['OilLevelDate'].dt.strftime('%Y-%m-%d')

In [322]:
# INSPECT DELETE
# no rounding error
display(tblOilLevels8[['LocalSiteName','OilLevelTime', 'OilLevel_ftBMP', 'OilLevel_ftBLS']].loc[tblOilLevels8['LocalSiteName']=='0501'])

Unnamed: 0,LocalSiteName,OilLevelTime,OilLevel_ftBMP,OilLevel_ftBLS
50,501,14:30,22.72,20.19
152,501,14:44,21.5,18.99
270,501,17:04,22.71,20.2
4112,501,12:01,,
4294,501,14:46,22.55,20.04
4465,501,11:38,22.41,19.9
4530,501,11:14,21.78,19.27
4618,501,16:09,22.83,20.32
4685,501,10:04,23.06,20.55


# Create master list

In [260]:
# INSPECTION
# r line 263 
#check each data set for duplicated values in USGS station ID,
#sample date and sample time columns.  
chk = tblOilLevels8.groupby(['USGS_siteno', 'OilLevelDate', 'OilLevelTime']).filter(lambda x: len(x) > 1)
print('For oil levels:', chk.shape[0]) # if zero, there are no duplicates

# r line 271
chk = dfwl2.groupby(['site_no', 'lev_dt', 'lev_tm']).filter(lambda x: len(x) > 1)
print('For water levels:', chk.shape[0])
display(chk.head(3))

For oil levels: 0
For water levels: 0


Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,...,lev_src_cd,lev_meth_cd,lev_age_cd,lev_status_cd_ID,Comments_WaterLevelStatus,Comments_FluidLevelMethod,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft


In [261]:
# r line 285 Rename columns and select columns in oil levels
oil_levID = tblOilLevels8.rename(columns={'OilLevelDate': 'LevelDate', 'OilLevelTime': 'LevelTime'})
display(oil_levID.head(2))

Unnamed: 0,NWTPosition_m,LocalUseCode,OilLevelID,LocalSiteName,PersonMakingMeasurement,LevelDate,LevelTime,TimeDatum,OilLevel_ftBMP,OilLevelMethod,...,HIF_ID,GWSI_Tape_Correction_ft,OilLevel_ftBMP_corr,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft,OilLevel_ftBLS,lev_meth_cd,Comments_FluidLevelMethod
0,-4.02,wo,20059,315,A. Berg & J. Lund,2017-06-14,17:01,CDT,22.94,Z,...,HIF-242570,0.0,22.94,473426095052530,1983-07-01,2024-07-16,2.52,20.42,Z,Other
1,27.55,wo,20060,317,A. Berg & J. Lund,2017-06-14,16:25,CDT,31.08,Z,...,HIF-242570,0.0,31.08,473425095052009,1983-07-02,2024-07-16,1.86,29.22,Z,Other


In [262]:
# rename columns
wat_levID = dfwl2.rename(columns={'site_no': 'USGS_siteno', 'lev_dt': 'LevelDate', 'lev_tm': 'LevelTime'})
display(wat_levID.head(2))

Unnamed: 0,agency_cd,USGS_siteno,site_tp_cd,LevelDate,LevelTime,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,...,lev_src_cd,lev_meth_cd,lev_age_cd,lev_status_cd_ID,Comments_WaterLevelStatus,Comments_FluidLevelMethod,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft
0,USGS,473356095043701,ST,2019-06-28,09:56,UTC,0.0,,,,...,S,D,A,,,Differential Global Positioning System. This c...,473356095043701,2019-06-28,2024-07-16,0.0
1,USGS,473356095043701,ST,2020-08-06,13:00,UTC,-1.6,,,,...,S,D,A,,,Differential Global Positioning System. This c...,473356095043701,2019-06-28,2024-07-16,0.0


In [263]:
# INSPECT
print(oil_levID.keys())
print(oil_levID.shape)
print(wat_levID.keys())
print(wat_levID.shape)

Index(['NWTPosition_m', 'LocalUseCode', 'OilLevelID', 'LocalSiteName',
       'PersonMakingMeasurement', 'LevelDate', 'LevelTime', 'TimeDatum',
       'OilLevel_ftBMP', 'OilLevelMethod', 'Oil_Tape_Serial_No',
       'Comments_OilLevel', 'OilLevelDataSource', 'ZeroThicknessCalcExclude',
       'USGS_siteno', 'AgencyCode', 'XcoordUTMNAD83_m', 'YcoordUTMNAD83_m',
       'OnNWT', 'HIF_ID', 'GWSI_Tape_Correction_ft', 'OilLevel_ftBMP_corr',
       'GWSI_USGS_siteno', 'GWSI_MP_BeginDate', 'GWSI_MP_EndDate',
       'GWSI_MP_height_ft', 'OilLevel_ftBLS', 'lev_meth_cd',
       'Comments_FluidLevelMethod'],
      dtype='object')
(4715, 29)
Index(['agency_cd', 'USGS_siteno', 'site_tp_cd', 'LevelDate', 'LevelTime',
       'lev_tz_cd', 'lev_va', 'sl_lev_va', 'sl_datum_cd', 'lev_status_cd',
       'lev_agency_cd', 'lev_dt_acy_cd', 'lev_acy_cd', 'lev_src_cd',
       'lev_meth_cd', 'lev_age_cd', 'lev_status_cd_ID',
       'Comments_WaterLevelStatus', 'Comments_FluidLevelMethod',
       'GWSI_USGS_siten

In [264]:
# r 303
# make one df with all rows for both water and oil
levID = pd.concat([
    wat_levID[['USGS_siteno', 'LevelDate', 'LevelTime']], 
    oil_levID[['USGS_siteno', 'LevelDate', 'LevelTime']]
    ], 
    ignore_index=True)
# remove dubs
levID = levID.drop_duplicates()
print(levID.shape)
display(levID)



(25581, 3)


Unnamed: 0,USGS_siteno,LevelDate,LevelTime
0,473356095043701,2019-06-28,09:56
1,473356095043701,2020-08-06,13:00
2,473356095043701,2021-08-19,11:00
3,473356095043701,2022-08-25,09:08
4,473356095043701,2023-06-22,10:30
...,...,...,...
29740,473425095052702,2015-08-05,10:23
29747,473420095052301,2015-08-05,10:36
29752,473426095052526,2015-10-22,12:30
29767,473426095052414,2019-10-31,12:00


In [265]:
# missing lev_va
display(dfwl2.loc[dfwl2['site_no']=='473425095052603'])
display(wat_levID.loc[wat_levID['USGS_siteno']=='473425095052603'])
display(levID.loc[levID['USGS_siteno']=='473425095052603'])
print(levID.loc[levID['USGS_siteno']=='473425095052603'].shape)

Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,...,lev_src_cd,lev_meth_cd,lev_age_cd,lev_status_cd_ID,Comments_WaterLevelStatus,Comments_FluidLevelMethod,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft
14788,USGS,473425095052603,GW,2012-06-20,14:46,UTC,24.0,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14790,USGS,473425095052603,GW,2013-07-15,11:38,UTC,23.84,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14792,USGS,473425095052603,GW,2014-07-15,11:14,UTC,23.14,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14794,USGS,473425095052603,GW,2015-08-05,10:04,UTC,24.43,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14796,USGS,473425095052603,GW,2017-06-14,17:04,UTC,24.04,,,1.0,...,,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14798,USGS,473425095052603,GW,2018-06-25,14:30,UTC,24.04,,,8.0,...,,O,A,8.0,A nearby site that taps the same aquifer had b...,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14800,USGS,473425095052603,GW,2019-06-18,14:44,UTC,22.82,,,8.0,...,S,O,A,8.0,A nearby site that taps the same aquifer had b...,Observed,473425095052603,2005-06-02,2024-07-16,2.51


Unnamed: 0,agency_cd,USGS_siteno,site_tp_cd,LevelDate,LevelTime,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,...,lev_src_cd,lev_meth_cd,lev_age_cd,lev_status_cd_ID,Comments_WaterLevelStatus,Comments_FluidLevelMethod,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft
14788,USGS,473425095052603,GW,2012-06-20,14:46,UTC,24.0,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14790,USGS,473425095052603,GW,2013-07-15,11:38,UTC,23.84,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14792,USGS,473425095052603,GW,2014-07-15,11:14,UTC,23.14,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14794,USGS,473425095052603,GW,2015-08-05,10:04,UTC,24.43,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14796,USGS,473425095052603,GW,2017-06-14,17:04,UTC,24.04,,,1.0,...,,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14798,USGS,473425095052603,GW,2018-06-25,14:30,UTC,24.04,,,8.0,...,,O,A,8.0,A nearby site that taps the same aquifer had b...,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14800,USGS,473425095052603,GW,2019-06-18,14:44,UTC,22.82,,,8.0,...,S,O,A,8.0,A nearby site that taps the same aquifer had b...,Observed,473425095052603,2005-06-02,2024-07-16,2.51


Unnamed: 0,USGS_siteno,LevelDate,LevelTime
10283,473425095052603,2012-06-20,14:46
10284,473425095052603,2013-07-15,11:38
10285,473425095052603,2014-07-15,11:14
10286,473425095052603,2015-08-05,10:04
10287,473425095052603,2017-06-14,17:04
10288,473425095052603,2018-06-25,14:30
10289,473425095052603,2019-06-18,14:44
29175,473425095052603,2005-06-27,12:01
29681,473425095052603,2016-07-25,16:09


(9, 3)


In [266]:
# MERGE
out_MasterWaterOilLevels = pd.merge(levID, 
                                    tblSites[["AgencyCode","USGS_siteno","LocalSiteName","XcoordUTMNAD83_m","YcoordUTMNAD83_m","OnNWT","NWTPosition_m"]],
                                    how='left')

out_MasterWaterOilLevels = out_MasterWaterOilLevels.rename(
    columns ={'OnNWT': 'OnNorthWellTransect', 
              'NWTPosition_m': 'DistanceFromCenterOfNorthOilBody_m'})

out_MasterWaterOilLevels

Unnamed: 0,USGS_siteno,LevelDate,LevelTime,AgencyCode,LocalSiteName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m
0,473356095043701,2019-06-28,09:56,USGS,Grant Creek 01,343765.96,5270091.98,,
1,473356095043701,2020-08-06,13:00,USGS,Grant Creek 01,343765.96,5270091.98,,
2,473356095043701,2021-08-19,11:00,USGS,Grant Creek 01,343765.96,5270091.98,,
3,473356095043701,2022-08-25,09:08,USGS,Grant Creek 01,343765.96,5270091.98,,
4,473356095043701,2023-06-22,10:30,USGS,Grant Creek 01,343765.96,5270091.98,,
...,...,...,...,...,...,...,...,...,...
25576,473425095052702,2015-08-05,10:23,USGS,9017,342764.87,5271033.20,1.0,-21.97
25577,473420095052301,2015-08-05,10:36,USGS,1429,342814.32,5270875.34,,
25578,473426095052526,2015-10-22,12:30,USGS,301A,342781.07,5271045.79,1.0,-2.06
25579,473426095052414,2019-10-31,12:00,USGS,534A,342809.72,5271045.08,1.0,23.84


In [161]:
out_MasterWaterOilLevels.loc[out_MasterWaterOilLevels['USGS_siteno']=='473425095052603']

Unnamed: 0,USGS_siteno,LevelDate,LevelTime,AgencyCode,LocalSiteName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m
10283,473425095052603,2012-06-20,19:46,USGS,501,342781.18,5271042.47,1.0,-3.31
10284,473425095052603,2013-07-15,16:38,USGS,501,342781.18,5271042.47,1.0,-3.31
10285,473425095052603,2014-07-15,16:14,USGS,501,342781.18,5271042.47,1.0,-3.31
10286,473425095052603,2015-08-05,15:04,USGS,501,342781.18,5271042.47,1.0,-3.31
10287,473425095052603,2017-06-14,22:04,USGS,501,342781.18,5271042.47,1.0,-3.31
10288,473425095052603,2018-06-25,19:30,USGS,501,342781.18,5271042.47,1.0,-3.31
10289,473425095052603,2019-06-18,19:44,USGS,501,342781.18,5271042.47,1.0,-3.31
25113,473425095052603,2018-06-25,14:30,USGS,501,342781.18,5271042.47,1.0,-3.31
25175,473425095052603,2019-06-18,14:44,USGS,501,342781.18,5271042.47,1.0,-3.31
25287,473425095052603,2017-06-14,17:04,USGS,501,342781.18,5271042.47,1.0,-3.31


In [None]:
'''
# r 390 merge info from bmj3 with master site-date-time list
out_MasterWaterOilLevels2 = pd.merge(out_MasterWaterOilLevels, bmj3_tmp, how='left', left_on='USGS_siteno', right_on='GWSI_USGS_siteno')
# check how many water table wells in tblwells
print(sum(tblWells['WaterTableWell'] == 'Y'))

# r 418: calculate water level elevations and rename columns appropriately
out_MasterWaterOilLevels2 = out_MasterWaterOilLevels2.rename(columns={'lev_va': 'WaterLevel_ftBLS'})
out_MasterWaterOilLevels2['WaterLevel_ftASL_NAVD88'] = out_MasterWaterOilLevels2['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - out_MasterWaterOilLevels2['WaterLevel_ftBLS']
out_MasterWaterOilLevels2['WaterLevel_mASL_NAVD88'] = round(out_MasterWaterOilLevels2['WaterLevel_ftASL_NAVD88'] * 0.3048, 3)

# r 425 merge with oil levels
# first convert columns to be the same data types and same date format
out_MasterWaterOilLevels2['LevelDate'] = pd.to_datetime(out_MasterWaterOilLevels2['LevelDate']).dt.strftime('%Y-%m-%d')

# r 418: calculate water level elevations and rename columns appropriately
out_MasterWaterOilLevels2 = out_MasterWaterOilLevels2.rename(columns={'lev_va': 'WaterLevel_ftBLS'})
out_MasterWaterOilLevels2['WaterLevel_ftASL_NAVD88'] = out_MasterWaterOilLevels2['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - out_MasterWaterOilLevels2['WaterLevel_ftBLS']
out_MasterWaterOilLevels2['WaterLevel_mASL_NAVD88'] = round(out_MasterWaterOilLevels2['WaterLevel_ftASL_NAVD88'] * 0.3048, 3)

print(out_MasterWaterOilLevels2.Comments_FluidLevelMethod_oil.unique())

# r 435 merge the fluid level method columns
out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['Comments_FluidLevelMethod_WL'].isna() & ~out_MasterWaterOilLevels2['Comments_FluidLevelMethod_oil'].isna(),'Comments_FluidLevelMethod_WL'] = \
 out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['Comments_FluidLevelMethod_WL'].isna() & ~out_MasterWaterOilLevels2['Comments_FluidLevelMethod_oil'].isna(),'Comments_FluidLevelMethod_oil']
print(out_MasterWaterOilLevels2.Comments_FluidLevelMethod_WL.unique())
# r 440 delete oil fulid level comments column and rename the water fluid level method column
out_MasterWaterOilLevels2['Comments_FluidLevelMethod'] = out_MasterWaterOilLevels2['Comments_FluidLevelMethod_WL']
out_MasterWaterOilLevels2.drop('Comments_FluidLevelMethod_oil', axis=1, inplace=True)
print(out_MasterWaterOilLevels2.Comments_FluidLevelMethod.unique())

# r 452 calculate oil level elevations
out_MasterWaterOilLevels2['OilLevel_ftASL_NAVD88'] = out_MasterWaterOilLevels2['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - out_MasterWaterOilLevels2['OilLevel_ftBLS']
out_MasterWaterOilLevels2['OilLevel_mASL_NAVD88'] = out_MasterWaterOilLevels2['OilLevel_ftASL_NAVD88'].mul(0.3048).round(3)
# trying new way to achieve same column...
out_MasterWaterOilLevels2['OilLevel_ftASL_NAVD88_v2'] = out_MasterWaterOilLevels2.apply(lambda x: x['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - x['OilLevel_ftBLS'], axis=1)

# r 459 calculate oil thicknesses, using the most raw measurements
out_MasterWaterOilLevels2['OilThickness_ft'] = round(out_MasterWaterOilLevels2['WaterLevel_ftBLS'] - out_MasterWaterOilLevels2['OilLevel_ftBLS'], 2)
out_MasterWaterOilLevels2['OilThickness_m'] = round(out_MasterWaterOilLevels2['OilThickness_ft'] * 0.3048, 3)
'''

In [267]:
dfwl2.loc[dfwl2['site_no']=='473425095052603']

Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,...,lev_src_cd,lev_meth_cd,lev_age_cd,lev_status_cd_ID,Comments_WaterLevelStatus,Comments_FluidLevelMethod,GWSI_USGS_siteno,GWSI_MP_BeginDate,GWSI_MP_EndDate,GWSI_MP_height_ft
14788,USGS,473425095052603,GW,2012-06-20,14:46,UTC,24.0,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14790,USGS,473425095052603,GW,2013-07-15,11:38,UTC,23.84,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14792,USGS,473425095052603,GW,2014-07-15,11:14,UTC,23.14,,,1.0,...,S,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14794,USGS,473425095052603,GW,2015-08-05,10:04,UTC,24.43,,,1.0,...,S,V,A,1.0,Water level was affected by atmospheric pressure.,Calibrated electric tape – accuracy of instrum...,473425095052603,2005-06-02,2024-07-16,2.51
14796,USGS,473425095052603,GW,2017-06-14,17:04,UTC,24.04,,,1.0,...,,O,A,1.0,Water level was affected by atmospheric pressure.,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14798,USGS,473425095052603,GW,2018-06-25,14:30,UTC,24.04,,,8.0,...,,O,A,8.0,A nearby site that taps the same aquifer had b...,Observed,473425095052603,2005-06-02,2024-07-16,2.51
14800,USGS,473425095052603,GW,2019-06-18,14:44,UTC,22.82,,,8.0,...,S,O,A,8.0,A nearby site that taps the same aquifer had b...,Observed,473425095052603,2005-06-02,2024-07-16,2.51


In [270]:
# r 390 merge info from bmj3 with master site-date-time list
out_MasterWaterOilLevels2 = pd.merge(out_MasterWaterOilLevels, bmj3_tmp, how='left', left_on='USGS_siteno', right_on='GWSI_USGS_siteno')
out_MasterWaterOilLevels2.head(3)

#then get the LocalUseCode field from tblWells (in local Access DB),
#check number of WaterTableWells in tblWells (not done algorithmically)
tblWells.keys()

#the water table well field in the database is less comprehensive than the algorithm used above. 
print(sum(tblWells['WaterTableWell'] == 'Y'))

# merge again to get tblWells local use code
out_MasterWaterOilLevels2 = pd.merge(out_MasterWaterOilLevels2, tblWells[['LocalSiteName', 'LocalUseCode']], how='left')
out_MasterWaterOilLevels2.head(2)   

# r line 409 merge to get Water levels
out_MasterWaterOilLevels2 = pd.merge(out_MasterWaterOilLevels2, dfwl2[["site_no","lev_dt","lev_tm",
                                                                       "lev_va","Comments_WaterLevelStatus", 
                                                                       "Comments_FluidLevelMethod","GWSI_MP_height_ft"]],
                                     how='left', 
                                     left_on=['USGS_siteno', "LevelDate", "LevelTime"],
                                     right_on=['site_no', "lev_dt", "lev_tm"])

# r 418: calculate water level elevations and rename columns appropriately
out_MasterWaterOilLevels2 = out_MasterWaterOilLevels2.rename(columns={'lev_va': 'WaterLevel_ftBLS'})
out_MasterWaterOilLevels2['WaterLevel_ftASL_NAVD88'] = out_MasterWaterOilLevels2['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - out_MasterWaterOilLevels2['WaterLevel_ftBLS']
out_MasterWaterOilLevels2['WaterLevel_mASL_NAVD88'] = round(out_MasterWaterOilLevels2['WaterLevel_ftASL_NAVD88'] * 0.3048, 3)

# r 425 merge with oil levels
# first convert columns to be the same data types and same date format
out_MasterWaterOilLevels2['LevelDate'] = pd.to_datetime(out_MasterWaterOilLevels2['LevelDate']).dt.strftime('%Y-%m-%d')
# MERGE
out_MasterWaterOilLevels2 = pd.merge(out_MasterWaterOilLevels2, tblOilLevels8[["USGS_siteno",
                                                                               "OilLevelDate",
                                                                               "OilLevelTime",
                                                                               "OilLevelID",
                                                                               "PersonMakingMeasurement",
                                                                               "Comments_FluidLevelMethod",
                                                                               "Oil_Tape_Serial_No",
                                                                               "Comments_OilLevel",
                                                                               "GWSI_Tape_Correction_ft",
                                                                               "GWSI_MP_height_ft",
                                                                               "OilLevel_ftBLS"]],
                                     left_on=['USGS_siteno', "LevelDate", "LevelTime"],
                                     right_on=['USGS_siteno', "OilLevelDate", "OilLevelTime"],
                                     how='left',
                                     suffixes=('_WL','_oil')
                                     )


print(out_MasterWaterOilLevels2.Comments_FluidLevelMethod_oil.unique())
# r 435 merge the fluid level method columns
out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['Comments_FluidLevelMethod_WL'].isna() & ~out_MasterWaterOilLevels2['Comments_FluidLevelMethod_oil'].isna(),'Comments_FluidLevelMethod_WL'] = \
 out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['Comments_FluidLevelMethod_WL'].isna() & ~out_MasterWaterOilLevels2['Comments_FluidLevelMethod_oil'].isna(),'Comments_FluidLevelMethod_oil']
print(out_MasterWaterOilLevels2.Comments_FluidLevelMethod_WL.unique())
# r 440 delete oil fulid level comments column and rename the water fluid level method column
out_MasterWaterOilLevels2['Comments_FluidLevelMethod'] = out_MasterWaterOilLevels2['Comments_FluidLevelMethod_WL']
out_MasterWaterOilLevels2.drop('Comments_FluidLevelMethod_oil', axis=1, inplace=True)
print(out_MasterWaterOilLevels2.Comments_FluidLevelMethod.unique())

# r 452 calculate oil level elevations
out_MasterWaterOilLevels2['OilLevel_ftASL_NAVD88'] = out_MasterWaterOilLevels2['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - out_MasterWaterOilLevels2['OilLevel_ftBLS']
out_MasterWaterOilLevels2['OilLevel_mASL_NAVD88'] = out_MasterWaterOilLevels2['OilLevel_ftASL_NAVD88'].mul(0.3048).round(3)
# trying new way to achieve same column...
out_MasterWaterOilLevels2['OilLevel_ftASL_NAVD88_v2'] = out_MasterWaterOilLevels2.apply(lambda x: x['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - x['OilLevel_ftBLS'], axis=1)

# r 459 calculate oil thicknesses, using the most raw measurements
out_MasterWaterOilLevels2['OilThickness_ft'] = round(out_MasterWaterOilLevels2['WaterLevel_ftBLS'] - out_MasterWaterOilLevels2['OilLevel_ftBLS'], 2)
out_MasterWaterOilLevels2['OilThickness_m'] = round(out_MasterWaterOilLevels2['OilThickness_ft'] * 0.3048, 3)


88
[nan 'Calibrated electric tape – accuracy of instrument has been checked'
 'Steel-tape' 'Other']
['Differential Global Positioning System. This code is especially applicable to surface expressions of groundwater.'
 'Steel-tape'
 'Calibrated electric tape – accuracy of instrument has been checked'
 'Observed' 'Electric-tape' 'Other' nan]
['Differential Global Positioning System. This code is especially applicable to surface expressions of groundwater.'
 'Steel-tape'
 'Calibrated electric tape – accuracy of instrument has been checked'
 'Observed' 'Electric-tape' 'Other' nan]


In [271]:
chk = out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['USGS_siteno']=='473425095052603']
chk[['USGS_siteno', 'WaterLevel_ftBLS']]


Unnamed: 0,USGS_siteno,WaterLevel_ftBLS
10283,473425095052603,24.0
10284,473425095052603,23.84
10285,473425095052603,23.14
10286,473425095052603,24.43
10287,473425095052603,24.04
10288,473425095052603,24.04
10289,473425095052603,22.82
25458,473425095052603,
25573,473425095052603,


## Some checks

In [272]:
# r 467 check to see that all oil level IDs are included in final table
# INSPECT
#0 rows indicates all OilLevelID s are in the final merged table
# issue could be related to example of anti join given in my Lession.ipynb file under Libraries > PANDAS > "anti-join"
pd.merge(tblOilLevels8, out_MasterWaterOilLevels2, how='outer', on='OilLevelID', indicator=True).query("_merge != 'both'")

Unnamed: 0,NWTPosition_m,LocalUseCode_x,OilLevelID,LocalSiteName_x,PersonMakingMeasurement_x,OilLevelDate_x,OilLevelTime_x,TimeDatum,OilLevel_ftBMP,OilLevelMethod,...,GWSI_Tape_Correction_ft_y,GWSI_MP_height_ft_oil,OilLevel_ftBLS_y,Comments_FluidLevelMethod_y,OilLevel_ftASL_NAVD88,OilLevel_mASL_NAVD88,OilLevel_ftASL_NAVD88_v2,OilThickness_ft,OilThickness_m,_merge
4715,,,,,,,,,,,...,,,,Differential Global Positioning System. This c...,,,,,,right_only
4716,,,,,,,,,,,...,,,,Differential Global Positioning System. This c...,,,,,,right_only
4717,,,,,,,,,,,...,,,,Differential Global Positioning System. This c...,,,,,,right_only
4718,,,,,,,,,,,...,,,,Differential Global Positioning System. This c...,,,,,,right_only
4719,,,,,,,,,,,...,,,,Differential Global Positioning System. This c...,,,,,,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25576,,,,,,,,,,,...,,,,Calibrated electric tape – accuracy of instrum...,,,,,,right_only
25577,,,,,,,,,,,...,,,,Steel-tape,,,,,,right_only
25578,,,,,,,,,,,...,,,,Calibrated electric tape – accuracy of instrum...,,,,,,right_only
25579,,,,,,,,,,,...,,,,Steel-tape,,,,,,right_only


In [273]:
# INSPECT
#r 471 check for duplicate in oil level id:
chk = out_MasterWaterOilLevels2[~out_MasterWaterOilLevels2['OilLevelID'].isna()].groupby('OilLevelID').filter(lambda x: len(x) > 1)
print(len(chk), 'rows') # if zero, then no duplicates
display(chk)

0 rows


Unnamed: 0,USGS_siteno,LevelDate,LevelTime,AgencyCode,LocalSiteName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,GWSI_USGS_siteno,...,Comments_OilLevel,GWSI_Tape_Correction_ft,GWSI_MP_height_ft_oil,OilLevel_ftBLS,Comments_FluidLevelMethod,OilLevel_ftASL_NAVD88,OilLevel_mASL_NAVD88,OilLevel_ftASL_NAVD88_v2,OilThickness_ft,OilThickness_m


In [274]:
# r 478 check for negative oil thicknesses
NegThk = out_MasterWaterOilLevels2['OilThickness_ft'] <= 0 # note: just retruns boolean mask of true or false for each row of that column
NegThk = out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['OilThickness_ft'] <= 0]
NegThk # if non exist...good
NegThk.to_csv("DataChecks/{}_NegativeOilThicknesses.csv".format(str(datetime.now().date())), index=False)

In [275]:
# INSPECT
# r 485 check to see that oil wells are classified as water table wells
# copied this check into notepad file for Andrew to look into.
WT_oilWell_chk = out_MasterWaterOilLevels2.loc[(out_MasterWaterOilLevels2['WaterTableWell'] == 'N') & (out_MasterWaterOilLevels2['LocalUseCode'] == 'wo')]\
    .groupby('LocalSiteName').size().reset_index(name='count')
display(WT_oilWell_chk)

Unnamed: 0,LocalSiteName,count
0,0501,9
1,302,154
2,421A,127
3,9014,31
4,9017,30
5,9018,1


In [276]:
# INSPECT more checks for oil thickness calcs

# number of thickness values
print(out_MasterWaterOilLevels2[~out_MasterWaterOilLevels2['OilThickness_ft'].isna()]['OilThickness_ft'].count())

# number of oil level values
print(out_MasterWaterOilLevels2[~out_MasterWaterOilLevels2['OilLevel_ftBLS'].isna()]['OilLevel_ftBLS'].count())

# number of water levels
print(out_MasterWaterOilLevels2[~out_MasterWaterOilLevels2['WaterLevel_ftBLS'].isna()]['WaterLevel_ftBLS'].count())
print(dfwl2[~dfwl2['lev_va'].isna()]['lev_va'].count())

2971
3436
24737
24737


In [277]:
# INSPECT
# check count of each column and inspect which rows have valid values
out_MasterWaterOilLevels2.count()
out_MasterWaterOilLevels2[~out_MasterWaterOilLevels2['Comments_WaterLevelStatus'].isna()] # not many rows available for this column

Unnamed: 0,USGS_siteno,LevelDate,LevelTime,AgencyCode,LocalSiteName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,GWSI_USGS_siteno,...,Comments_OilLevel,GWSI_Tape_Correction_ft,GWSI_MP_height_ft_oil,OilLevel_ftBLS,Comments_FluidLevelMethod,OilLevel_ftASL_NAVD88,OilLevel_mASL_NAVD88,OilLevel_ftASL_NAVD88_v2,OilThickness_ft,OilThickness_m
5,473358095061401,1987-05-05,12:01,USGS,426,341759.50,5270225.09,,,473358095061401,...,,,,,Steel-tape,,,,,
6,473358095061401,1989-06-01,12:01,USGS,426,341759.50,5270225.09,,,473358095061401,...,,,,,Calibrated electric tape – accuracy of instrum...,,,,,
7,473358095061401,1992-06-16,12:01,USGS,426,341759.50,5270225.09,,,473358095061401,...,,,,,Calibrated electric tape – accuracy of instrum...,,,,,
8,473404095054101,1984-10-15,12:01,USGS,409,342437.61,5270398.26,,,473404095054101,...,,,,,Steel-tape,,,,,
9,473404095054101,1984-10-31,12:01,USGS,409,342437.61,5270398.26,,,473404095054101,...,,,,,Steel-tape,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25058,473437095052401,2024-05-04,13:59,USGS,524,342831.11,5271372.45,,,473437095052401,...,,,,,Calibrated electric tape – accuracy of instrum...,,,,,
25059,473440095063001,1987-05-05,12:01,USGS,427,341455.97,5271518.32,,,473440095063001,...,,,,,Steel-tape,,,,,
25060,473440095063001,1989-06-01,12:01,USGS,427,341455.97,5271518.32,,,473440095063001,...,,,,,Calibrated electric tape – accuracy of instrum...,,,,,
25061,473440095063001,1989-10-29,12:01,USGS,427,341455.97,5271518.32,,,473440095063001,...,,,,,Steel-tape,,,,,


In [278]:
# r 521 create a dataset ID column
out_MasterWaterOilLevels2['dataset_id'] = 'ds_12'

In [279]:
# INSPECTION
display(tblOilLevels8[["LocalSiteName","OilLevelDate","OilLevelTime","ZeroThicknessCalcExclude"]].dtypes)
#out_MasterWaterOilLevels2[['LocalSiteName','LevelDate','LevelTime']].
out_MasterWaterOilLevels2.LevelDate

LocalSiteName               object
OilLevelDate                object
OilLevelTime                object
ZeroThicknessCalcExclude    object
dtype: object

0        2019-06-28
1        2020-08-06
2        2021-08-19
3        2022-08-25
4        2023-06-22
            ...    
25576    2015-08-05
25577    2015-08-05
25578    2015-10-22
25579    2019-10-31
25580    2005-07-15
Name: LevelDate, Length: 25581, dtype: object

In [280]:
# r 525
#calculate 0 oil thickness for all wells with water but not oil, except for 
#records where water was not measured (ZeroThicknessCalcExclude) column. 

#merge master file to get "excludeFromZeroThicknessCalc from tblOilLevels (tblOilLevels8 in Python)
# to merge, first we have to match data types
tblOilLevels8['OilLevelDate'] = pd.to_datetime(tblOilLevels8['OilLevelDate']).dt.strftime('%Y-%m-%d')

out_MasterWaterOilLevels3 = pd.merge(out_MasterWaterOilLevels2, tblOilLevels8[["LocalSiteName","OilLevelDate","OilLevelTime","ZeroThicknessCalcExclude"]], 
                                     left_on=['LocalSiteName','LevelDate','LevelTime'],
                                     right_on=['LocalSiteName','OilLevelDate','OilLevelTime'],
                                     how='left')
# couple checks to see if the merge worked r 535
print(out_MasterWaterOilLevels2.shape[0])
print(out_MasterWaterOilLevels3.shape[0])
print(out_MasterWaterOilLevels3[~out_MasterWaterOilLevels3['ZeroThicknessCalcExclude'].isna()].shape[0])   # these two do the same thing
print(out_MasterWaterOilLevels3[pd.notna(out_MasterWaterOilLevels3['ZeroThicknessCalcExclude'])].shape[0]) # these two do the same thing
print(tblOilLevels8[~tblOilLevels8['ZeroThicknessCalcExclude'].isna()].shape[0])

25581
25581
16
16
16


In [281]:
# r 542 more checks
print(out_MasterWaterOilLevels3[(out_MasterWaterOilLevels3['WaterLevel_mASL_NAVD88'] > 0) & (out_MasterWaterOilLevels3['OilLevel_ftBLS'].isna())].shape[0]) 
print(out_MasterWaterOilLevels3[(out_MasterWaterOilLevels3['WaterLevel_mASL_NAVD88'] > 0) & (out_MasterWaterOilLevels3['OilLevel_mASL_NAVD88'].isna())].shape[0])
print(out_MasterWaterOilLevels3[(out_MasterWaterOilLevels3['WaterLevel_mASL_NAVD88'] > 0) & 
                                (out_MasterWaterOilLevels3['OilLevel_mASL_NAVD88'].isna()) &
                                (out_MasterWaterOilLevels3['ZeroThicknessCalcExclude'].isna()) # same as print statement before except with this added conditional
                                ].shape[0])
# r 551 if there is water, and no oil, set oil thickness to 0
out_MasterWaterOilLevels3.loc[(out_MasterWaterOilLevels3['WaterLevel_mASL_NAVD88'] > 0) &
                              (out_MasterWaterOilLevels3['OilLevel_mASL_NAVD88'].isna()) &
                              (out_MasterWaterOilLevels3['ZeroThicknessCalcExclude'].isna()), # picks rows where these conditions are true
                              'OilThickness_m'] = 0                                           # for the OilThickness_m col and replaces it with zero
# repeat but for the ft columns (note: the R script still used the m for the conditions while changing the OilThickness_ft)
out_MasterWaterOilLevels3.loc[(out_MasterWaterOilLevels3['WaterLevel_ftASL_NAVD88'] > 0) & 
                              (out_MasterWaterOilLevels3['OilLevel_ftASL_NAVD88'].isna()) &
                              (out_MasterWaterOilLevels3['ZeroThicknessCalcExclude'].isna()), 
                              'OilThickness_ft'] = 0

out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['ZeroThicknessCalcExclude'].isna()]

21766
21766
21750


Unnamed: 0,USGS_siteno,LevelDate,LevelTime,AgencyCode,LocalSiteName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,GWSI_USGS_siteno,...,Comments_FluidLevelMethod,OilLevel_ftASL_NAVD88,OilLevel_mASL_NAVD88,OilLevel_ftASL_NAVD88_v2,OilThickness_ft,OilThickness_m,dataset_id,OilLevelDate_y,OilLevelTime_y,ZeroThicknessCalcExclude
0,473356095043701,2019-06-28,09:56,USGS,Grant Creek 01,343765.96,5270091.98,,,473356095043701,...,Differential Global Positioning System. This c...,,,,0.0,0.0,ds_12,,,
1,473356095043701,2020-08-06,13:00,USGS,Grant Creek 01,343765.96,5270091.98,,,473356095043701,...,Differential Global Positioning System. This c...,,,,0.0,0.0,ds_12,,,
2,473356095043701,2021-08-19,11:00,USGS,Grant Creek 01,343765.96,5270091.98,,,473356095043701,...,Differential Global Positioning System. This c...,,,,0.0,0.0,ds_12,,,
3,473356095043701,2022-08-25,09:08,USGS,Grant Creek 01,343765.96,5270091.98,,,473356095043701,...,Differential Global Positioning System. This c...,,,,0.0,0.0,ds_12,,,
4,473356095043701,2023-06-22,10:30,USGS,Grant Creek 01,343765.96,5270091.98,,,473356095043701,...,Differential Global Positioning System. This c...,,,,0.0,0.0,ds_12,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25576,473425095052702,2015-08-05,10:23,USGS,9017,342764.87,5271033.20,1.0,-21.97,473425095052702,...,Other,1391.07,423.998,1391.07,,,ds_12,2015-08-05,10:23,
25577,473420095052301,2015-08-05,10:36,USGS,1429,342814.32,5270875.34,,,473420095052301,...,Other,,,,,,ds_12,2015-08-05,10:36,
25578,473426095052526,2015-10-22,12:30,USGS,301A,342781.07,5271045.79,1.0,-2.06,473426095052526,...,Other,1390.87,423.937,1390.87,,,ds_12,2015-10-22,12:30,
25579,473426095052414,2019-10-31,12:00,USGS,534A,342809.72,5271045.08,1.0,23.84,473426095052414,...,Calibrated electric tape – accuracy of instrum...,,,,,,ds_12,2019-10-31,12:00,


In [282]:
# see if there are fewer NAs (note...generally there are just too many NAs)
print(out_MasterWaterOilLevels2.loc[out_MasterWaterOilLevels2['OilThickness_m'].isna()].shape[0])
print(out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_m'].isna()].shape[0])

22610
860


## create estimated water level column for rows where an oil level was measured
calculate apparent water table elevation, based on Geoff's reference in SIR,

.856 is the specific gravity of crude oil.

In [283]:
# r line 569 populate Est water level in m with water levels from water level_m column for records with a water level but oil thickness = NA
out_MasterWaterOilLevels3.loc[(~out_MasterWaterOilLevels3['WaterLevel_mASL_NAVD88'].isna()) & 
                              (out_MasterWaterOilLevels3['OilThickness_m'].isna()),
                              'EstWaterLevel_mASL_NAV88'] = \
                              out_MasterWaterOilLevels3.loc[(~out_MasterWaterOilLevels3['WaterLevel_mASL_NAVD88'].isna()) & 
                                                            (out_MasterWaterOilLevels3['OilThickness_m'].isna()), 
                                                            'WaterLevel_mASL_NAVD88']
# r line 573 populate Est water level in ft with water levels from water level, ft column for records with a water level but oil thickness = NA
out_MasterWaterOilLevels3.loc[(~out_MasterWaterOilLevels3['WaterLevel_ftASL_NAVD88'].isna()) & 
                              (out_MasterWaterOilLevels3['OilThickness_ft'].isna()),
                              'EstWaterLevel_ftASL_NAV88'] = \
                              out_MasterWaterOilLevels3.loc[(~out_MasterWaterOilLevels3['WaterLevel_ftASL_NAVD88'].isna()) & 
                                                            (out_MasterWaterOilLevels3['OilThickness_ft'].isna()), 
                                                            'WaterLevel_ftASL_NAVD88']
# r 578 populate est water level in m column with water level+ oil_thickness*0.855 (Lundy, 2015) for records where 
# a water level and oil thickness is present; 0 oil thickness should produce the 
# an estimated water level identical to the original water level data. 
out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_m'].notna(), 'EstWaterLevel_mASL_NAVD88'] = \
round(out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_m'].notna(), 'WaterLevel_mASL_NAVD88'] +\
      out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_m'].notna(), 'OilThickness_m']*0.855, 3)
# r 581 populate est water level in ft column with water level+ oil_thickness*0.855 for records in ft where a 
# water level and oil thickness is present; 0 oil thickness should produce the 
# an estimated water level identical to the original water level data. 
out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_ft'].notna(), 'EstWaterLevel_ftASL_NAVD88'] = \
round(out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_ft'].notna(), 'WaterLevel_ftASL_NAVD88'] +\
      out_MasterWaterOilLevels3.loc[out_MasterWaterOilLevels3['OilThickness_ft'].notna(), 'OilThickness_ft']*0.855, 3)
# r 586 replace NA in Comments_WaterLevelStatus column with "static" for appropriate conditions
out_MasterWaterOilLevels3.loc[(out_MasterWaterOilLevels3['WaterLevel_ftBLS'].notna()) & 
                              (out_MasterWaterOilLevels3['Comments_WaterLevelStatus'].isna()),
                              'Comments_WaterLevelStatus'] = 'static'

In [284]:
out_MasterWaterOilLevels3[['USGS_siteno','LocalSiteName','Comments_WaterLevelStatus']]

Unnamed: 0,USGS_siteno,LocalSiteName,Comments_WaterLevelStatus
0,473356095043701,Grant Creek 01,static
1,473356095043701,Grant Creek 01,static
2,473356095043701,Grant Creek 01,static
3,473356095043701,Grant Creek 01,static
4,473356095043701,Grant Creek 01,static
...,...,...,...
25576,473425095052702,9017,
25577,473420095052301,1429,
25578,473426095052526,301A,
25579,473426095052414,534A,


# Export to CSV

In [301]:
# r 590 reorder the columns for the final output
out_MasterWaterOilLevels3 = out_MasterWaterOilLevels3.loc[:, ['dataset_id',
                                                              'AgencyCode',
                                                              "USGS_siteno",
                                                              "LocalSiteName",
                                                              "LocalUseCode", 
                                                              "WaterTableWell",                       
                                                              "XcoordUTMNAD83_m",
                                                              "YcoordUTMNAD83_m",                      
                                                              "OnNorthWellTransect",                  
                                                              "DistanceFromCenterOfNorthOilBody_m",    
                                                              "GWSI_LandSurfaceAltitude_ftASL_NAVD88",
                                                              "GWSI_LandSurfaceAltitude_mASL_NAVD88",
                                                              "MidOfScreenDepth_ft",
                                                              "MidOfScreenDepth_m",
                                                              "MidOfScreenElevation_ftASL_NAVD88",#worked
                                                              "MidOfScreenElevation_mASL_NAVD88",
                                                              "LevelDate", 
                                                              "LevelTime",
                                                              "WaterLevel_ftBLS",
                                                              "Comments_WaterLevelStatus",
                                                              "Comments_FluidLevelMethod",
                                                              "GWSI_MP_height_ft_WL",
                                                              "WaterLevel_ftASL_NAVD88",
                                                              "WaterLevel_mASL_NAVD88",
                                                              "OilLevelID",                           
                                                              "PersonMakingMeasurement", #worked
                                                              "Oil_Tape_Serial_No",
                                                              "GWSI_Tape_Correction_ft",
                                                              "GWSI_MP_height_ft_oil",
                                                              "OilLevel_ftBLS",
                                                              "OilLevel_ftASL_NAVD88",
                                                              "OilLevel_mASL_NAVD88",
                                                              "OilThickness_ft",
                                                              "OilThickness_m",
                                                              "EstWaterLevel_ftASL_NAVD88",
                                                              "EstWaterLevel_mASL_NAVD88",
                                                              "Comments_OilLevel",
                                                              "ZeroThicknessCalcExclude"]]

In [302]:
# save the output
out_MasterWaterOilLevels3.to_csv("outputs/{}_out_MasterOilLevels3.csv".format(str(datetime.now().date())), index=False)
print('Saved path:', "outputs/{}_out_MasterOilLevels3.csv".format(str(datetime.now().date())))

OSError: Cannot save file into a non-existent directory: 'outputs'

In [289]:
# r 638 export master file for data release, just select a subset of columns.
# the site info data release will have all of the location and screen information
# add an "x" prefix to the local site name column
out_MasterWaterOilLevels_DataRelease = out_MasterWaterOilLevels3.loc[:,['dataset_id',
                                                                        "AgencyCode",
                                                                        "USGS_siteno",
                                                                        "LocalSiteName",
                                                                        "LevelDate", 
                                                                        "LevelTime",
                                                                        "WaterLevel_ftBLS",
                                                                        "Comments_WaterLevelStatus",
                                                                        "Comments_FluidLevelMethod",
                                                                        "WaterLevel_ftASL_NAVD88",
                                                                        "WaterLevel_mASL_NAVD88",
                                                                        "OilLevelID",                          
                                                                        "OilLevel_ftBLS",
                                                                        "OilLevel_ftASL_NAVD88",
                                                                        "OilLevel_mASL_NAVD88",
                                                                        "OilThickness_ft",
                                                                        "OilThickness_m",
                                                                        "EstWaterLevel_ftASL_NAVD88",
                                                                        "EstWaterLevel_mASL_NAVD88",
                                                                        "Comments_OilLevel"]]
out_MasterWaterOilLevels_DataRelease['LocalSiteName'] = 'x' + out_MasterWaterOilLevels_DataRelease['LocalSiteName'].astype(str)
out_MasterWaterOilLevels_DataRelease['USGS_siteno'] = 'x' + out_MasterWaterOilLevels_DataRelease['USGS_siteno'].astype(str)


In [None]:
# save the output
out_MasterWaterOilLevels_DataRelease.to_csv("outputs/{}_tblds_12_WaterAndOilLevels.csv".format(str(datetime.now().date())), index=False)
print('Saved path:', "outputs/{}_tblds_12_WaterAndOilLevels.csv".format(str(datetime.now().date())))

# Extra code

In [None]:
# Example code
bmjmp2['GWSI_USGS_siteno'] = bmjmp2['GWSI_USGS_siteno'].astype(str)  # convert column to strings
tblOilLevels7.loc[:, ['OilLevelDate','GWSI_MP_BeginDate','GWSI_MP_EndDate']].sample(3) # inspect specific columns for random rows

# Good example merge

tblOilLevels7 = pd.merge(tblOilLevels6, bmjmp2.loc[:, ['GWSI_USGS_siteno', 'GWSI_MP_BeginDate', 'GWSI_MP_EndDate', 'GWSI_MP_height_ft']], how='left', left_on='USGS_siteno', right_on='GWSI_USGS_siteno')

'''
dfc = pd.merge(dfa, dfb, how='left', left_on='Col', right_on='Col')

# or select specific columns within dfb to merge into dfa using loc

dfb.loc[:, ['Col1', 'Col2']]

'''

## R vs Python

In [3]:
print(os.getcwd())

c:\Users\bmilinic\OneDrive - DOI\Documents\Python\bemidji


In [6]:
# Easily read in py output
file_path = r"P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels\outputs\2024-07-11_tblds_12_WaterAndOilLevels.csv"
dfpy = pd.read_csv(file_path)

In [29]:
# Read in R output after investigating the encoding

import chardet
file_path = r"P:\0083\analysis\DataCompilation\DataCompilationR\CreateMasterWaterOilLevelDataSet\outputs\2021-04-27_tblds_12_WaterAndOilLevels.csv"
file_path = r"P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels\outputs\EXAMPLE_tblds_12_WaterAndOilLevels.csv"

# (r)ead a file's (b)inary
file = open(file_path, 'rb')
# store as raw data
raw_data = file.read()
# close the file
file.close()
# check encoding with chardet
result = chardet.detect(raw_data)
print("Result: ", result)
encoding = result['encoding']
print(f"Detected potential encoding: {encoding}")

# use the encoding to read the csv
dfr = pd.read_csv(file_path, encoding=encoding)

# if reading in the EXAMPLE file skip the first two rows 
if file_path == r"P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels\outputs\EXAMPLE_tblds_12_WaterAndOilLevels.csv":
    print('Skipping first two rows for EXAMPLE data')
    dfr = pd.read_csv(file_path, encoding=encoding, skiprows=2)

Result:  {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
Detected potential encoding: Windows-1252
Skipping first two rows for EXAMPLE data


In [290]:
dfpy = out_MasterWaterOilLevels_DataRelease.copy()

In [291]:
dfpy['origin'] = 'python'
dfr['origin'] = 'R'

# update r column to match python
#dfr['LocalSiteName'] = 'x' + dfr['LocalSiteName'].astype(str)

In [292]:
dfcompare = pd.concat([dfpy, dfr], axis=0)
dfcompare.keys()

Index(['dataset_id', 'AgencyCode', 'USGS_siteno', 'LocalSiteName', 'LevelDate',
       'LevelTime', 'WaterLevel_ftBLS', 'Comments_WaterLevelStatus',
       'Comments_FluidLevelMethod', 'WaterLevel_ftASL_NAVD88',
       'WaterLevel_mASL_NAVD88', 'OilLevelID', 'OilLevel_ftBLS',
       'OilLevel_ftASL_NAVD88', 'OilLevel_mASL_NAVD88', 'OilThickness_ft',
       'OilThickness_m', 'EstWaterLevel_ftASL_NAVD88',
       'EstWaterLevel_mASL_NAVD88', 'Comments_OilLevel', 'origin'],
      dtype='object')

In [293]:
dfcompare.sort_values(by=['LocalSiteName', 'LevelTime','WaterLevel_ftBLS'], inplace=True)
dfcompare[['LocalSiteName', 'LevelTime','WaterLevel_ftBLS', 'origin']]

Unnamed: 0,LocalSiteName,LevelTime,WaterLevel_ftBLS,origin
10286,x0501,10:04,24.43,python
9791,x0501,10:04,24.43,R
10285,x0501,11:14,23.14,python
9790,x0501,11:14,23.14,R
10284,x0501,11:38,23.84,python
...,...,...,...,...
24436,xWL04,10:20,0.18,python
24434,xWL04,13:00,0.31,python
24435,xWL04,13:16,-0.31,python
23111,xWL04,17:00,0.00,R


In [295]:
dfcompare.sort_values(by=['LocalSiteName','WaterLevel_ftBLS'], inplace=True)
dfcompare[['LocalSiteName', 'LevelTime','WaterLevel_ftBLS', 'OilLevel_ftBLS', 'origin']]

Unnamed: 0,LocalSiteName,LevelTime,WaterLevel_ftBLS,OilLevel_ftBLS,origin
10289,x0501,14:44,22.82,18.99,python
9794,x0501,14:44,22.82,18.99,R
10285,x0501,11:14,23.14,19.27,python
9790,x0501,11:14,23.14,19.27,R
10284,x0501,11:38,23.84,19.90,python
...,...,...,...,...,...
24435,xWL04,13:16,-0.31,,python
23111,xWL04,17:00,0.00,,R
24433,xWL04,17:34,0.00,,python
24436,xWL04,10:20,0.18,,python


In [313]:
dfcompare.sort_values(by=['LocalSiteName','LevelTime', 'WaterLevel_ftBLS'], inplace=True)
dfcompare[['LocalSiteName', 'LevelDate','LevelTime','WaterLevel_ftBLS', 'OilLevel_ftBLS', 'origin']].head(20)

Unnamed: 0,LocalSiteName,LevelDate,LevelTime,WaterLevel_ftBLS,OilLevel_ftBLS,origin
10286,x0501,2015-08-05,10:04,24.43,20.55,python
9791,x0501,8/5/2015,10:04,24.43,20.55,R
10285,x0501,2014-07-15,11:14,23.14,19.27,python
9790,x0501,7/15/2014,11:14,23.14,19.27,R
10284,x0501,2013-07-15,11:38,23.84,19.9,python
9789,x0501,7/15/2013,11:38,23.84,19.9,R
25458,x0501,2005-06-27,12:01,,,python
23971,x0501,6/27/2005,12:01,,,R
10288,x0501,2018-06-25,14:30,24.04,20.19,python
9793,x0501,6/25/2018,14:30,24.04,20.19,R


In [316]:
chk = dfcompare[['origin', 'LocalSiteName', 'LevelDate', 'LevelTime', 'WaterLevel_ftBLS', 'OilLevel_ftBLS', 'OilThickness_ft']].iloc[0:1000].copy()

In [317]:
chk

Unnamed: 0,origin,LocalSiteName,LevelDate,LevelTime,WaterLevel_ftBLS,OilLevel_ftBLS,OilThickness_ft
10286,python,x0501,2015-08-05,10:04,24.43,20.55,3.88
9791,R,x0501,8/5/2015,10:04,24.43,20.55,3.88
10285,python,x0501,2014-07-15,11:14,23.14,19.27,3.87
9790,R,x0501,7/15/2014,11:14,23.14,19.27,3.87
10284,python,x0501,2013-07-15,11:38,23.84,19.90,3.94
...,...,...,...,...,...,...,...
13954,R,x301A,10/23/2013,11:00,,22.46,
23986,R,x301A,10/23/2014,11:00,,22.13,
25511,python,x301A,2013-01-09,11:00,,22.71,
25545,python,x301A,2013-10-23,11:00,,22.46,


In [35]:
# save the output
dfcompare.to_csv(r'P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels\outputs\tests\compare.csv')

### Mutli-compare

In [95]:
# compare 3 dfs
# Easily read in py output
file_path = r"P:\0083\analysis\DataCompilation\DataCompilationPy\create_master_oil_levels\outputs\2024-07-02_tblds_12_WaterAndOilLevels.csv"
dfpy_updated = pd.read_csv(file_path)

In [96]:
dfpy_updated['origin'] = 'python_updated'

In [97]:
dfcompare = pd.concat([dfpy, dfpy_updated, dfr], axis=0)
dfcompare.keys()

Index(['dataset_id', 'AgencyCode', 'USGS_siteno', 'LocalSiteName', 'LevelDate',
       'LevelTime', 'WaterLevel_ftBLS', 'Comments_WaterLevelStatus',
       'Comments_FluidLevelMethod', 'WaterLevel_ftASL_NAVD88',
       'WaterLevel_mASL_NAVD88', 'OilLevelID', 'OilLevel_ftBLS',
       'OilLevel_ftASL_NAVD88', 'OilLevel_mASL_NAVD88', 'OilThickness_ft',
       'OilThickness_m', 'EstWaterLevel_ftASL_NAVD88',
       'EstWaterLevel_mASL_NAVD88', 'Comments_OilLevel', 'origin'],
      dtype='object')

In [98]:
dfcompare.sort_values(by=['LocalSiteName', 'LevelTime','WaterLevel_ftBLS'], inplace=True)

In [99]:
dfcompare

Unnamed: 0,dataset_id,AgencyCode,USGS_siteno,LocalSiteName,LevelDate,LevelTime,WaterLevel_ftBLS,Comments_WaterLevelStatus,Comments_FluidLevelMethod,WaterLevel_ftASL_NAVD88,...,OilLevelID,OilLevel_ftBLS,OilLevel_ftASL_NAVD88,OilLevel_mASL_NAVD88,OilThickness_ft,OilThickness_m,EstWaterLevel_ftASL_NAVD88,EstWaterLevel_mASL_NAVD88,Comments_OilLevel,origin
25867,ds_12,USGS,x473425095052603,x0501,2015-08-05,10:04,,,Other,,...,19930.0,20.55,1391.28,424.062,,,,,Oil is likely cut off from screen. Extremly da...,python
26321,ds_12,USGS,x473425095052603,x0501,2015-08-05,10:04,,,Other,,...,19930.0,20.55,1391.28,424.062,,,,,Oil is likely cut off from screen. Extremly da...,python_updated
25076,ds_12,USGS,473425095052603,x0501,2015-08-05,10:04,,,Other,,...,19930.0,20.55,1391.28,424.062,,,,,Oil is likely cut off from screen. Extremly da...,R
25712,ds_12,USGS,x473425095052603,x0501,2014-07-15,11:14,,,Other,,...,19811.0,19.27,1392.56,424.452,,,,,"Oil is likely trapped in well, and not connect...",python
26166,ds_12,USGS,x473425095052603,x0501,2014-07-15,11:14,,,Other,,...,19811.0,19.27,1392.56,424.452,,,,,"Oil is likely trapped in well, and not connect...",python_updated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24434,ds_12,USGS,x473431095052801,xWL04,2020-08-06,18:00,0.31,static,Differential Global Positioning System. This c...,1392.42,...,,,,,0.0,0.0,1392.42,424.410,,python_updated
23990,ds_12,USGS,x473431095052801,xWL04,2022-08-25,18:16,-0.31,static,Differential Global Positioning System. This c...,1393.04,...,,,,,0.0,0.0,1393.04,424.599,,python
24435,ds_12,USGS,x473431095052801,xWL04,2022-08-25,18:16,-0.31,static,Differential Global Positioning System. This c...,1393.04,...,,,,,0.0,0.0,1393.04,424.599,,python_updated
23988,ds_12,USGS,x473431095052801,xWL04,2019-06-27,22:34,0.00,static,Differential Global Positioning System. This c...,1392.73,...,,,,,0.0,0.0,1392.73,424.504,,python


: 