Make sure to have sqlalchemy-access installed for uploading back into the database

Only needs to be done once! 

You can't be connected to VPN due to encryption issues. Install using Anaconda prompt when disconnected from VPN

In [None]:
# only needed if you want to reupload to Microsoft Access database
#pip install sqlalchemy-access

# Import Libraries

In [7]:
import pandas as pd #for creating dataframe
import pyodbc #working with ODBC databases
import numpy as np # for locating values in dataframes
from datetime import datetime # for obtaining today's date
import os #for working with directories

In [8]:
#optional- set dfs to show actual numbers rather than scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x) 

# Manage directories

In [9]:
# my default directory is c:\Users\bmilinic\OneDrive - DOI\Documents\Python\bemidji
defaultdirectory = os.getcwd()
print(defaultdirectory)

P:\0083\analysis\DataCompilation\DataCompilationPy\create_site_info_files


In [10]:
# connect to the shared drive which holds the databases and files 
os.chdir('P:/0083/analysis/DataCompilation/DataCompilationPy/create_site_info_files')

# Import data

In [12]:
# from GWSI Python output 
dfbmj3 = pd.read_csv(r'data_inputs/gwsi_old/bmj3_fromPy.csv')
dfrmk = pd.read_csv(r'data_inputs/gwsi_old/bmj_rmk_fromPy.csv')
# from Python outout aquarius and MLR 
dfaq = pd.read_csv(r'data_inputs/aquarius/Referencepoints_updatedMP_fromPy.csv') 
dfmlr = pd.read_csv(r'data_inputs/MLR/MLR_fromPy.csv')

# Import data from Microsoft Access Using PYODBC
Gfe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_fe.accdb;')
c_fe = Gfe_db.cursor()
# tables
tblSites = pd.read_sql('select * from tblSites', Gfe_db)
tblWells = pd.read_sql('select * from tblWells', Gfe_db)
tblCores = pd.read_sql('select * from tblCores', Gfe_db)
tblOE = pd.read_sql('select * from tblOtherEquipment', Gfe_db)
# cd tables
tblcd_LocalUseCode = pd.read_sql("select * from tblcd_LocalUseCode", Gfe_db)  
tblcd_CasingMaterial = pd.read_sql("select * from tblcd_CasingMaterial", Gfe_db)   
tblcd_ScreenMaterial = pd.read_sql("select * from tblcd_ScreenMaterial", Gfe_db)   
tblcd_OpeningType = pd.read_sql("select * from tblcd_OpeningType", Gfe_db)  



## Merge tbleSites (local access) with GWSI (retrieved) to update the USGS station names

In [142]:
# merge tblSites and dfbmj3: match up datatypes of the columns that the dfs will be merged on
tblSites2 = tblSites
tblSites2['USGS_siteno'] = tblSites2['USGS_siteno'].fillna(0).astype('int64')
dfcomplete = pd.merge(tblSites2, dfbmj3, left_on='USGS_siteno', right_on='GWSI_USGS_siteno', how='left')
# turn zeros back into NaN
dfcomplete['USGS_siteno'].replace(0, np.nan, inplace=True) 
print(dfcomplete.shape)
# Merge tbl wells
dfcomplete = pd.merge(dfcomplete, tblWells, on='LocalSiteName', how='left')
print(dfcomplete.shape)
# Merge tblOE
dfcomplete = pd.merge(dfcomplete, tblOE, on='LocalSiteName', how='left')
print(dfcomplete.shape)
# Merge tblCores
dfcomplete = pd.merge(dfcomplete, tblCores, on='LocalSiteName', how='left')
print(dfcomplete.shape)
# Merge dfmlr
dfcomplete = pd.merge(dfcomplete, dfmlr, left_on='USGS_siteno', right_on='site_no', how='left')
print(dfcomplete.shape)
# Merge dfaq
dfcomplete = pd.merge(dfcomplete, dfaq, left_on='USGS_siteno', right_on='site', how='left')
print(dfcomplete.shape)
# Local Use Codes
dfcomplete = pd.merge(dfcomplete, tblcd_LocalUseCode[['LocalUseCode','Comments_UseCode']], how='left') #if you leave out the "how" (like in the original) then # of rows plummits
dfcomplete = pd.merge(dfcomplete, tblcd_OpeningType[["TypeOfOpenInterval", "comments_OpeningType"]], left_on='GWSI_TypeOfOpenInterval', right_on='TypeOfOpenInterval', how='left')
dfcomplete = pd.merge(dfcomplete, tblcd_CasingMaterial[["CasingMaterial", "Comments_CasingMaterial"]], left_on='GWSI_CasingMaterial', right_on='CasingMaterial', how='left')
dfcomplete = pd.merge(dfcomplete, tblcd_ScreenMaterial[["ScreenMaterial", "Comments_ScreenMaterial"]], left_on='GWSI_ScreenMaterialType', right_on='ScreenMaterial', how='left')
print(dfcomplete.shape)


(1729, 123)
(1731, 216)
(1741, 248)
(2474, 289)
(2474, 331)
(2474, 347)


  dfcomplete = pd.merge(dfcomplete, tblCores, on='LocalSiteName', how='left')


(2474, 354)


In [143]:
# calculate additional columns for GWSI
dfcomplete['GWSI_MeasuringPointElevation_ftASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] + dfcomplete['GWSI_MP_height_ft']
dfcomplete['MeasuringPointElevation_mASL_NAVD88'] = dfcomplete['GWSI_MeasuringPointElevation_ftASL_NAVD88'].mul(0.3048).round(3) #renamed for Wells

dfcomplete['GWSI_MeasuringPointHeight_m'] = dfcomplete['GWSI_MP_height_ft'].mul(0.3048).round(3)

dfcomplete['LandSurfaceAltitude_mASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'].mul(0.3048).round(3) #renamed for Wells

dfcomplete['GWSI_TopOfScreenElevation_ftASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - dfcomplete['GWSI_TopOfScreenDepth_ftBLS']
dfcomplete['TopOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_TopOfScreenElevation_ftASL_NAVD88'].mul(0.3048).round(3)

dfcomplete['GWSI_BottomOfScreenElevation_ftASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - dfcomplete['GWSI_BottomOfScreenDepth_ftBLS']
dfcomplete['BottomOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_BottomOfScreenElevation_ftASL_NAVD88'].mul(0.3048).round(3)

dfcomplete['GWSI_MidOfScreenElevation_mASL_NAVD88'] = dfcomplete['TopOfScreenElevation_mASL_NAVD88'] + dfcomplete['BottomOfScreenElevation_mASL_NAVD88']
dfcomplete['MidOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_MidOfScreenElevation_mASL_NAVD88'].div(2).round(3)

dfcomplete['TotalWellDepth_mBLS'] = dfcomplete['GWSI_TotalWellDepth_ftBLS'].mul(0.3048).round(3) #renamed for Wells

dfcomplete['DiameterOfDrillHole_cm'] = dfcomplete['GWSI_DiameterOfDrillHole_inches'].mul(2.54).round(1) #renamed for Wells

dfcomplete['WellCasingInnerDiameter_cm'] = dfcomplete['GWSI_WellCasingInnerDiameter_inches'].mul(2.54).round(1) #renamed for Wells

dfcomplete['WidthOfOpeningsInOpenInterval_cm'] = dfcomplete['GWSI_WidthOfOpeningsInOpenInterval_inches'].mul(2.54).round(3) #renamed for Wells

dfcomplete['GWSI_ScreenLength_ft'] = dfcomplete['GWSI_BottomOfScreenDepth_ftBLS'] - dfcomplete['GWSI_TopOfScreenDepth_ftBLS']
dfcomplete['ScreenLength_m'] = dfcomplete['GWSI_ScreenLength_ft'].mul(.3048).round(3) #renamed for Wells

dfcomplete['well_MP_height_m'] = dfcomplete['GWSI_MP_height_ft'].mul(.3048).round(3) #renamed for Wells

In [None]:
# create indicator columns in tblSites for which sites exist in each of site type tables (wells, cores, other equipment)
tblSites2['WellSite'] = np.where(tblSites2['LocalSiteName'].isin(tblWells['LocalSiteName']), 1, 0) # inserts 1 if true and 0 if false
tblSites2['CoreSite'] = np.where(tblSites2['LocalSiteName'].isin(tblCores['LocalSiteName']), 1, 0)
tblSites2['OtherEquipmentSite'] = np.where(tblSites2['LocalSiteName'].isin(tblOE['LocalSiteName']), 1, 0)

In [111]:
tblWells.shape

(615, 94)

In [114]:
dfcomplete.loc[dfcomplete['LocalSiteName'] == '518G-06'].to_csv('deletemeplease.csv')

In [123]:
tblCores['LocalSiteName'].value_counts()


1408     17
1605     16
1305     16
1308     15
1902     15
         ..
9318      1
9319      1
9320A     1
9320B     1
1603      1
Name: LocalSiteName, Length: 736, dtype: int64

In [104]:
dfcomplete['LocalSiteName'].value_counts()

518G-06     2
9103G-05    2
15-N6       1
9109        1
9108        1
           ..
1112        1
1111        1
1110        1
1109        1
1706        1
Name: LocalSiteName, Length: 1729, dtype: int64

In [78]:
boolean = dfcomplete.duplicated(subset=['LocalSiteName']).any()
print(boolean)

True


In [None]:
dfcomplete[['Wellcon_SiteRecordID_x','Wellcon_SiteRecordID_y']].sample(5)

In [68]:
dfcomplete.LocalSiteName

0          15-N6
1          15-N7
2          15-N8
3          15-N9
4         15-N10
          ...   
1736        1713
1737    1711G-01
1738      WT1712
1739        1704
1740        1706
Name: LocalSiteName, Length: 1741, dtype: object

# Merge all the rest of the data


# OLD 

# Import data from Microsoft Access Using PYODBC

In [12]:
# connect to databases
Gbe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_be.accdb;')
Gfe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_fe.accdb;')
# create cursor instances for copying/editing databases (not needed if only downloading data)
c_be = Gbe_db.cursor()
c_fe = Gfe_db.cursor()

In [13]:
tblSites = pd.read_sql('select * from tblSites', Gfe_db)
tblWells = pd.read_sql('select * from tblWells', Gfe_db)
tblCores = pd.read_sql('select * from tblCores', Gfe_db)
tblOE = pd.read_sql('select * from tblOtherEquipment', Gfe_db)



In [15]:
tblcd_LocalUseCode = pd.read_sql("select * from tblcd_LocalUseCode", Gbe_db)  
tblcd_CasingMaterial = pd.read_sql("select * from tblcd_CasingMaterial", Gbe_db)   
tblcd_ScreenMaterial = pd.read_sql("select * from tblcd_ScreenMaterial", Gbe_db)   
tblcd_OpeningType = pd.read_sql("select * from tblcd_OpeningType", Gbe_db)  



# dfbmj3

In [6]:
# calculate additional columns
dfbmj3['GWSI_MeasuringPointElevation_ftASL_NAVD88'] = dfbmj3['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] + dfbmj3['GWSI_MP_height_ft']
dfbmj3['MeasuringPointElevation_mASL_NAVD88'] = dfbmj3['GWSI_MeasuringPointElevation_ftASL_NAVD88'].mul(0.3048).round(3) #renamed for Wells

dfbmj3['GWSI_MeasuringPointHeight_m'] = dfbmj3['GWSI_MP_height_ft'].mul(0.3048).round(3)

dfbmj3['LandSurfaceAltitude_mASL_NAVD88'] = dfbmj3['GWSI_LandSurfaceAltitude_ftASL_NAVD88'].mul(0.3048).round(3) #renamed for Wells

dfbmj3['GWSI_TopOfScreenElevation_ftASL_NAVD88'] = dfbmj3['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - dfbmj3['GWSI_TopOfScreenDepth_ftBLS']
dfbmj3['TopOfScreenElevation_mASL_NAVD88'] = dfbmj3['GWSI_TopOfScreenElevation_ftASL_NAVD88'].mul(0.3048).round(3)

dfbmj3['GWSI_BottomOfScreenElevation_ftASL_NAVD88'] = dfbmj3['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - dfbmj3['GWSI_BottomOfScreenDepth_ftBLS']
dfbmj3['BottomOfScreenElevation_mASL_NAVD88'] = dfbmj3['GWSI_BottomOfScreenElevation_ftASL_NAVD88'].mul(0.3048).round(3)

dfbmj3['GWSI_MidOfScreenElevation_mASL_NAVD88'] = dfbmj3['TopOfScreenElevation_mASL_NAVD88'] + dfbmj3['BottomOfScreenElevation_mASL_NAVD88']
dfbmj3['MidOfScreenElevation_mASL_NAVD88'] = dfbmj3['GWSI_MidOfScreenElevation_mASL_NAVD88'].div(2).round(3)

dfbmj3['TotalWellDepth_mBLS'] = dfbmj3['GWSI_TotalWellDepth_ftBLS'].mul(0.3048).round(3) #renamed for Wells

dfbmj3['DiameterOfDrillHole_cm'] = dfbmj3['GWSI_DiameterOfDrillHole_inches'].mul(2.54).round(1) #renamed for Wells

dfbmj3['WellCasingInnerDiameter_cm'] = dfbmj3['GWSI_WellCasingInnerDiameter_inches'].mul(2.54).round(1) #renamed for Wells

dfbmj3['WidthOfOpeningsInOpenInterval_cm'] = dfbmj3['GWSI_WidthOfOpeningsInOpenInterval_inches'].mul(2.54).round(3) #renamed for Wells

dfbmj3['GWSI_ScreenLength_ft'] = dfbmj3['GWSI_BottomOfScreenDepth_ftBLS'] - dfbmj3['GWSI_TopOfScreenDepth_ftBLS']
dfbmj3['ScreenLength_m'] = dfbmj3['GWSI_ScreenLength_ft'].mul(.3048).round(3) #renamed for Wells

dfbmj3['well_MP_height_m'] = dfbmj3['GWSI_MP_height_ft'].mul(.3048).round(3) #renamed for Wells

NameError: name 'dfbmj3' is not defined

# tblSites

In [213]:
# copy and pasted the list of available columns from tblSites2.keys()
# select which columns to keep
tblSites2 = tblSites[['AgencyCode', 'Wellcon_SiteRecordID', 'DatabasePointType',
       'USGS_siteno', 'USGS_StationName', 'LocalSiteName', 'XcoordUTMNAD83_m',
       'YcoordUTMNAD83_m', 'LandSurfaceAltitude_ftASL_NAVD88',
       'LandSurfaceAltitude_mASL_NAVD88', 'AgencyUse', 'Comments_GWSISite',
       'DateOfConstruction', 'Comments_DateOfConstruction', 'VarianceNumber',
       'TotalBoring/DrillingDepth_ftBLS', 'VarianceGranted',
       'SourceOfDepthData', 'NameOfContractor', 'DrillerName',
       'StartingDepthOfHole_ftBLS', 'DiameterOfDrillHole_inches',
       'PropertyOwnerName', 'PropertyOwnerAddress',
       'HorizontalCoordinateSource', 'SiteRecordNumber', 'NWTPosition_m',
       'Loc/MiscCom', 'PlotMaps', 'LogCode', 'SurveyNotes',
       'SiteEstablishedForWhom', 'OilSmell', 'SiteVerticalSource',
       'SiteActiveStatus', 'XLocal_m_FromArc', 'YLocal_m_FromArc', 'OnNWT',
       'Comments_Status', 'ApproxRemovalDate', 'StudySite']]
# create indicator columns in tblSites for which sites exist in each of site type tables (wells, cores, other equipment)
tblSites2['WellSite'] = np.where(tblSites2['LocalSiteName'].isin(tblWells['LocalSiteName']), 1, 0) # inserts 1 if true and 0 if false
tblSites2['CoreSite'] = np.where(tblSites2['LocalSiteName'].isin(tblCores['LocalSiteName']), 1, 0)
tblSites2['OtherEquipmentSite'] = np.where(tblSites2['LocalSiteName'].isin(tblOE['LocalSiteName']), 1, 0)

# convert from ft to m
tblSites2["TotalBoring/DrillingDepth_mBLS"] = tblSites2["TotalBoring/DrillingDepth_ftBLS"].mul(0.3048).round(3)

# rename some columns
tblSites2 = tblSites2.rename(columns={'OnNWT':'OnNorthWellTransect',
                                      'NWTPosition_m':'DistanceFromCenterOfNorthOilBody_m',
                                      'LandSurfaceAltitude_mASL_NAVD88':'LandSurfaceElevation_mASL_NAVD88',
                                      'Loc/MiscCom':'Comments_Miscellaneous'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tblSites2['WellSite'] = np.where(tblSites2['LocalSiteName'].isin(tblWells['LocalSiteName']), 1, 0) # inserts 1 if true and 0 if false
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tblSites2['CoreSite'] = np.where(tblSites2['LocalSiteName'].isin(tblCores['LocalSiteName']), 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

## Merge tbleSites (local access) with GWSI (retrieved) to update the USGS station names

In [214]:
# match up datatypes of the columns that the dfs will be merged on
tblSites2['USGS_siteno'] = tblSites2['USGS_siteno'].fillna(0).astype('int64')
tblSites3 = pd.merge(tblSites2, dfbmj3[[
                                   "GWSI_USGS_siteno",
                                   "GWSI_USGS_StationName",
                                   "GWSI_LandSurfaceAltitude_mASL_NAVD88",
                                   "GWSI_LandSurfaceAltitude_ftASL_NAVD88",
                                   "GWSI_DiameterOfDrillHole_inches",
                                   "GWSI_AgencyUse",
                                   "GWSI_Comments_GWSISite",
                                   "GWSI_TotalBoring/DrillingDepth_ftBLS",
                                   "GWSI_NameOfContractor" ,
                                   "GWSI_SourceOfDepthData" ,
                                   "GWSI_StartingDepthOfHole_ftBLS",
                                   "GWSI_DateOfConstruction"]],
                                   left_on='USGS_siteno', right_on='GWSI_USGS_siteno', how='left')
#turn zeros back into NaN
tblSites3['USGS_siteno'].replace(0, np.nan, inplace=True) 

# update USGS station names and date of construction for records with info in GWSI
# Combine the station names into one column where GWSI updates the local column IF it is a valid number
tblSites3['USGS_StationName'] = tblSites3['GWSI_USGS_StationName'].where(tblSites3['GWSI_USGS_StationName'].notna(), tblSites3['USGS_StationName']) 
# repeat for date of construction but I have to get the columns in the same date format (YYYYMMDD)
tblSites3['DateOfConstruction'] = tblSites3['DateOfConstruction'].dt.strftime('%Y%m%d')
tblSites3['DateOfConstruction'] = tblSites3['GWSI_DateOfConstruction'].where(tblSites3['GWSI_DateOfConstruction'].notna(), tblSites3['DateOfConstruction'])



In [215]:
#NEWEST AUTHORITATIVE SOURCES FOR MLR (1/11/2023)
tblSites3 = pd.merge(tblSites3, dfmlr, left_on='USGS_siteno', right_on='site_no', how='left')
#station name
tblSites3['USGS_StationName'] = tblSites3['station_nm'].where(tblSites3['station_nm'].notna(), tblSites3['USGS_StationName'])
#landsurfaceelevation
tblSites3['alt_va'] = tblSites3['alt_va'].mul(0.3048).round(3) # convert the column from ft to m
tblSites3['LandSurfaceElevation_mASL_NAVD88'] = tblSites3['alt_va'].where(tblSites3['alt_va'].notna(), tblSites3['LandSurfaceElevation_mASL_NAVD88'])
#dateofconstruction
tblSites3['DateOfConstruction'] = tblSites3['construction_dt'].where(tblSites3['construction_dt'].notna(), tblSites3['DateOfConstruction'])
#drillingdepth
tblSites3['hole_depth_va'] = tblSites3['hole_depth_va'].mul(0.3048).round(3) # convert the column from ft to m
tblSites3['TotalBoring/DrillingDepth_mBLS'] = tblSites3['hole_depth_va'].where(tblSites3['hole_depth_va'].notna(), tblSites3['TotalBoring/DrillingDepth_mBLS'])



In [216]:
#automating this process
v1 = 'TotalWellDepth_mBLS' # old variable
v2 = 'well_depth_va' # new variable
t1 = 'tblWells3' # table name


print(t1+"['"+v2+"'] = "+t1+"['"+v2+"'].mul(0.3048).round(3)")
print(t1+"['"+v1+"'] = "+t1+"['"+v2+"'].where("+t1+"['"+v2+"'].notna(), "+t1+"['"+v1+"'])")
print(t1+"[['USGS_siteno', '"+v1+"', '"+v2+"']].sample(10)")

tblWells3['well_depth_va'] = tblWells3['well_depth_va'].mul(0.3048).round(3)
tblWells3['TotalWellDepth_mBLS'] = tblWells3['well_depth_va'].where(tblWells3['well_depth_va'].notna(), tblWells3['TotalWellDepth_mBLS'])
tblWells3[['USGS_siteno', 'TotalWellDepth_mBLS', 'well_depth_va']].sample(10)


In [217]:
tblSites3[['USGS_siteno', 'TotalWellDepth_mBLS', 'well_depth_va']].sample(10)

KeyError: "['TotalWellDepth_mBLS'] not in index"

In [None]:
# my check to see if alt_va and LandSurfaceElvation were similar when both values existed
tblSites3[['USGS_siteno', 'alt_va', 'LandSurfaceElevation_mASL_NAVD88']].sample(11)

Unnamed: 0,USGS_siteno,alt_va,LandSurfaceElevation_mASL_NAVD88
337,,,430.557
376,,,432.949
1127,473421100000000.0,429.838,429.839
923,473427100000000.0,433.218,433.219
168,,,426.071
955,473418100000000.0,430.743,430.742
586,,,432.82
908,473426100000000.0,430.655,430.655
57,,,433.337
1354,,,430.167


## tblWells

In [218]:
# select which columns to keep
tblWells2 = tblWells[['WellEntryRecordNumber', 'LocalSiteName', 'LocalSiteWellSubName',
       'GWSISiteType', 'MNUniqueNmbr', 'GWSIUseOfSite', 'GWSIAquiferType',
       'GWSIPrimaryAquifer', 'GWSINationalAquifer', 'TypeOfBackFill',
       'NumberBagsOfCleanSand', 'BentoniteUsed', 'TypeOfBentonite',
       'CoreTakenBeforeWellInstallation', 'TailPipeLength_ft',
       'TailPipeLengthRemarks', 'ProtectionPipeInstalled',
       'ProtectionPipeMaterial', 'DiameterOfProtectionPipe',
       'ProtectionPipeLength_ft', 'LengthBetweenProtectionPipeAndMP',
       'CementPadInstalled', 'DrilledForWhom', 'DrillersFieldComments',
       'TotalWellDepth_ftBLS', 'StickupLength_ft', 'StickupSource',
       'WellCasingInnerDiameter_inches', 'CasingMaterial',
       'SourceOfConstructionData', 'MethodOfConstruction', 'TypeOfFinish',
       'NumberBagsOfGrout', 'TypeOfSeal', 'WellGrouted', 'DepthofSeal_ftBLS',
       'StartOfSeal_ftBLS', 'MethodOfDevelopment', 'HoursOfDevelopment',
       'SpecialTreatmentForDevelopment', 'TopOfScreenDepth_ftBLS',
       'MidOfScreenDepth_ftBLS', 'BottomOfScreenDepth_ftBLS',
       'Comments_TopOfScreenDepth', 'Comments_MidScreenDepth',
       'Comments_BottomOfScreenDepth', 'ScreenLength_ft',
       'ScreenLengthRemarks', 'ScreenInnerDiameter_inches',
       'ScreenMaterialType', 'TypeOfOpenInterval',
       'WidthOfOpeningsInOpenInterval_inches', 'WellOwnerName',
       'WellOwnerAddress', 'ScreenMake', 'LicenseeBusinessName', 'LicNum',
       'CertifiedRepresentative', 'CertifiedRepNo', 'DateSignedAndCertified',
       'AltitudeOfMeasuringPoint_ftASL_NAVD88',
       'AltitudeOfMeasuringPoint_mASL_NAVD88', 'MeasuringPointAltitudeRemarks',
       'LandSurfaceAltitude_ftASL_NAVD88', 'Pre2010_CasMat', 'LocalUseCode',
       'WellMPVerticalSource', 'Wellcon_SiteRecordID', 'WellPurpose',
       'WellProjectID', 'WaterTableWell', 'SouthPoolWell', 'QuarterlyWLSite',
       'TransducerSite', 'DifficultToPump', 'DifficultToPump_Comments']]

## Merge tblWells to add all columns needed

In [219]:
#CREATING THE MASTER SITE LIST
mastersitelist = pd.merge (tblSites3, tblWells2, on='LocalSiteName', how='left')
mastersitelist = pd.merge(mastersitelist, dfbmj3, left_on='USGS_siteno', right_on='GWSI_USGS_siteno', how='left')
mastersitelist = mastersitelist[["LocalSiteName",
                            "StudySite",
                            "AgencyCode",
                            "USGS_siteno",
                            "USGS_StationName",
                            "XcoordUTMNAD83_m",
                            "YcoordUTMNAD83_m",
                            "LandSurfaceElevation_mASL_NAVD88",
                            "OnNorthWellTransect",
                            "DistanceFromCenterOfNorthOilBody_m",
                            "WellSite",
                            "CoreSite",
                            "OtherEquipmentSite",
                            "TopOfScreenElevation_mASL_NAVD88",
                            "BottomOfScreenElevation_mASL_NAVD88",
                            "MidOfScreenElevation_mASL_NAVD88",
                            "DateOfConstruction",
                            "Comments_DateOfConstruction",
                            "SiteActiveStatus",
                            "Comments_Status",
                            "ApproxRemovalDate"]]

# format date
mastersitelist['LocalSiteName'] = 'x' + mastersitelist['LocalSiteName'].astype(str)
mastersitelist['USGS_siteno'] = 'x' + mastersitelist['USGS_siteno'].astype(str)

# make nan values uniform
mastersitelist['USGS_siteno'] = mastersitelist['USGS_siteno'].replace('xnan', np.nan)
null_cells = mastersitelist.isnull()
mastersitelist = mastersitelist.astype(str).mask(null_cells, np.NaN)

# tblOE

In [220]:
tblOE2 = tblOE[["LocalSiteName",
                "LocalUseCode",
                "OtherEquipStickupLength_ft",
                "OtherEquipmentPurpose",
                "Comments_Equipment",
                "Comments"]]

tblOE2["OtherEquipStickupLength_m"] = tblOE2["OtherEquipStickupLength_ft"].mul(0.3048).round(3)
tblOE2 = tblOE2.rename(columns={'OtherEquipStickupLength_m':'OtherEquip_MP_height_m'})

tblOE2 = pd.merge(tblOE2, tblSites3[["LocalSiteName",
                                   "StudySite",
                                   "USGS_siteno",
                                   "USGS_StationName",
                                   "XcoordUTMNAD83_m",
                                   "YcoordUTMNAD83_m",
                                   "OnNorthWellTransect",
                                   "DistanceFromCenterOfNorthOilBody_m",
                                   "LandSurfaceElevation_mASL_NAVD88",
                                   "DateOfConstruction",
                                   "Comments_DateOfConstruction",
                                   "NameOfContractor",
                                   "DrillerName",
                                   "Comments_Miscellaneous"]], how='left')
tblOE2 = pd.merge(tblOE2, tblcd_LocalUseCode[['LocalUseCode','Comments_UseCode']])

tblOE2 = tblOE2[["LocalSiteName",
                 "StudySite",
                 "LocalUseCode",
                 "Comments_UseCode",
                 "USGS_siteno",
                 "USGS_StationName",
                 "XcoordUTMNAD83_m",
                 "YcoordUTMNAD83_m",
                 "OnNorthWellTransect",
                 "DistanceFromCenterOfNorthOilBody_m",
                 "LandSurfaceElevation_mASL_NAVD88",
                 "OtherEquip_MP_height_m",
                 "DateOfConstruction",
                 "Comments_DateOfConstruction",
                 "NameOfContractor",
                 "DrillerName",
                 "OtherEquipmentPurpose",
                 "Comments_Equipment",
                 "Comments",
                 "Comments_Miscellaneous"]]
tblOE2['LocalSiteName'] = 'x' + tblOE2['LocalSiteName'].astype(str)
tblOE2['USGS_siteno'] = 'x' + tblOE2['USGS_siteno'].astype(str)
tblOE2['USGS_siteno'] = tblOE2['USGS_siteno'].replace('xnan', np.nan)
null_cells = tblOE2.isnull()
tblOE2 = tblOE2.astype(str).mask(null_cells, np.NaN)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tblOE2["OtherEquipStickupLength_m"] = tblOE2["OtherEquipStickupLength_ft"].mul(0.3048).round(3)


# tblCores

In [221]:
tblCores2 = tblCores[['CoreEntryRecordNumber', 'LocalSiteName', 'LocalSiteCoreSubName',
       'CoreLocationNear', 'DepthDrilledBeforeCoring_ftBLS',
       'CoringBegan_ftBLS', 'CoringEnded_ftBLS', 'CoreBarrelType',
       'TotalCoreLengthPounded_ft', 'CoreDrivingEquipment', 'OilSmell',
       'C02Used', 'TakeC02', 'CoreRecoveryLength_ft', 'MethaneBubblesPresent',
       'SubsurfaceCoreZone', 'PostCoreEquipmentInstallation', 'MultipleCores',
       'FreeProductOilPresentInCore', 'DrillersFieldComments',
       'TypeOfBackFill', 'NumberBagsOfBentonite', 'BentoniteUsed',
       'GWSIMethodOfConstruction', 'TypeOfSeal', 'DepthofSeal_ftBLS',
       'StartOfSeal_ftBLS', 'SourceOfConstructionData', 'CoreStickupSource',
       'CoreStickupLength_ft', 'TypeOfBentonite', 'NumberOfBagsCleanSand',
       'CoreAltitudeOfMeasuringPoint_ftASL_NAVD88',
       'CoreAltitudeOfMeasuringPoint_mASL_NAVD88',
       'CoreMeasuringPointAltitudeRemarks', 'CoreMPVerticalSource',
       'LocalUseCode', 'Wellcon_SiteRecordID', 'CorePurposePlannedAnalyses',
       'CoreProjectID', 'CementPadInstalled', 'MarkedWithCorePlate',]]  

#Calculations loop AND renaming
vars_ft = ['CoreStickupLength_ft',
           'CoringBegan_ftBLS',
           'CoringEnded_ftBLS',
           'CoreRecoveryLength_ft',
           'TotalCoreLengthPounded_ft']
vars_m = [i.replace('ft', 'm') for i in vars_ft]
for var_ft, var_m in zip(vars_ft, vars_m):
        tblCores2[var_m] = tblCores2[var_ft].mul(0.3048).round(3)

tblCores2 = tblCores2.rename(columns={
                                'CoreStickupLength_m':'Core_MP_height_m',
                                'TotalCoreLengthPounded_m':'CoreLengthPounded_m'})

#Merging
tblCores2 = pd.merge(tblCores2, tblSites3[["LocalSiteName",
                                     "StudySite",
                                     "USGS_siteno",
                                     "USGS_StationName",
                                     "XcoordUTMNAD83_m",
                                     "YcoordUTMNAD83_m",
                                     "OnNorthWellTransect",
                                     "DistanceFromCenterOfNorthOilBody_m",
                                     "LandSurfaceElevation_mASL_NAVD88",
                                     "TotalBoring/DrillingDepth_mBLS", 
                                     "DateOfConstruction",
                                     "Comments_DateOfConstruction",
                                     "TotalBoring/DrillingDepth_ftBLS", 
                                     "NameOfContractor",
                                     "DrillerName",
                                     "Comments_Miscellaneous"]], how='left')
tblCores2 = pd.merge(tblCores2, tblcd_LocalUseCode[['LocalUseCode','Comments_UseCode']])

#New calculaton after merge
tblCores2['TopOfCoreElevation_mASL_NAVD88'] = tblCores2['LandSurfaceElevation_mASL_NAVD88'] - tblCores2['CoringBegan_mBLS']

#rearange to put in order of data release
tblCores2 = tblCores2[["LocalSiteName",
                       "StudySite",
                       "LocalUseCode",
                       "Comments_UseCode",
                       "USGS_siteno",
                       "USGS_StationName",
                       "LocalSiteCoreSubName",
                       "XcoordUTMNAD83_m",
                       "YcoordUTMNAD83_m",
                       "OnNorthWellTransect",
                       "DistanceFromCenterOfNorthOilBody_m",
                       "LandSurfaceElevation_mASL_NAVD88",
                       "Core_MP_height_m",
                       "DateOfConstruction",
                       "Comments_DateOfConstruction",
                       "TotalBoring/DrillingDepth_mBLS",
                       "NameOfContractor",
                       "DrillerName",
                       "CoringBegan_mBLS",
                       "CoringEnded_mBLS",
                       "CoreRecoveryLength_m",
                       "TopOfCoreElevation_mASL_NAVD88",
                       "CoreLengthPounded_m",
                       "CoreBarrelType",
                       "SubsurfaceCoreZone",
                       "FreeProductOilPresentInCore",
                       "DrillersFieldComments",
                       "Comments_Miscellaneous"]]       

#Format columns and male nan values uniform
tblCores2['LocalSiteName'] = 'x' + tblCores2['LocalSiteName'].astype(str)
tblCores2['USGS_siteno'] = 'x' + tblCores2['USGS_siteno'].astype(str)
tblCores2['USGS_siteno'] = tblCores2['USGS_siteno'].replace('xnan', np.nan)
null_cells = tblCores2.isnull()
tblCores2 = tblCores2.astype(str).mask(null_cells, np.NaN)


# tblWells

In [235]:
#PRINT OUT DF KEYS IN ABC ORDER
dfmlr.reindex(sorted(dfmlr.columns), axis=1).keys()

Index(['agency_cd', 'alt_acy_va', 'alt_datum_cd', 'alt_meth_cd', 'alt_va',
       'aqfr_cd', 'aqfr_type_cd', 'basin_cd', 'construction_dt',
       'contrib_drain_area_va', 'coord_acy_cd', 'coord_datum_cd',
       'coord_meth_cd', 'country_cd', 'county_cd', 'dec_coord_datum_cd',
       'dec_lat_va', 'dec_long_va', 'depth_src_cd', 'district_cd',
       'drain_area_va', 'gw_file_cd', 'hole_depth_va', 'huc_cd',
       'instruments_cd', 'inventory_dt', 'land_net_ds', 'lat_va',
       'local_time_fg', 'long_va', 'map_nm', 'map_scale_fc', 'nat_aqfr_cd',
       'project_no', 'reliability_cd', 'site_no', 'site_tp_cd', 'state_cd',
       'station_nm', 'topo_cd', 'tz_cd', 'well_depth_va'],
      dtype='object')

In [241]:
#PRINT OUT DF KEYS IN ABC ORDER
tblWells3.reindex(sorted(tblWells3.columns), axis=1).keys()

Index(['AgencyCode', 'AgencyUse', 'AltitudeOfMeasuringPoint_ftASL_NAVD88',
       'AltitudeOfMeasuringPoint_mASL_NAVD88', 'AppliedByUser', 'AppliedTime',
       'ApproxRemovalDate', 'BentoniteUsed', 'BottomOfScreenDepth_ftBLS',
       'BottomOfScreenElevation_mASL_NAVD88',
       ...
       'reliability_cd', 'site', 'site_no', 'site_tp_cd', 'state_cd',
       'station_nm', 'topo_cd', 'tz_cd', 'well_MP_height_m', 'well_depth_va'],
      dtype='object', length=274)

In [249]:
# since already created, start with the merge
tblWells3 = pd.merge(tblWells2, tblSites3,  on='LocalSiteName', how='left') #tblSites3 here already has dfmlr in it!
tblWells3 = pd.merge(tblWells3, dfbmj3, left_on='USGS_siteno', right_on='GWSI_USGS_siteno', how='left')

# add tblcd info
tblWells3 = pd.merge(tblWells3, tblcd_OpeningType[["TypeOfOpenInterval", "comments_OpeningType"]], left_on='GWSI_TypeOfOpenInterval', right_on='TypeOfOpenInterval', how='left')
tblWells3 = pd.merge(tblWells3, tblcd_LocalUseCode[["LocalUseCode", "Comments_UseCode"]], how='left')
tblWells3 = pd.merge(tblWells3, tblcd_CasingMaterial[["CasingMaterial", "Comments_CasingMaterial"]], left_on='GWSI_CasingMaterial', right_on='CasingMaterial', how='left')
tblWells3 = pd.merge(tblWells3, tblcd_ScreenMaterial[["ScreenMaterial", "Comments_ScreenMaterial"]], left_on='GWSI_ScreenMaterialType', right_on='ScreenMaterial', how='left')

# rename tblcd info
tblWells3 = tblWells3.rename(columns={"comments_OpeningType":'OpeningTypeDescription'})
tblWells3 = tblWells3.rename(columns={"Comments_CasingMaterial":'CasingMaterialDescription'})
tblWells3 = tblWells3.rename(columns={"Comments_ScreenMaterial":'ScreenMaterialDescription'})

# NEWEST AUTHORITATIVE SOURCES (1/11/2023) for MLR (already in tblWells3 since its already in tblSites3)
# well depth
tblWells3['well_depth_va'] = tblWells3['well_depth_va'].mul(0.3048).round(3)
tblWells3['TotalWellDepth_mBLS'] = tblWells3['well_depth_va'].where(tblWells3['well_depth_va'].notna(), tblWells3['TotalWellDepth_mBLS'])
#AQUARIUS
tblWells3 = pd.merge(tblWells3, dfaq, left_on='USGS_siteno', right_on='site', how='left')
#MP height
tblWells3['Elevation'] = tblWells3['Elevation'].mul(0.3048).round(3)
tblWells3['well_MP_height_m'] = tblWells3['Elevation'].where(tblWells3['Elevation'].notna(), tblWells3['well_MP_height_m'])



In [319]:
tblWells3[['DateOfConstruction','ApproxRemovalDate','ValidFrom','DecommissionedDate', 'GWSI_MP_BeginDate', 'GWSI_MP_EndDate']].sample(10)

Unnamed: 0,DateOfConstruction,ApproxRemovalDate,ValidFrom,DecommissionedDate,GWSI_MP_BeginDate,GWSI_MP_EndDate
269,19840701.0,,1984-07-02T00:00:00.0000000-06:00,,19840702.0,
545,20040721.0,,,,20040721.0,
595,20050615.0,,0001-01-01T00:00:00.0000000+00:00,,20050616.0,
308,19880607.0,,0001-01-01T00:00:00.0000000+00:00,,19880608.0,
95,19970620.0,,1997-06-20T00:00:00.0000000-06:00,,19970620.0,
616,,,,,,
740,,,,,,
844,20170628.0,,,,20170628.0,
202,19830701.0,,1983-07-01T00:00:00.0000000-06:00,1983-07-02T00:00:00.0000000-06:00,19830702.0,
91,19970620.0,,1997-06-20T00:00:00.0000000-06:00,,19970620.0,


In [245]:
#automating this process
v1 = 'well_MP_height_m' # old variable
v2 = 'Elevation' # new variable
t1 = 'tblWells3' # table name


print(t1+"['"+v2+"'] = "+t1+"['"+v2+"'].mul(0.3048).round(3)")
print(t1+"['"+v1+"'] = "+t1+"['"+v2+"'].where("+t1+"['"+v2+"'].notna(), "+t1+"['"+v1+"'])")
print(t1+"[['USGS_siteno', '"+v1+"', '"+v2+"']].sample(10)")

tblWells3['Elevation'] = tblWells3['Elevation'].mul(0.3048).round(3)
tblWells3['well_MP_height_m'] = tblWells3['Elevation'].where(tblWells3['Elevation'].notna(), tblWells3['well_MP_height_m'])
tblWells3[['USGS_siteno', 'well_MP_height_m', 'Elevation']].sample(10)


In [195]:
#data release
# select a subset of data
tblWells3 = tblWells3[["LocalSiteName",
                       "StudySite",
                       "LocalUseCode",
                       "Comments_UseCode",
                       "USGS_siteno",
                       "USGS_StationName",
                       "XcoordUTMNAD83_m",
                       "YcoordUTMNAD83_m",
                       "OnNorthWellTransect",
                       "DistanceFromCenterOfNorthOilBody_m",
                       "MeasuringPointElevation_mASL_NAVD88", 
                       "well_MP_height_m",
                       "LandSurfaceAltitude_mASL_NAVD88",
                       "TopOfScreenElevation_mASL_NAVD88",
                       "BottomOfScreenElevation_mASL_NAVD88",
                       "ScreenLength_m",
                       "MidOfScreenElevation_mASL_NAVD88",
                       "TotalWellDepth_mBLS",
                       "TotalBoring/DrillingDepth_mBLS",
                       "DiameterOfDrillHole_cm",
                       "WellCasingInnerDiameter_cm",
                       "OpeningTypeDescription",
                       "WidthOfOpeningsInOpenInterval_cm",
                       "CasingMaterialDescription",
                       "ScreenMaterialDescription",
                       "DateOfConstruction",
                       "Comments_DateOfConstruction",
                       "NameOfContractor",
                       "DrillerName",
                       "Comments_Miscellaneous",
                       "SiteActiveStatus",
                       "Comments_Status",
                       "ApproxRemovalDate"]]  

# formatting columns
tblWells3['LocalSiteName'] = 'x' + tblWells3['LocalSiteName'].astype(str)
tblWells3['USGS_siteno'] = 'x' + tblWells3['USGS_siteno'].astype(str)
#Make nan values uniform
tblWells3['USGS_siteno'] = tblWells3['USGS_siteno'].replace('xnan', np.nan)
null_cells = tblWells3.isnull()
tblWells3 = tblWells3.astype(str).mask(null_cells, np.NaN)  

# bmj_rmk data release

In [23]:
dfrmk2 = dfrmk[['GWSI_AgencyCode', 'GWSI_USGS_siteno', 'GWSI_GWSI_RMK',
       'GWSI_GWSI_RMK_Date', 'GWSI_GWSI_RMK_SequenceNo']]

# make columns the same datatype of interger
tblSites['USGS_siteno'] = tblSites['USGS_siteno'].fillna(0).astype('int64')
# dfrmk2's is already an interger
dfrmk2 = pd.merge(dfrmk2, tblSites[['USGS_siteno', 'LocalSiteName']], left_on= "GWSI_USGS_siteno", right_on= 'USGS_siteno', how='left')

In [24]:
# organize for data release
dfrmk3 = dfrmk2[['GWSI_AgencyCode',
                 'GWSI_USGS_siteno',
                 'LocalSiteName',
                 'GWSI_GWSI_RMK',
                 'GWSI_GWSI_RMK_Date',
                 'GWSI_GWSI_RMK_SequenceNo']]
# format numbers with 'x'
dfrmk3['GWSI_USGS_siteno'] = 'x' + dfrmk3['GWSI_USGS_siteno'].astype(str)
dfrmk3['LocalSiteName'] = 'x' + dfrmk3['LocalSiteName'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrmk3['GWSI_USGS_siteno'] = 'x' + dfrmk3['GWSI_USGS_siteno'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfrmk3['LocalSiteName'] = 'x' + dfrmk3['LocalSiteName'].astype(str)


# Saving Every Figure in a new directory

In [25]:
# create variable with today's date
date = datetime.today().strftime('%Y%m%d') 
# create variable for new folder/directory
dir = "data_outputs/"+date+"_datarelease" # new directory location and name
# create the directory unless it already exists...then skip
try:
    os.mkdir(dir)
except:
    pass
# save all the files there 
mastersitelist.to_csv(dir+"/DataRelease_MasterSiteList.csv", index=False)
tblOE2.to_csv(dir+"/DataRelease_OtherEquipmentInformation.csv", index=False)
tblCores2.to_csv(dir+"/DataRelease_CoreInformation.csv", index=False)
tblWells3.to_csv(dir+"/DataRelease_WellConstructionInformation.csv", index=False)
dfrmk3.to_csv(dir+"/DataRelease_rmk3.csv", index=False)

# Fixing the caveat that keeps popping up

In [8]:
dfrmk2 = dfrmk[['GWSI_AgencyCode', 
                'GWSI_USGS_siteno', 
                'GWSI_GWSI_RMK',
                'GWSI_GWSI_RMK_Date', 
                'GWSI_GWSI_RMK_SequenceNo']]

#would translate into

dfrmk2 = dfrmk.loc[:, ['GWSI_AgencyCode', 
                      'GWSI_USGS_siteno', 
                      'GWSI_GWSI_RMK',
                      'GWSI_GWSI_RMK_Date', 
                      'GWSI_GWSI_RMK_SequenceNo']]

Unnamed: 0,GWSI_AgencyCode,GWSI_USGS_siteno,GWSI_GWSI_RMK,GWSI_GWSI_RMK_Date,GWSI_GWSI_RMK_SequenceNo
0,USGS,473429095051006,This well was re-surveyed on 27 June 2019 by J...,20190730.000,1.000
1,USGS,473429095051006,Digital levels were then ran from the temporar...,20190730.000,2.000
2,USGS,473429095051006,This well casing is constructed of stainless s...,20190730.000,3.000
3,USGS,473424095052912,"This is port ""01"" for vadose zone vapor/gas sa...",20200206.000,1.000
4,USGS,473424095052906,"This is port ""01"" for vadose zone vapor/gas sa...",20200206.000,1.000
...,...,...,...,...,...
628,USGS,473419095052503,This is the location of a wetland staff gage f...,20190730.000,1.000
629,USGS,473419095052304,"This is port ""01"" for vadose zone vapor/gas sa...",20200206.000,1.000
630,USGS,473425095051601,This is a monitoring well used for the Bemidji...,20170726.000,1.000
631,USGS,473423095051501,This monitoring well used for the Bemidji Toxi...,20170726.000,1.000
