Make sure to have sqlalchemy-access installed for uploading back into the database

Only needs to be done once! 

You can't be connected to VPN due to encryption issues. Install using Anaconda prompt when disconnected from VPN

In [None]:
# Only needed if you want to re-upload to the Microsoft Access database.
# Uncomment and run once (while disconnected from the VPN):
# %pip install sqlalchemy-access

# Import Libraries

In [None]:
import pandas as pd #for creating dataframe
import pyodbc #working with ODBC databases
import numpy as np # for locating values in dataframes
from datetime import datetime # for obtaining today's date
import os #for working with directories

In [None]:
# Optional: show floats in DataFrame output with three fixed decimals
# instead of scientific notation.
def _three_decimal_text(value):
    """Return ``value`` rendered as fixed-point text with three decimals."""
    return '%.3f' % value


pd.set_option('display.float_format', _three_decimal_text)

# Manage directories

In [None]:
# Record the working directory the kernel started in (this happens before the
# notebook changes directory to the shared drive) and print it for reference.
defaultdirectory = os.getcwd()
print(defaultdirectory)

In [None]:
# Switch the working directory to the shared-drive project folder; the relative
# data_inputs/ and data_outputs/ paths used below are resolved against it.
shared_project_dir = 'P:/0083/analysis/DataCompilation/DataCompilationPy/create_site_info_files'
os.chdir(shared_project_dir)

# Import data

In [None]:
# GWSI tables produced by the earlier Python preparation step
dfbmj3 = pd.read_csv(
    filepath_or_buffer=r'data_inputs/gwsi_old/bmj3_fromPy.csv',
)
dfrmk = pd.read_csv(
    filepath_or_buffer=r'data_inputs/gwsi_old/bmj_rmk_fromPy.csv',
)
# Aquarius reference points and MLR tables, also Python output
dfaq = pd.read_csv(
    filepath_or_buffer=r'data_inputs/aquarius/Referencepoints_updatedMP_fromPy.csv',
)
dfmlr = pd.read_csv(
    filepath_or_buffer=r'data_inputs/MLR/MLR_fromPy.csv',
)

# Read the local Microsoft Access front-end database through pyodbc.
access_connection_string = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_fe.accdb;'
Gfe_db = pyodbc.connect(access_connection_string)
c_fe = Gfe_db.cursor()

# Data tables: sites plus one table per site type
tblSites = pd.read_sql(sql='select * from tblSites', con=Gfe_db)
tblWells = pd.read_sql(sql='select * from tblWells', con=Gfe_db)
tblCores = pd.read_sql(sql='select * from tblCores', con=Gfe_db)
tblOE = pd.read_sql(sql='select * from tblOtherEquipment', con=Gfe_db)

# Code look-up ("cd") tables that translate codes into descriptions
tblcd_LocalUseCode = pd.read_sql(sql="select * from tblcd_LocalUseCode", con=Gfe_db)
tblcd_CasingMaterial = pd.read_sql(sql="select * from tblcd_CasingMaterial", con=Gfe_db)
tblcd_ScreenMaterial = pd.read_sql(sql="select * from tblcd_ScreenMaterial", con=Gfe_db)
tblcd_OpeningType = pd.read_sql(sql="select * from tblcd_OpeningType", con=Gfe_db)

## Merge tblSites (local Access) with GWSI (retrieved) to update the USGS station names

In [816]:
# Build dfcomplete: tblSites left-joined with the GWSI, wells, other-equipment,
# cores, MLR and Aquarius tables. Every source gets a column prefix (SITE_,
# WELL_, OE_, CORE_, MLR_, AQ_) so columns with the same name stay distinguishable.
# The shape is printed after each merge so unexpected row growth is visible.

# Work on a copy: `tblSites2 = tblSites` would only alias the frame, so the
# fillna(0) below would also overwrite the missing site numbers in tblSites.
tblSites2 = tblSites.copy()
# Missing site numbers become the placeholder 0 so the merge key can be cast to
# int64, matching the datatype of the column it is merged on.
tblSites2['USGS_siteno'] = tblSites2['USGS_siteno'].fillna(0).astype('int64')
dfcomplete = pd.merge(tblSites2.add_prefix('SITE_'), dfbmj3,
                      left_on='SITE_USGS_siteno', right_on='GWSI_USGS_siteno', how='left')
# Turn the placeholder zeros back into NaN. The result is assigned back to the
# column: replace(..., inplace=True) on a selected column is chained assignment,
# which warns in pandas 2.x and does not update the frame under Copy-on-Write.
dfcomplete['SITE_USGS_siteno'] = dfcomplete['SITE_USGS_siteno'].replace(0, np.nan)
print(dfcomplete.shape)

# Site-type tables are matched on the local site name.
for site_type_table, prefix in ((tblWells, 'WELL_'), (tblOE, 'OE_'), (tblCores, 'CORE_')):
    dfcomplete = pd.merge(dfcomplete, site_type_table.add_prefix(prefix),
                          left_on='SITE_LocalSiteName', right_on=prefix + 'LocalSiteName',
                          how='left')
    print(dfcomplete.shape)

# MLR and Aquarius tables are matched on the USGS site number.
for agency_table, prefix, key in ((dfmlr, 'MLR_', 'MLR_site_no'), (dfaq, 'AQ_', 'AQ_site')):
    dfcomplete = pd.merge(dfcomplete, agency_table.add_prefix(prefix),
                          left_on='SITE_USGS_siteno', right_on=key, how='left')
    print(dfcomplete.shape)

(1729, 123)
(1731, 217)
(1741, 250)
(2474, 292)
(2474, 334)
(2474, 350)
(2474, 350)


In [817]:
# Derived columns: elevations computed from land-surface altitude and depths,
# plus unit conversions (feet -> metres, inches -> centimetres).
# NOTE: the MLR_* and AQ_Elevation columns are converted in place, so running
# this cell a second time on the same dfcomplete would convert them twice.
FT_TO_M = 0.3048
IN_TO_CM = 2.54


def _feet_to_metres(feet):
    """Convert a Series from feet to metres, rounded to 3 decimals."""
    return feet.mul(FT_TO_M).round(3)


def _inches_to_cm(inches, decimals):
    """Convert a Series from inches to centimetres, rounded to ``decimals``."""
    return inches.mul(IN_TO_CM).round(decimals)


# --- GWSI columns ---
land_surface_ft = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88']

dfcomplete['GWSI_MeasuringPointElevation_ftASL_NAVD88'] = land_surface_ft + dfcomplete['GWSI_MP_height_ft']
dfcomplete['MeasuringPointElevation_mASL_NAVD88'] = _feet_to_metres(dfcomplete['GWSI_MeasuringPointElevation_ftASL_NAVD88'])  # renamed for Wells

dfcomplete['GWSI_MeasuringPointHeight_m'] = _feet_to_metres(dfcomplete['GWSI_MP_height_ft'])

dfcomplete['LandSurfaceAltitude_mASL_NAVD88'] = _feet_to_metres(land_surface_ft)  # renamed for Wells

# Screen elevations = land surface minus depth below land surface
dfcomplete['GWSI_TopOfScreenElevation_ftASL_NAVD88'] = land_surface_ft - dfcomplete['GWSI_TopOfScreenDepth_ftBLS']
dfcomplete['TopOfScreenElevation_mASL_NAVD88'] = _feet_to_metres(dfcomplete['GWSI_TopOfScreenElevation_ftASL_NAVD88'])

dfcomplete['GWSI_BottomOfScreenElevation_ftASL_NAVD88'] = land_surface_ft - dfcomplete['GWSI_BottomOfScreenDepth_ftBLS']
dfcomplete['BottomOfScreenElevation_mASL_NAVD88'] = _feet_to_metres(dfcomplete['GWSI_BottomOfScreenElevation_ftASL_NAVD88'])

# The GWSI_Mid... column holds the SUM of top and bottom; the midpoint is half of it
dfcomplete['GWSI_MidOfScreenElevation_mASL_NAVD88'] = (
    dfcomplete['TopOfScreenElevation_mASL_NAVD88'] + dfcomplete['BottomOfScreenElevation_mASL_NAVD88']
)
dfcomplete['MidOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_MidOfScreenElevation_mASL_NAVD88'].div(2).round(3)

dfcomplete['TotalWellDepth_mBLS'] = _feet_to_metres(dfcomplete['GWSI_TotalWellDepth_ftBLS'])  # renamed for Wells

dfcomplete['DiameterOfDrillHole_cm'] = _inches_to_cm(dfcomplete['GWSI_DiameterOfDrillHole_inches'], 1)  # renamed for Wells

dfcomplete['WellCasingInnerDiameter_cm'] = _inches_to_cm(dfcomplete['GWSI_WellCasingInnerDiameter_inches'], 1)  # renamed for Wells

dfcomplete['WidthOfOpeningsInOpenInterval_cm'] = _inches_to_cm(dfcomplete['GWSI_WidthOfOpeningsInOpenInterval_inches'], 3)  # renamed for Wells

dfcomplete['GWSI_ScreenLength_ft'] = dfcomplete['GWSI_BottomOfScreenDepth_ftBLS'] - dfcomplete['GWSI_TopOfScreenDepth_ftBLS']
dfcomplete['ScreenLength_m'] = _feet_to_metres(dfcomplete['GWSI_ScreenLength_ft'])  # renamed for Wells

dfcomplete['well_MP_height_m'] = _feet_to_metres(dfcomplete['GWSI_MP_height_ft'])  # renamed for Wells

# --- SITE columns ---
dfcomplete["TotalBoring/DrillingDepth_mBLS"] = _feet_to_metres(dfcomplete["SITE_TotalBoring/DrillingDepth_ftBLS"])

# --- MLR columns (converted in place) ---
for mlr_column in ('MLR_alt_va', 'MLR_hole_depth_va'):
    dfcomplete[mlr_column] = _feet_to_metres(dfcomplete[mlr_column])

# --- Other-equipment columns ---
dfcomplete["OE_OtherEquipStickupLength_m"] = _feet_to_metres(dfcomplete["OE_OtherEquipStickupLength_ft"])

# --- Core columns ---
dfcomplete['CORE_CoreStickupLength_m'] = _feet_to_metres(dfcomplete['CORE_CoreStickupLength_ft'])
dfcomplete['CORE_CoringBegan_mBLS'] = _feet_to_metres(dfcomplete['CORE_CoringBegan_ftBLS'])
dfcomplete['CORE_CoringEnded_mBLS'] = _feet_to_metres(dfcomplete['CORE_CoringEnded_ftBLS'])
dfcomplete['CORE_CoreRecoveryLength_m'] = _feet_to_metres(dfcomplete['CORE_CoreRecoveryLength_ft'])
dfcomplete['CORE_TotalCoreLengthPounded_m'] = _feet_to_metres(dfcomplete['CORE_TotalCoreLengthPounded_ft'])

# --- Wells columns (converted in place) ---
for wells_column in ('MLR_well_depth_va', 'AQ_Elevation'):
    dfcomplete[wells_column] = _feet_to_metres(dfcomplete[wells_column])

In [818]:
# Convert the site construction date from a datetime column to YYYYMMDD text.
# NOTE: after this runs the column is no longer datetime, so the cell cannot be
# re-run on the same dfcomplete (the .dt accessor would fail).
site_construction_dates = dfcomplete['SITE_DateOfConstruction']
dfcomplete['SITE_DateOfConstruction'] = site_construction_dates.dt.strftime('%Y%m%d')

In [819]:
# Update records by combining columns from several sources: for every output
# column the preferred source is used where it has a value, otherwise the
# fallback is kept.
def _prefer(preferred, fallback):
    """Return ``preferred`` with its missing entries filled from ``fallback``."""
    return preferred.mask(preferred.isna(), fallback)


# Station name: MLR, then GWSI, then the local site table
dfcomplete['USGS_StationName'] = _prefer(dfcomplete['GWSI_USGS_StationName'], dfcomplete['SITE_USGS_StationName'])
dfcomplete['USGS_StationName'] = _prefer(dfcomplete['MLR_station_nm'], dfcomplete['USGS_StationName'])

# Date of construction: MLR, then GWSI, then the local site table
# (the site date must already be in its YYYYMMDD text form, see the cell above)
dfcomplete['DateOfConstruction'] = _prefer(dfcomplete['GWSI_DateOfConstruction'], dfcomplete['SITE_DateOfConstruction'])
dfcomplete['DateOfConstruction'] = _prefer(dfcomplete['MLR_construction_dt'], dfcomplete['DateOfConstruction'])

# Land surface elevation: MLR, then the local site table, then the GWSI-derived value
dfcomplete['LandSurfaceAltitude_mASL_NAVD88'] = _prefer(dfcomplete['SITE_LandSurfaceAltitude_mASL_NAVD88'], dfcomplete['LandSurfaceAltitude_mASL_NAVD88'])
dfcomplete['LandSurfaceAltitude_mASL_NAVD88'] = _prefer(dfcomplete['MLR_alt_va'], dfcomplete['LandSurfaceAltitude_mASL_NAVD88'])

# Drilling depth: MLR, then the converted site value
dfcomplete['TotalBoring/DrillingDepth_mBLS'] = _prefer(dfcomplete['MLR_hole_depth_va'], dfcomplete['TotalBoring/DrillingDepth_mBLS'])

# Well depth: MLR, then the converted GWSI value
dfcomplete['TotalWellDepth_mBLS'] = _prefer(dfcomplete['MLR_well_depth_va'], dfcomplete['TotalWellDepth_mBLS'])

# Measuring point height: Aquarius, then the converted GWSI value
dfcomplete['well_MP_height_m'] = _prefer(dfcomplete['AQ_Elevation'], dfcomplete['well_MP_height_m'])

In [820]:
# Flag, per row, whether the site appears in each site-type table
# (wells, cores, other equipment): 1 when the local site name is present, else 0.
for indicator_column, site_type_table in (
    ('WellSite', tblWells),
    ('CoreSite', tblCores),
    ('OtherEquipmentSite', tblOE),
):
    site_is_listed = dfcomplete['SITE_LocalSiteName'].isin(site_type_table['LocalSiteName'])
    dfcomplete[indicator_column] = np.where(site_is_listed, 1, 0)

In [821]:
# Rename the prefixed working columns to the names used in the data releases.
release_column_names = {
    # site table
    'SITE_OnNWT': 'OnNorthWellTransect',
    'SITE_NWTPosition_m': 'DistanceFromCenterOfNorthOilBody_m',
    'LandSurfaceAltitude_mASL_NAVD88': 'LandSurfaceElevation_mASL_NAVD88',
    'SITE_Loc/MiscCom': 'Comments_Miscellaneous',
    "SITE_LocalSiteName": "LocalSiteName",
    "SITE_StudySite": "StudySite",
    "SITE_AgencyCode": "AgencyCode",
    "SITE_USGS_siteno": "USGS_siteno",
    "SITE_XcoordUTMNAD83_m": "XcoordUTMNAD83_m",
    "SITE_YcoordUTMNAD83_m": "YcoordUTMNAD83_m",
    "SITE_Comments_DateOfConstruction": "Comments_DateOfConstruction",
    "SITE_SiteActiveStatus": "SiteActiveStatus",
    "SITE_Comments_Status": "Comments_Status",
    "SITE_ApproxRemovalDate": "ApproxRemovalDate",
    # other equipment (for the other-equipment data release)
    "OE_OtherEquipStickupLength_m": 'OtherEquip_MP_height_m',
    'SITE_NameOfContractor': 'NameOfContractor',
    'SITE_DrillerName': 'DrillerName',
    'OE_OtherEquipmentPurpose': 'OtherEquipmentPurpose',
    'OE_Comments_Equipment': 'Comments_Equipment',
    'OE_Comments': 'Comments',
    # cores
    'CORE_CoreStickupLength_m': 'Core_MP_height_m',
    'CORE_TotalCoreLengthPounded_m': 'CoreLengthPounded_m',
    'CORE_LocalSiteCoreSubName': 'LocalSiteCoreSubName',
    'CORE_CoringBegan_mBLS': 'CoringBegan_mBLS',
    'CORE_CoringEnded_mBLS': 'CoringEnded_mBLS',
    'CORE_CoreRecoveryLength_m': 'CoreRecoveryLength_m',
    'CORE_CoreBarrelType': 'CoreBarrelType',
    'CORE_SubsurfaceCoreZone': 'SubsurfaceCoreZone',
    'CORE_FreeProductOilPresentInCore': 'FreeProductOilPresentInCore',
    'CORE_DrillersFieldComments': 'DrillersFieldComments',
}
dfcomplete = dfcomplete.rename(columns=release_column_names)


                                        

In [909]:
# Create the master site list data release: one row per local site with its
# identifiers, coordinates, elevations, site-type flags, construction date and status.
# .copy() makes the selection an independent frame, so the column assignments
# below do not act on a slice of dfcomplete.
mastersitelist = dfcomplete.loc[:, ["LocalSiteName",
                                    "StudySite",
                                    "AgencyCode",
                                    "USGS_siteno",
                                    "USGS_StationName",
                                    "XcoordUTMNAD83_m",
                                    "YcoordUTMNAD83_m",
                                    "LandSurfaceElevation_mASL_NAVD88",
                                    "OnNorthWellTransect",
                                    "DistanceFromCenterOfNorthOilBody_m",
                                    "WellSite",
                                    "CoreSite",
                                    "OtherEquipmentSite",
                                    "TopOfScreenElevation_mASL_NAVD88",
                                    "BottomOfScreenElevation_mASL_NAVD88",
                                    "MidOfScreenElevation_mASL_NAVD88",
                                    "DateOfConstruction",
                                    "Comments_DateOfConstruction",
                                    "SiteActiveStatus",
                                    "Comments_Status",
                                    "ApproxRemovalDate"]].copy()

# Prefix the identifiers with 'x'
# (presumably so spreadsheet software keeps them as text -- TODO confirm).
mastersitelist['LocalSiteName'] = 'x' + mastersitelist['LocalSiteName'].astype(str)
# USGS_siteno holds NaN for sites without a number, which makes it a float
# column; converting through int keeps a trailing '.0' out of the released
# number. na_action='ignore' leaves missing numbers as NaN instead of 'xnan'.
mastersitelist['USGS_siteno'] = mastersitelist['USGS_siteno'].map(
    lambda siteno: 'x' + str(int(siteno)), na_action='ignore')

# Make NaN values uniform: everything becomes text except the originally
# missing cells, which stay NaN (np.nan -- the np.NaN alias was removed in NumPy 2.0).
null_cells = mastersitelist.isnull()
mastersitelist = mastersitelist.astype(str).mask(null_cells, np.nan)
# Delete duplicates: keep the first row of every site
mastersitelist2 = mastersitelist.drop_duplicates(subset='LocalSiteName')
mastersitelist2.shape

(1729, 21)

In [823]:
# Create the other-equipment (OE) data release.
# The local-use-code look-up is merged per release with left_on set to that
# release's own prefixed code column (OE_LocalUseCode here), so the unprefixed
# 'LocalUseCode' / 'Comments_UseCode' columns are never added to dfcomplete
# itself and cannot collide between releases.
# how='left' is required: the default inner join drops every row without a
# matching use code.
tblOE2 = pd.merge(dfcomplete, tblcd_LocalUseCode[['LocalUseCode', 'Comments_UseCode']],
                  left_on='OE_LocalUseCode', right_on='LocalUseCode', how='left')
tblOE2 = tblOE2.loc[:, ["LocalSiteName",
                        "StudySite",
                        "LocalUseCode",
                        "Comments_UseCode",
                        "USGS_siteno",
                        "USGS_StationName",
                        "XcoordUTMNAD83_m",
                        "YcoordUTMNAD83_m",
                        "OnNorthWellTransect",
                        "DistanceFromCenterOfNorthOilBody_m",
                        "LandSurfaceElevation_mASL_NAVD88",
                        "OtherEquip_MP_height_m",
                        "DateOfConstruction",
                        "Comments_DateOfConstruction",
                        "NameOfContractor",
                        "DrillerName",
                        "OtherEquipmentPurpose",
                        "Comments_Equipment",
                        "Comments",
                        "Comments_Miscellaneous"]]
# Keep only rows that matched a use code, then one row per site
tblOE2 = tblOE2.dropna(axis=0, subset=['LocalUseCode'])
tblOE2 = tblOE2.drop_duplicates(subset='LocalSiteName')

# Final touches: prefix the identifiers with 'x'.
tblOE2['LocalSiteName'] = 'x' + tblOE2['LocalSiteName'].astype(str)
# USGS_siteno is a float column when it contains NaN; converting through int
# avoids a trailing '.0', and na_action='ignore' keeps missing numbers as NaN.
tblOE2['USGS_siteno'] = tblOE2['USGS_siteno'].map(
    lambda siteno: 'x' + str(int(siteno)), na_action='ignore')
# Make NaN values uniform: text everywhere except the originally missing cells
# (np.nan -- the np.NaN alias was removed in NumPy 2.0).
null_cells = tblOE2.isnull()
tblOE2 = tblOE2.astype(str).mask(null_cells, np.nan)

In [824]:
# Create the core data release.
# One extra column that can only be computed after the columns were combined
# and renamed: top of core = land surface elevation minus depth where coring began.
dfcomplete['TopOfCoreElevation_mASL_NAVD88'] = dfcomplete['LandSurfaceElevation_mASL_NAVD88'] - dfcomplete['CoringBegan_mBLS']
# Attach the use-code description through the core table's own code column
tblCores2 = pd.merge(dfcomplete, tblcd_LocalUseCode[['LocalUseCode', 'Comments_UseCode']],
                     left_on='CORE_LocalUseCode', right_on='LocalUseCode', how='left')
# Select the release columns in their publication order
tblCores2 = tblCores2.loc[:, ["LocalSiteName",
                              "StudySite",
                              "LocalUseCode",
                              "Comments_UseCode",
                              "USGS_siteno",
                              "USGS_StationName",
                              "LocalSiteCoreSubName",
                              "XcoordUTMNAD83_m",
                              "YcoordUTMNAD83_m",
                              "OnNorthWellTransect",
                              "DistanceFromCenterOfNorthOilBody_m",
                              "LandSurfaceElevation_mASL_NAVD88",
                              "Core_MP_height_m",
                              "DateOfConstruction",
                              "Comments_DateOfConstruction",
                              "TotalBoring/DrillingDepth_mBLS",
                              "NameOfContractor",
                              "DrillerName",
                              "CoringBegan_mBLS",
                              "CoringEnded_mBLS",
                              "CoreRecoveryLength_m",
                              "TopOfCoreElevation_mASL_NAVD88",
                              "CoreLengthPounded_m",
                              "CoreBarrelType",
                              "SubsurfaceCoreZone",
                              "FreeProductOilPresentInCore",
                              "DrillersFieldComments",
                              "Comments_Miscellaneous"]]
# Drop the ROWS without a use-code description (rows that matched no core use code)
tblCores2 = tblCores2.dropna(axis=0, subset=['Comments_UseCode'])
# Format the identifier columns: prefix with 'x'.
tblCores2['LocalSiteName'] = 'x' + tblCores2['LocalSiteName'].astype(str)
# USGS_siteno is a float column when it contains NaN; converting through int
# avoids a trailing '.0', and na_action='ignore' keeps missing numbers as NaN.
tblCores2['USGS_siteno'] = tblCores2['USGS_siteno'].map(
    lambda siteno: 'x' + str(int(siteno)), na_action='ignore')
# Make NaN values uniform: text everywhere except the originally missing cells
# (np.nan -- the np.NaN alias was removed in NumPy 2.0).
null_cells = tblCores2.isnull()
tblCores2 = tblCores2.astype(str).mask(null_cells, np.nan)

In [997]:
# Attach the code look-up ("tblcd") descriptions needed for the wells release.
# Each entry: (look-up table, its [code, description] columns, code column in dfcomplete)
wells_code_lookups = (
    (tblcd_OpeningType, ["TypeOfOpenInterval", "comments_OpeningType"], 'GWSI_TypeOfOpenInterval'),
    (tblcd_LocalUseCode, ["LocalUseCode", "Comments_UseCode"], 'WELL_LocalUseCode'),
    (tblcd_CasingMaterial, ["CasingMaterial", "Comments_CasingMaterial"], 'GWSI_CasingMaterial'),
    (tblcd_ScreenMaterial, ["ScreenMaterial", "Comments_ScreenMaterial"], 'GWSI_ScreenMaterialType'),
)
tblWells3 = dfcomplete
for lookup_table, lookup_columns, code_column in wells_code_lookups:
    tblWells3 = pd.merge(tblWells3, lookup_table[lookup_columns],
                         left_on=code_column, right_on=lookup_columns[0], how='left')

# Give the description columns their release names
tblWells3 = tblWells3.rename(columns={
    "comments_OpeningType": 'OpeningTypeDescription',
    "Comments_CasingMaterial": 'CasingMaterialDescription',
    "Comments_ScreenMaterial": 'ScreenMaterialDescription',
})

In [1000]:
# Create the well construction data release from the merged tblWells3.
# NOTE: this cell overwrites tblWells3, so the look-up merge cell above has to
# be re-run before this cell is run again.
# Select the release columns in their publication order
tblWells3 = tblWells3.loc[:, ["LocalSiteName",
                              "StudySite",
                              "LocalUseCode",
                              "Comments_UseCode",
                              "USGS_siteno",
                              "USGS_StationName",
                              "XcoordUTMNAD83_m",
                              "YcoordUTMNAD83_m",
                              "OnNorthWellTransect",
                              "DistanceFromCenterOfNorthOilBody_m",
                              "MeasuringPointElevation_mASL_NAVD88",
                              "well_MP_height_m",
                              "LandSurfaceElevation_mASL_NAVD88",
                              "TopOfScreenElevation_mASL_NAVD88",
                              "BottomOfScreenElevation_mASL_NAVD88",
                              "ScreenLength_m",
                              "MidOfScreenElevation_mASL_NAVD88",
                              "TotalWellDepth_mBLS",
                              "TotalBoring/DrillingDepth_mBLS",
                              "DiameterOfDrillHole_cm",
                              "WellCasingInnerDiameter_cm",
                              "OpeningTypeDescription",
                              "WidthOfOpeningsInOpenInterval_cm",
                              "CasingMaterialDescription",
                              "ScreenMaterialDescription",
                              "DateOfConstruction",
                              "Comments_DateOfConstruction",
                              "NameOfContractor",
                              "DrillerName",
                              "Comments_Miscellaneous",
                              "SiteActiveStatus",
                              "Comments_Status",
                              "ApproxRemovalDate"]]
# Drop the ROWS without a use-code description (rows that matched no well use
# code), then remove rows that are exact duplicates across all release columns.
tblWells3 = tblWells3.dropna(axis=0, subset=['Comments_UseCode'])
tblWells3 = tblWells3.drop_duplicates()
# Format the identifier columns: prefix with 'x'.
tblWells3['LocalSiteName'] = 'x' + tblWells3['LocalSiteName'].astype(str)
# USGS_siteno is a float column when it contains NaN; converting through int
# avoids a trailing '.0', and na_action='ignore' keeps missing numbers as NaN.
tblWells3['USGS_siteno'] = tblWells3['USGS_siteno'].map(
    lambda siteno: 'x' + str(int(siteno)), na_action='ignore')
# Make NaN values uniform: text everywhere except the originally missing cells
# (np.nan -- the np.NaN alias was removed in NumPy 2.0).
null_cells = tblWells3.isnull()
tblWells3 = tblWells3.astype(str).mask(null_cells, np.nan)

# bmj_rmk data release
Organized separately, since bmj_rmk has repeating rows for every site number.

In [841]:
# Build the GWSI remarks (bmj_rmk) release: one row per remark, with the local
# site name looked up from tblSites through the USGS site number.
dfrmk2 = dfrmk.loc[:, ['GWSI_AgencyCode', 'GWSI_USGS_siteno', 'GWSI_GWSI_RMK',
                       'GWSI_GWSI_RMK_Date', 'GWSI_GWSI_RMK_SequenceNo']]

# Site number -> local site name look-up. It is built on a copy so that tblSites
# keeps its missing site numbers as NaN instead of having them overwritten with
# the 0 placeholder that is only needed to cast the merge key to int64.
site_name_lookup = tblSites[['USGS_siteno', 'LocalSiteName']].copy()
site_name_lookup['USGS_siteno'] = site_name_lookup['USGS_siteno'].fillna(0).astype('int64')
# dfrmk2's site number is already an integer
dfrmk2 = pd.merge(dfrmk2, site_name_lookup,
                  left_on="GWSI_USGS_siteno", right_on='USGS_siteno', how='left')

# Organize for the data release; .copy() so the assignments below act on an
# independent frame rather than on a slice of dfrmk2.
dfrmk3 = dfrmk2.loc[:, ['GWSI_AgencyCode',
                        'GWSI_USGS_siteno',
                        'LocalSiteName',
                        'GWSI_GWSI_RMK',
                        'GWSI_GWSI_RMK_Date',
                        'GWSI_GWSI_RMK_SequenceNo']].copy()
# Prefix the identifiers with 'x'
dfrmk3['GWSI_USGS_siteno'] = 'x' + dfrmk3['GWSI_USGS_siteno'].astype(str)
# Remarks whose site number has no match in tblSites keep a missing name
# instead of the literal text 'xnan'.
dfrmk3['LocalSiteName'] = dfrmk3['LocalSiteName'].map(
    lambda site_name: 'x' + str(site_name), na_action='ignore')

# Saving Every dataframe in a new directory

In [910]:
# Save every release table as CSV in a folder named after today's date.
date = datetime.today().strftime('%Y%m%d')
# Output folder for this run (named output_dir so the builtin dir() is not shadowed)
output_dir = "data_outputs/" + date + "_datarelease"
# Create the folder, and data_outputs/ itself if needed. exist_ok=True skips an
# already existing folder, while any other failure (permissions, unreachable
# drive) is raised here instead of being silently swallowed.
os.makedirs(output_dir, exist_ok=True)

release_tables = (
    ("/DataRelease_MasterSiteList.csv", mastersitelist2),
    ("/DataRelease_OtherEquipmentInformation.csv", tblOE2),
    ("/DataRelease_CoreInformation.csv", tblCores2),
    ("/DataRelease_WellConstructionInformation.csv", tblWells3),
    ("/DataRelease_rmk3.csv", dfrmk3),
)
for file_name, release_table in release_tables:
    release_table.to_csv(output_dir + file_name, index=False)

# extra tests

In [689]:
# Spot-check the rows labelled 0 through 5 for a handful of columns
elevation_check_columns = ['LocalSiteName', 'USGS_siteno', 'LandSurfaceElevation_mASL_NAVD88', 'MLR_alt_va']
dfcomplete.loc[0:5, elevation_check_columns]

Unnamed: 0,LocalSiteName,USGS_siteno,LandSurfaceElevation_mASL_NAVD88,MLR_alt_va
2,15-N8,,,
3,15-N9,,,
1,15-N7,,,
5,15-N11,,,
0,15-N6,,,


In [738]:
# Spot-check five randomly drawn rows for the same handful of columns
sampled_columns = ['LocalSiteName', 'USGS_siteno', 'LandSurfaceElevation_mASL_NAVD88', 'MLR_alt_va']
dfcomplete[sampled_columns].sample(5)

Unnamed: 0,LocalSiteName,USGS_siteno,LandSurfaceElevation_mASL_NAVD88,MLR_alt_va
733,1104,,432.728,
1585,8417B,473425095052003.06,432.441,432.441
1618,9027,473428095052203.06,428.095,428.095
350,L310-1.5,,433.453,
54,17.lk1,,,


In [742]:
# Look at the rows where a column has a specific (numeric) value
is_selected_site = dfcomplete['GWSI_USGS_siteno'] == 473429095051006
dfcomplete.loc[is_selected_site, ['GWSI_USGS_siteno', 'LandSurfaceElevation_mASL_NAVD88']]

Unnamed: 0,GWSI_USGS_siteno,LandSurfaceElevation_mASL_NAVD88
8,473429095051006.0,426.296


In [1029]:
# Display the remarks release table (pandas truncates the middle rows of long frames)
dfrmk3

Unnamed: 0,GWSI_AgencyCode,GWSI_USGS_siteno,LocalSiteName,GWSI_GWSI_RMK,GWSI_GWSI_RMK_Date,GWSI_GWSI_RMK_SequenceNo
0,USGS,x473429095051006,x1217E,This well was re-surveyed on 27 June 2019 by J...,20190730.000,1.000
1,USGS,x473429095051006,x1217E,Digital levels were then ran from the temporar...,20190730.000,2.000
2,USGS,x473429095051006,x1217E,This well casing is constructed of stainless s...,20190730.000,3.000
3,USGS,x473424095052912,x1602G-01,"This is port ""01"" for vadose zone vapor/gas sa...",20200206.000,1.000
4,USGS,x473424095052906,x707G-01,"This is port ""01"" for vadose zone vapor/gas sa...",20200206.000,1.000
...,...,...,...,...,...,...
628,USGS,x473419095052503,xWG,This is the location of a wetland staff gage f...,20190730.000,1.000
629,USGS,x473419095052304,x1509G-01,"This is port ""01"" for vadose zone vapor/gas sa...",20200206.000,1.000
630,USGS,x473425095051601,x1707,This is a monitoring well used for the Bemidji...,20170726.000,1.000
631,USGS,x473423095051501,x1708,This monitoring well used for the Bemidji Toxi...,20170726.000,1.000


In [1032]:
# Same kind of look-up, this time matching on a text value
is_site_15_n6 = dfcomplete['LocalSiteName'] == '15-N6'
dfcomplete.loc[is_site_15_n6, ['GWSI_USGS_siteno', 'LocalSiteName', 'LandSurfaceElevation_mASL_NAVD88']]

Unnamed: 0,GWSI_USGS_siteno,LocalSiteName,LandSurfaceElevation_mASL_NAVD88
0,,15-N6,434.049


In [1035]:
# Look-up with two conditions: build each boolean mask separately, then
# combine them with & (when written inline, each comparison needs its own
# round brackets).
matches_site = dfrmk3['GWSI_USGS_siteno'] == 'x473429095051006'
is_first_remark = dfrmk3['GWSI_GWSI_RMK_SequenceNo'] == 1
dfrmk3.loc[matches_site & is_first_remark, :]

Unnamed: 0,GWSI_AgencyCode,GWSI_USGS_siteno,LocalSiteName,GWSI_GWSI_RMK,GWSI_GWSI_RMK_Date,GWSI_GWSI_RMK_SequenceNo
0,USGS,x473429095051006,x1217E,This well was re-surveyed on 27 June 2019 by J...,20190730.0,1.0


In [815]:
# Show the dfcomplete column names in alphabetical order
alphabetical_columns = sorted(dfcomplete.columns)
dfcomplete.reindex(alphabetical_columns, axis=1).keys()

Index(['AQ_AppliedByUser', 'AQ_AppliedTime', 'AQ_Comment',
       'AQ_DecommissionedDate', 'AQ_DecommissionedReason', 'AQ_Description',
       'AQ_Elevation', 'AQ_IsMeasuredAgainstLocalAssumedDatum',
       'AQ_MeasurementDirection', 'AQ_Method',
       ...
       'WELL_WellheadCasingLength', 'WELL_WellheadCasingProtection',
       'WELL_WellheadCompletion', 'WELL_WidthOfOpeningsInOpenInterval_inches',
       'WellCasingInnerDiameter_cm', 'WellSite',
       'WidthOfOpeningsInOpenInterval_cm', 'XcoordUTMNAD83_m',
       'YcoordUTMNAD83_m', 'well_MP_height_m'],
      dtype='object', length=380)

In [739]:
# Count the distinct values in a column (missing values count as one value)
dfcomplete['LandSurfaceElevation_mASL_NAVD88'].nunique(dropna=False)

1158

In [740]:
# Shape of the de-duplicated master site list as (rows, columns)
mastersitelist2.shape

(1729, 21)

# Comparing the old outputs with my updated outputs and these newest outputs

In [846]:
# os.listdir() returns the entry names of the working directory as a list
print(os.listdir())
# os.scandir() returns an iterator of directory entries. Printing the iterator
# itself only shows an opaque object, so iterate it instead; the with-block
# closes it afterwards (an unclosed scandir iterator raises a ResourceWarning).
with os.scandir() as directory_entries:
    print([entry.name for entry in directory_entries])

['.ipynb_checkpoints', '01_inputfiles_prep-Copy1.ipynb', '01_inputfiles_prep.ipynb', '02_data_convergence-Copy1.ipynb', '02_data_convergence-Copy2-20221222.ipynb', '02_data_convergence-Copy3-20221228.ipynb', '02_data_convergence.ipynb', '02_data_convergenceCopy4-20230102.ipynb', 'ColumnMapping', 'config', 'datacheck', 'data_inputs', 'data_outputs', 'description', 'HEAD', 'hooks', 'info', 'objects', 'refs', 'Untitled-1.ipynb', 'Untitled-1_20230102.ipynb']
<built-in function scandir>


In [911]:
def _load_release_csvs(folder, expected_count, **read_csv_kwargs):
    """Read every CSV file in ``folder`` in case-insensitive alphabetical order.

    os.listdir() returns names in arbitrary order and also lists non-CSV files,
    so picking files by their position in that raw list can silently load the
    wrong table. Here the names are filtered to ``.csv`` and sorted without
    regard to case (core, mastersitelist, otherequipment, rmk3, wellconstruction),
    so the positions are stable.

    Parameters
    ----------
    folder : str
        Directory holding the release files (with trailing slash or without).
    expected_count : int
        Number of CSV files the folder must contain.
    **read_csv_kwargs
        Passed through to ``pd.read_csv`` (for example ``encoding``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in sorted file-name order.

    Raises
    ------
    ValueError
        If the folder does not contain exactly ``expected_count`` CSV files.
    """
    csv_names = sorted(
        (name for name in os.listdir(folder) if name.lower().endswith('.csv')),
        key=str.lower,
    )
    if len(csv_names) != expected_count:
        raise ValueError(
            "Expected %d CSV files in %r but found %d: %s"
            % (expected_count, folder, len(csv_names), csv_names)
        )
    return [pd.read_csv(os.path.join(folder, name), **read_csv_kwargs) for name in csv_names]


# Old output files from R: core information, master site list, other equipment,
# well construction
oldc, oldm, oldo, oldw = _load_release_csvs(
    r'data_outputs/old_fromR/', 4, encoding='Windows-1252')
# Updated output files where the R code was translated to Python (includes rmk3)
midc, midm, mido, midr, midw = _load_release_csvs(
    r'data_outputs/20230110_datarelease/', 5)
# Newest (1/19/2023) output files from Python where everything was reorganized
newc, newm, newo, newr, neww = _load_release_csvs(
    r'data_outputs/20230119_datarelease/', 5)

In [930]:
# Compare the (rows, cols) shape of every release across the three generations.
# Column counts should generally match. Row counts may have grown slightly from
# old to mid (mid is slightly updated), and new may be slightly smaller than
# mid because some duplicate rows were deleted.
shape_report = (
    ("Core Info:", (oldc.shape, midc.shape, newc.shape)),
    ("MasterSiteList Info:", (oldm.shape, midm.shape, newm.shape)),
    ("OtherEquipment Info:", (oldo.shape, mido.shape, newo.shape)),
    ("Well Con Info:", (oldw.shape, midw.shape, neww.shape)),
    ("RMK Info:", ('nan', midr.shape, newr.shape)),  # no RMK file exists among the old R outputs
)
for release_label, generation_shapes in shape_report:
    print(release_label, *generation_shapes)

Core Info: (1408, 30) (1470, 28) (1469, 28)
MasterSiteList Info: (1701, 21) (1731, 21) (1729, 21)
OtherEquipment Info: (434, 20) (433, 20) (423, 20)
Well Con Info: (605, 33) (615, 33) (596, 33)
RMK Info: nan (633, 6) (633, 6)


In [918]:
# The same shape comparison, arranged as a table: one row per release, one
# column per generation of the output files.
row_names = ["Core Info:", "MasterSiteList Info:", "OtherEquipment Info:", "Well Con Info:", "RMK Info:"]
col_names = ["Old R dataframe", 'Translated to Py', 'Reorganized in Py']
data = [[oldc.shape, midc.shape, newc.shape], [oldm.shape, midm.shape, newm.shape],
        [oldo.shape, mido.shape, newo.shape], [oldw.shape, midw.shape, neww.shape],
        ['nan', midr.shape, newr.shape]]
# Pass the labels as a flat list: wrapping them in another list
# (df.index = [row_names]) creates a one-level MultiIndex instead of a plain Index.
df = pd.DataFrame(data, columns=col_names, index=row_names)
df

Unnamed: 0,Old R dataframe,Translated to Py,Reorganized in Py
Core Info:,"(1408, 30)","(1470, 28)","(1469, 28)"
MasterSiteList Info:,"(1701, 21)","(1731, 21)","(1729, 21)"
OtherEquipment Info:,"(434, 20)","(433, 20)","(423, 20)"
Well Con Info:,"(605, 33)","(615, 33)","(596, 33)"
RMK Info:,,"(633, 6)","(633, 6)"


I notice that core info lost some columns

In [933]:
# Column labels of the old and the new core-information frames, as sets
colsOLD = set(oldc.columns)
colsNEW = set(newc.columns)

# Columns present in the old frame but absent from the new one
missing_in_NEW = colsOLD - colsNEW
print("Cols missing the new dataframe:", missing_in_NEW)

# Columns present in the new frame but absent from the old one
missing_in_OLD = colsNEW - colsOLD
print("Cols missing from the old dataframe:", missing_in_OLD)



Cols missing the new dataframe: {'Unnamed: 28', 'Unnamed: 29', 'TotalBoring_DrillingDepth_mBLS'}
Cols missing from the old dataframe: {'TotalBoring/DrillingDepth_mBLS'}


In [925]:
# The symmetric difference lists the columns found in only one of the two
# frames; the result is the same whichever set comes first.
missing_cols = colsOLD ^ colsNEW
print(missing_cols)
missing_cols = colsNEW ^ colsOLD
print(missing_cols)

{'Unnamed: 29', 'TotalBoring/DrillingDepth_mBLS', 'TotalBoring_DrillingDepth_mBLS', 'Unnamed: 28'}
{'Unnamed: 29', 'TotalBoring/DrillingDepth_mBLS', 'TotalBoring_DrillingDepth_mBLS', 'Unnamed: 28'}


It turns out there are just invisible columns in the old dataframe called "Unnamed: ##". Perhaps this has to do with the weirdness of the encoding type. They can be ignored: apart from the renamed `TotalBoring_DrillingDepth_mBLS` / `TotalBoring/DrillingDepth_mBLS` column, both Coring dataframes actually have the same 28 columns.

### All columns match up...what about data?

In [950]:
# Show the datatype of the site-name column in the old and the new master site list
for site_list in (oldm, newm):
    print(site_list['LocalSiteName'].dtypes)
# The old file has no 'x' prefix on the site names, so add it to make the
# columns comparable.
# NOTE: running this cell twice on the same oldm adds the prefix twice.
oldm['LocalSiteName'] = oldm['LocalSiteName'].astype(str).radd('x')

object
object


In [953]:
# Show the rows labelled 0 through 2 of the old and the new master site list
for site_list in (oldm, newm):
    display(site_list.loc[0:2, :])

Unnamed: 0,LocalSiteName,StudySite,AgencyCode,USGS_siteno,USGS_StationName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,LandSurfaceElevation_mASL_NAVD88,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,...,CoreSite,OtherEquipmentSite,TopOfScreenElevation_mASL_NAVD88,BottomOfScreenElevation_mASL_NAVD88,MidOfScreenElevation_mASL_NAVD88,DateOfConstruction,Comments_DateOfConstruction,SiteActiveStatus,Comments_Status,ApproxRemovalDate
0,x15-N6,Bemidji,USGS,xnan,,342541.28,5271062.29,434.049,,,...,1,0,,,,,,Active,,
1,x15-N7,Bemidji,USGS,xnan,,342563.44,5271111.22,434.401,,,...,1,0,,,,,,Active,,
2,x15-N8,Bemidji,USGS,xnan,,342557.79,5271104.47,434.263,,,...,1,0,,,,,,Active,,


Unnamed: 0,LocalSiteName,StudySite,AgencyCode,USGS_siteno,USGS_StationName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,LandSurfaceElevation_mASL_NAVD88,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,...,CoreSite,OtherEquipmentSite,TopOfScreenElevation_mASL_NAVD88,BottomOfScreenElevation_mASL_NAVD88,MidOfScreenElevation_mASL_NAVD88,DateOfConstruction,Comments_DateOfConstruction,SiteActiveStatus,Comments_Status,ApproxRemovalDate
0,x15-N6,Bemidji,USGS,,,342541.28,5271062.29,434.049,,,...,1,0,,,,,,Active,,
1,x15-N7,Bemidji,USGS,,,342563.44,5271111.22,434.401,,,...,1,0,,,,,,Active,,
2,x15-N8,Bemidji,USGS,,,342557.79,5271104.47,434.263,,,...,1,0,,,,,,Active,,


In [963]:
# Boolean mask: True where an old LocalSiteName also occurs in the new dataframe
check = oldm['LocalSiteName'].isin(newm['LocalSiteName'])
# Tally the matched (True) and unmatched (False) names
true_count = check.sum()
false_count = (~check).sum()
print("Number of True values:", true_count)
print("Number of False values:", false_count)

Number of True values: 1591
Number of False values: 110


Lesson on boolean indexing: a boolean (True/False) mask can be used to select rows of a dataframe. Typically the mask is used to return the rows where it is True:

> true_rows=oldm.loc[check]

But it can also return false values by using the '~' operator which negates the mask 

> false_rows=oldm.loc[~check]

Another way to do it is

> false_rows=oldm.loc[check == False]

In [978]:
# Keep only the old rows whose LocalSiteName was not found in the new dataframe ('~' negates the mask)
false_rows=oldm.loc[~check]
# Look at 10 randomly chosen unmatched rows (no random_state is set, so the sample changes on every run)
false_rows.sample(10)

Unnamed: 0,LocalSiteName,StudySite,AgencyCode,USGS_siteno,USGS_StationName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,LandSurfaceElevation_mASL_NAVD88,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,...,CoreSite,OtherEquipmentSite,TopOfScreenElevation_mASL_NAVD88,BottomOfScreenElevation_mASL_NAVD88,MidOfScreenElevation_mASL_NAVD88,DateOfConstruction,Comments_DateOfConstruction,SiteActiveStatus,Comments_Status,ApproxRemovalDate
379,x330,Bemidji,USGS,xnan,,342861.94,5271069.47,433.207,1.0,81.47,...,1,0,,,,20030726.0,,Active,,
357,x220,Bemidji,USGS,xnan,,342848.75,5271063.08,433.0,1.0,66.82,...,1,0,,,,20020723.0,,Active,,
347,x206,Bemidji,USGS,xnan,,342782.83,5271033.43,430.295,1.0,-5.46,...,1,0,,,,20020723.0,,Active,,
370,x321,Bemidji,USGS,xnan,,342774.16,5271037.29,430.163,1.0,-11.82,...,1,0,,,,20030723.0,,Active,,
297,x8,Bemidji,USGS,xnan,,342830.12,5271052.74,432.792,1.0,45.6,...,1,0,,,,20000801.0,,Active,,
516,x706,Bemidji,USGS,xnan,,342862.16,5271068.63,433.179,1.0,81.33,...,1,0,,,,20070716.0,,Active,,
378,x329,Bemidji,USGS,xnan,,342859.45,5271068.17,433.146,1.0,78.66,...,1,0,,,,20030726.0,,Active,,
503,x533,Bemidji,USGS,xnan,,342829.09,5271052.05,432.776,1.0,44.38,...,1,0,,,,20050924.0,,Active,,
309,x25,Bemidji,USGS,xnan,,342773.57,5271036.61,430.263,1.0,-12.64,...,1,0,,,,20000804.0,,Active,,
517,x707,Bemidji,USGS,xnan,,342872.33,5271074.71,433.268,1.0,93.09,...,1,0,,,,20070716.0,,Active,,


In [None]:
# Show the rows labelled 0 through 2 (label slices with .loc include both endpoints) of the old and new dataframes
display(oldm.loc[0:2, :])
display(newm.loc[0:2, :])

Why are there over 100 False values for LocalSiteNames? There seem to be small, ignorable problems, like the error in the old dataframe where 0007 was converted to 7, or "Telephone box" versus "telephone bo".

## I can repeat this process using any dataframe or column

In [1015]:
# manually do any processing on the old dfs to match the new dfs format
# here: cast the old names to str and prepend 'x'
# NOTE: this is not idempotent - every re-run of this cell prepends another 'x' to oldw['LocalSiteName']
oldw['LocalSiteName'] = 'x' + oldw['LocalSiteName'].astype(str)

In [1022]:
# inputs: change these three names to compare any pair of dataframes on any column
old_df = oldo  # values are taken from this dataframe ...
new_df = newo  # ... and looked up in this one
col_name = 'LocalSiteName'

# True where the old value also occurs somewhere in the new column
check = old_df[col_name].isin(new_df[col_name])
true_count = check.sum()
false_count = (~check).sum()
print("Number of True values:", true_count)
print("Number of False values:", false_count)

# old rows with no counterpart in the new dataframe, shown with their comments
false_rows = old_df[~check]
display(false_rows[['LocalSiteName', 'Comments']])

Number of True values: 419
Number of False values: 15


Unnamed: 0,LocalSiteName,Comments
19,xB-pipeline,record added to database to accommodate pipeli...
76,xTelephone bo 3,
77,xPower bo 6,
78,xPower bo 6 meter,
79,xPower bo 7,
80,xTelephone bo 4,
81,xTelephone bo 5,
102,xPower bo 1,
103,xPower bo 2,
104,xPower bo 3,


In [None]:
# how to display the whole text in the column
# option_context widens the column display only inside the 'with' block and then
# restores the previous setting, even if display() raises, so a later cell that
# prints many columns is never left with the wide setting
with pd.option_context('display.max_colwidth', 100):
    display(false_rows[['LocalSiteName', 'Comments']])

In [993]:
#checked rows where mismatches occur
# old_df is searched for the name without the 'x' prefix, new_df for the name with it
# NOTE(review): if the 'x' prefix has already been added to old_df['LocalSiteName'], the first lookup returns no rows - confirm cell order

display(old_df.loc[old_df['LocalSiteName'] == '17.lk.sc5', :])
display(new_df.loc[new_df['LocalSiteName'] == 'x17.lk.sc5', :])

Unnamed: 0,LocalSiteName,StudySite,LocalUseCode,Comments_UseCode,USGS_siteno,USGS_StationName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,...,CasingMaterialDescription,ScreenMaterialDescription,DateOfConstruction,Comments_DateOfConstruction,NameOfContractor,DrillerName,Comments_Miscellaneous,SiteActiveStatus,Comments_Status,ApproxRemovalDate
19,17.lk.sc5,Bemidji,,,,,343127.64,5271096.41,,,...,,,,,USGS MN Water Science Center,Not applicable,,Removed,temporary sampling location,


Unnamed: 0,LocalSiteName,StudySite,LocalUseCode,Comments_UseCode,USGS_siteno,USGS_StationName,XcoordUTMNAD83_m,YcoordUTMNAD83_m,OnNorthWellTransect,DistanceFromCenterOfNorthOilBody_m,...,CasingMaterialDescription,ScreenMaterialDescription,DateOfConstruction,Comments_DateOfConstruction,NameOfContractor,DrillerName,Comments_Miscellaneous,SiteActiveStatus,Comments_Status,ApproxRemovalDate


All values that are in **'XcoordUTMNAD83_m'** in the old df also appear in the new df for mastersitelist, core, and otherequipment but not for Well Construction. Are rows with '17.lk.sc5' really needed? 

## Convert to UTF-8 example. 
The encoding "Windows-1252" is a single-byte encoding, so it can only represent a limited set of characters and cannot store text in many languages (for example Chinese). It is good practice in general to convert files to UTF-8, which can represent the characters of virtually all languages.

In [883]:
# guess the text encoding of the old core-information CSV from its raw bytes
import chardet
with open(r'data_outputs/old_fromR/DataRelease01_CoreInformation.csv', 'rb') as f:
    raw_bytes = f.read()  # the whole file is read into memory
result = chardet.detect(raw_bytes)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [880]:

# Save the old core dataframe as a UTF-8 encoded CSV (row index not written); path is relative to the current working directory
oldc.to_csv(r'data_outputs/old_fromR/Test_DataRelease01_CoreInformation_UTF8.csv', index=False, encoding='UTF-8')


In [881]:
# Read the UTF-8 copy back in to confirm that it loads.
# The file name must match the 'Test_...' file written by the to_csv cell above
# (the original read a file name without the 'Test_' prefix).
test = pd.read_csv(r'data_outputs/old_fromR/Test_DataRelease01_CoreInformation_UTF8.csv')

# TRYING TO AUTOMATE 

In [None]:
# Spot-check the site-number and site-name columns from each source on 10 random rows
# NOTE(review): in the visible part of the notebook dfcomplete is first built in a later cell, and that cell does not obviously create the MLR_/AQ_/CORE_/WELL_/OE_ prefixed columns selected here - confirm which version of dfcomplete this expects
dfcomplete[['SITE_USGS_siteno', 'GWSI_USGS_siteno', 'MLR_site_no', 'AQ_site', 'SITE_LocalSiteName','CORE_LocalSiteName','WELL_LocalSiteName','OE_LocalSiteName']].sample(10)

In [None]:
# colmap columns that flag (with a 1) whether a mapping row belongs to each data release
release_flag_cols = ['mastersitelist_data_release', 'oe_info_data_release', 'core_data_release', 'wellcon_data_release']
# colmap columns holding the field name used by each source, in the order the names are collected
source_name_cols = ['AccessDB_name_while_merging', 'MLR_field_name', 'Aquarius_field_name', 'Local_field_name']

# walk the mapping rows in parallel: one tuple of flags and one tuple of names per row
flag_rows = zip(*(colmap[flag_col] for flag_col in release_flag_cols))
name_rows = zip(*(colmap[name_col] for name_col in source_name_cols))

# keep the names of every row that is flagged for at least one data release,
# dropping the "nan" placeholders left by empty cells
list_of_cols_in_data_release = [
    field_name
    for flags, names in zip(flag_rows, name_rows)
    if any(flag == 1 for flag in flags)
    for field_name in names
    if str(field_name) != 'nan'
]
print(list_of_cols_in_data_release)

In [None]:
# Build dfcomplete by left-joining every data source onto the site table.
# The shape is printed after each merge so an unexpected change in row count is easy to spot.

# merge tblSites and dfcomplete: match up datatypes of the columns that the dfs will be merged on
# work on a copy so the fillna/astype below does not overwrite USGS_siteno in the original tblSites
tblSites2 = tblSites.copy()
# missing site numbers become 0 so the column can be cast to int64 for the merge
tblSites2['USGS_siteno'] = tblSites2['USGS_siteno'].fillna(0).astype('int64')
dfcomplete = pd.merge(tblSites2.add_prefix('SITE_'), dfbmj3, left_on='SITE_USGS_siteno', right_on='GWSI_USGS_siteno', how='left')
# turn zeros back into NaN
# (assign the result back: an inplace replace on a selected column is not guaranteed to update dfcomplete)
dfcomplete['SITE_USGS_siteno'] = dfcomplete['SITE_USGS_siteno'].replace(0, np.nan)
print(dfcomplete.shape)
# NOTE(review): every tblSites column now carries the 'SITE_' prefix, so the merges below on
# 'LocalSiteName' and 'USGS_siteno' rely on dfbmj3 supplying columns with those names - TODO confirm
# Merge tbl wells
dfcomplete = pd.merge(dfcomplete, tblWells, on='LocalSiteName', how='left')
print(dfcomplete.shape)
# Merge tblOE
dfcomplete = pd.merge(dfcomplete, tblOE, on='LocalSiteName', how='left')
print(dfcomplete.shape)
# columns present on both sides of a merge get the default '_x'/'_y' suffixes
print(dfcomplete['TotalBoring/DrillingDepth_ftBLS_y'])
# Merge tblCores
dfcomplete = pd.merge(dfcomplete, tblCores, on='LocalSiteName', how='left')
print(dfcomplete.shape)
# Merge dfmlr
dfcomplete = pd.merge(dfcomplete, dfmlr, left_on='USGS_siteno', right_on='site_no', how='left')
print(dfcomplete.shape)
# Merge dfaq
dfcomplete = pd.merge(dfcomplete, dfaq, left_on='USGS_siteno', right_on='site', how='left')
print(dfcomplete.shape)
# Local Use Codes
# no 'on' is given here, so pandas joins on the column names the two frames have in common
dfcomplete = pd.merge(dfcomplete, tblcd_LocalUseCode[['LocalUseCode','Comments_UseCode']], how='left') #if you leave out the "how" (like in the original) then # of rows plummets
# attach the description text for the opening type, casing material and screen material codes
dfcomplete = pd.merge(dfcomplete, tblcd_OpeningType[["TypeOfOpenInterval", "comments_OpeningType"]], left_on='GWSI_TypeOfOpenInterval', right_on='TypeOfOpenInterval', how='left')
dfcomplete = pd.merge(dfcomplete, tblcd_CasingMaterial[["CasingMaterial", "Comments_CasingMaterial"]], left_on='GWSI_CasingMaterial', right_on='CasingMaterial', how='left')
dfcomplete = pd.merge(dfcomplete, tblcd_ScreenMaterial[["ScreenMaterial", "Comments_ScreenMaterial"]], left_on='GWSI_ScreenMaterialType', right_on='ScreenMaterial', how='left')
print(dfcomplete.shape)


# Creating a list of desired columns from all data sources

In [None]:
# Load the 'NWIS_ColNameMapping' sheet of the column-mapping workbook; path is relative to the current working directory
colmap = pd.read_excel(r'ColumnMapping/20221228_ColMapping.xlsx', sheet_name='NWIS_ColNameMapping')

In [None]:
# colmap columns that flag (with a 1) whether a mapping row belongs to each data release
release_flag_cols = ['mastersitelist_data_release', 'oe_info_data_release', 'core_data_release', 'wellcon_data_release']
# colmap columns holding the field name used by each source, in the order the names are collected
source_name_cols = ['AccessDB_name_while_merging', 'MLR_field_name', 'Aquarius_field_name', 'Local_field_name']

# walk the mapping rows in parallel: one tuple of flags and one tuple of names per row
flag_rows = zip(*(colmap[flag_col] for flag_col in release_flag_cols))
name_rows = zip(*(colmap[name_col] for name_col in source_name_cols))

# keep the names of every row that is flagged for at least one data release,
# dropping the "nan" placeholders left by empty cells
list_of_cols_in_data_release = [
    field_name
    for flags, names in zip(flag_rows, name_rows)
    if any(flag == 1 for flag in flags)
    for field_name in names
    if str(field_name) != 'nan'
]
print(list_of_cols_in_data_release)

In [None]:
# Keep only the columns named in list_of_cols_in_data_release; .loc raises KeyError if any listed name is not a column of dfcomplete
dfcomplete2 = dfcomplete.loc[:, list_of_cols_in_data_release]



In [None]:
# calculate additional columns for GWSI
# GWSI values are stored in feet / inches; the columns without the GWSI_ prefix are the metric versions
FT_TO_M = 0.3048  # feet -> metres
IN_TO_CM = 2.54   # inches -> centimetres

# measuring point elevation = land surface altitude + measuring point height
dfcomplete['GWSI_MeasuringPointElevation_ftASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] + dfcomplete['GWSI_MP_height_ft']
dfcomplete['MeasuringPointElevation_mASL_NAVD88'] = dfcomplete['GWSI_MeasuringPointElevation_ftASL_NAVD88'].mul(FT_TO_M).round(3) #renamed for Wells

dfcomplete['GWSI_MeasuringPointHeight_m'] = dfcomplete['GWSI_MP_height_ft'].mul(FT_TO_M).round(3)

dfcomplete['LandSurfaceAltitude_mASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'].mul(FT_TO_M).round(3) #renamed for Wells

# screen elevations = land surface altitude - depth below land surface
dfcomplete['GWSI_TopOfScreenElevation_ftASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - dfcomplete['GWSI_TopOfScreenDepth_ftBLS']
dfcomplete['TopOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_TopOfScreenElevation_ftASL_NAVD88'].mul(FT_TO_M).round(3)

dfcomplete['GWSI_BottomOfScreenElevation_ftASL_NAVD88'] = dfcomplete['GWSI_LandSurfaceAltitude_ftASL_NAVD88'] - dfcomplete['GWSI_BottomOfScreenDepth_ftBLS']
dfcomplete['BottomOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_BottomOfScreenElevation_ftASL_NAVD88'].mul(FT_TO_M).round(3)

# mid-screen elevation = average of the top and bottom screen elevations
# (the unrounded helper column now holds the midpoint itself instead of the top + bottom sum;
#  the rounded MidOfScreenElevation_mASL_NAVD88 result is unchanged)
dfcomplete['GWSI_MidOfScreenElevation_mASL_NAVD88'] = (dfcomplete['TopOfScreenElevation_mASL_NAVD88'] + dfcomplete['BottomOfScreenElevation_mASL_NAVD88']).div(2)
dfcomplete['MidOfScreenElevation_mASL_NAVD88'] = dfcomplete['GWSI_MidOfScreenElevation_mASL_NAVD88'].round(3)

dfcomplete['TotalWellDepth_mBLS'] = dfcomplete['GWSI_TotalWellDepth_ftBLS'].mul(FT_TO_M).round(3) #renamed for Wells

dfcomplete['DiameterOfDrillHole_cm'] = dfcomplete['GWSI_DiameterOfDrillHole_inches'].mul(IN_TO_CM).round(1) #renamed for Wells

dfcomplete['WellCasingInnerDiameter_cm'] = dfcomplete['GWSI_WellCasingInnerDiameter_inches'].mul(IN_TO_CM).round(1) #renamed for Wells

dfcomplete['WidthOfOpeningsInOpenInterval_cm'] = dfcomplete['GWSI_WidthOfOpeningsInOpenInterval_inches'].mul(IN_TO_CM).round(3) #renamed for Wells

# screen length = bottom-of-screen depth - top-of-screen depth
dfcomplete['GWSI_ScreenLength_ft'] = dfcomplete['GWSI_BottomOfScreenDepth_ftBLS'] - dfcomplete['GWSI_TopOfScreenDepth_ftBLS']
dfcomplete['ScreenLength_m'] = dfcomplete['GWSI_ScreenLength_ft'].mul(FT_TO_M).round(3) #renamed for Wells

# same calculation as GWSI_MeasuringPointHeight_m above, stored under the name used for Wells
dfcomplete['well_MP_height_m'] = dfcomplete['GWSI_MP_height_ft'].mul(FT_TO_M).round(3) #renamed for Wells

In [None]:
# add one 1/0 indicator column per site-type table (wells, cores, other equipment):
# 1 when the site's LocalSiteName appears in that table, 0 otherwise
site_type_tables = {'WellSite': tblWells, 'CoreSite': tblCores, 'OtherEquipmentSite': tblOE}
for indicator_col, site_table in site_type_tables.items():
    is_listed = tblSites2['LocalSiteName'].isin(site_table['LocalSiteName'])
    tblSites2[indicator_col] = np.where(is_listed, 1, 0)

In [None]:
# list the dfcomplete columns whose name ends in "_y"
# (pandas adds "_x"/"_y" when both sides of a merge have a column with the same name)
cols = [name for name in dfcomplete.columns if name.endswith("_y")]
print(cols)

In [None]:
# Show the "_x" and "_y" versions of two duplicated column names next to the site number for 10 random rows
dfcomplete[['USGS_siteno', 'DrillersFieldComments_x', 'DrillersFieldComments_y','TotalBoring/DrillingDepth_ftBLS_x','TotalBoring/DrillingDepth_ftBLS_y']].sample(10)

In [None]:
# list the dfcomplete columns whose name ends in "_x"
# (pandas adds "_x"/"_y" when both sides of a merge have a column with the same name)
cols = [name for name in dfcomplete.columns if name.endswith("_x")]
print(cols)

In [None]:
# Write every dfcomplete row for site '518G-06' to a scratch CSV in the current working directory for manual inspection
dfcomplete.loc[dfcomplete['LocalSiteName'] == '518G-06'].to_csv('deletemeplease.csv')

In [None]:
# Count how many rows each LocalSiteName has in tblCores (a count above 1 means the site is listed more than once)
tblCores['LocalSiteName'].value_counts()


In [None]:
# Count how many rows each LocalSiteName has in dfcomplete (a count above 1 means the site is listed more than once)
dfcomplete['LocalSiteName'].value_counts()

In [None]:
# mark every row whose LocalSiteName already appeared in an earlier row
repeated_site_rows = dfcomplete.duplicated(subset=['LocalSiteName'])
# True when at least one LocalSiteName occurs more than once
boolean = repeated_site_rows.any()
print(boolean)

In [None]:
# Compare the "_x" and "_y" versions of Wellcon_SiteRecordID on 5 random rows
dfcomplete[['Wellcon_SiteRecordID_x','Wellcon_SiteRecordID_y']].sample(5)

In [None]:
# Display the LocalSiteName column of dfcomplete (attribute-style column access)
dfcomplete.LocalSiteName