# Import libraries and check the directory

In [4]:
import os # #for working with directories
import pandas as pd #for creating dataframe
import numpy as np #for replacing NaN values in a dataframe
from datetime import datetime #for getting today's date
import pyodbc #working with ODBC databases
import sqlalchemy_access #for reading back into a database

In [3]:
# check your current working directory   
cwd = os.getcwd()
print("Current working directory: {0}".format(cwd))

Current working directory: p:\0083\analysis\DataCompilation\DataCompilationPy\create_site_info_files


In [4]:
#Check to make sure the path exists in current working directory for future use
os.path.exists("../local_access_db/BemidjiMasterSiteData_be.accdb") # ".." means "go back one directory"

True

In [None]:
# If you want every column displayed for the output, delete the pound signs from the code below and run it. Afterwards rerun any dataframe

## pd.set_option('display.max_columns', 0)
## df_bmj3.head()

# Import bmj3

## Meta data

In [5]:
# bmj files are obtained from GWSI. Load the meta into a dataframe
f = r'data_inputs/gwsi_old/bmj3.subf.list_meta.csv'
df_meta = pd.read_csv(f) 
df_meta.head(5)

Unnamed: 0,FIELD,CODE,DESCRIPTION,LOC,LEN,DB_name
0,1,C004,Source agency code,1,5,GWSI_AgencyCode
1,2,C001,Site ID (station number),6,15,GWSI_USGS_siteno
2,3,C802,Site type code,21,7,GWSI_GWSISiteType
3,4,C023,Primary use of site,28,1,GWSI_GWSIUseOfSite
4,5,C713,Aquifer type code,29,1,GWSI_GWSIAquiferType


## Import .subf file and save

In [6]:
# Load GWSI data into a dataframe and rename it
f = r'data_inputs/gwsi_old/bmj3.subf'
#df_bmj3 = pd.read_fwf (f, widths = df_meta.LEN , header=None, names=list(df_meta.DB_name.str.replace('GWSI', 'NWIS'))) # this says (file name, widths are based on a column in df_meta, there's no initial header/col names, the headers/col names should be based on a column in the metadata with an edit)
df_bmj3 = pd.read_fwf (f, widths = df_meta.LEN , header=None, names=list(df_meta.DB_name)) # this says (file name, widths are based on a column in df_meta, there's no initial header/col names, the headers/col names should be based on a column in the metadata)

# Update the Land Surface Altitude column (m) to be rounded to 3 decimal places
df_bmj3['GWSI_LandSurfaceAltitude_mASL_NAVD88'] = df_bmj3['GWSI_LandSurfaceAltitude_ftASL_NAVD88'].mul(0.3048).round(3)

In [7]:
df_bmj3.head(2)

Unnamed: 0,GWSI_AgencyCode,GWSI_USGS_siteno,GWSI_GWSISiteType,GWSI_GWSIUseOfSite,GWSI_GWSIAquiferType,GWSI_GWSIPrimaryAquifer,GWSI_GWSINationalAquifer,GWSI_TotalWellDepth_ftBLS,GWSI_WellCasingInnerDiameter_inches,GWSI_CasingMaterial,...,GWSI_MP_SequenceNo,GWSI_CONS_RecordType,GWSI_MP_LastUpdate,GWSI_MP_WebReady,GWSI_MP_height_m,GWSI_MP_Altitude_m,GWSI_CONS_SequenceNo,GWSI_HOLE_SequenceNo,GWSI_CSNG_SequenceNo,GWSI_OPEN_SequenceNo
0,USGS,473429095051006,GW,O,U,112GSSG,N100GLCIAL,19.07,0.78,V,...,1.0,MPNT,20181020000000.0,Y,0.34,,1.0,1.0,1.0,1.0
1,USGS,473424095052912,SB-UZ,O,,,,18.09,0.13,V,...,1.0,MPNT,20200210000000.0,Y,0.0,,1.0,1.0,1.0,1.0


In [9]:
# save the output as a file
# Note: Make sure to have the excel file closed or else you CANNOT overwrite the save file
df_bmj3.to_csv("data_inputs/gwsi_old/bmj3_fromPy.csv", index=False) #if I want to get rid of number on the side, use index=False

# Import bmj.mpnt 
Repeat the same general process but with different files

In [10]:
f = r'data_inputs/gwsi_old/bmj.mpnt.subf.list_meta.csv'
df_meta = pd.read_csv(f) 

f = r'data_inputs/gwsi_old/bmj.mpnt.subf'
df_mpnt = pd.read_fwf (f, widths = df_meta.LEN , header=None, names=list(df_meta.DB_name))
df_mpnt.to_csv("data_inputs/gwsi_old/bmj_mpnt_fromPy.csv", index=False)

# Import bmj.rmk
Repeat again

In [11]:
f = r'data_inputs/gwsi_old/bmj.rmk.subf.list_meta.csv'
df_meta = pd.read_csv(f) 

f = r'data_inputs/gwsi_old/bmj.rmk.subf'
df_rmk = pd.read_fwf (f, widths = df_meta.LEN , header=None, names=list(df_meta.DB_name))
df_rmk.to_csv("data_inputs/gwsi_old/bmj_rmk_fromPy.csv", index=False)

# Import Referencepoints(2) .csv
Good MP data

In [12]:
f = r'data_inputs/aquarius/Referencepoints (2).csv'
df_rp = pd.read_csv(f)
df_rp.head(2)

Unnamed: 0,UniqueId,Name,Description,site,StandardIdentifier,IsMeasuredAgainstLocalAssumedDatum,ValidFrom,Unit,Elevation,Uncertainty,Method,MeasurementDirection,Comment,AppliedTime,AppliedByUser,DecommissionedDate,DecommissionedReason
0,0868303bbc6c441982fff320c2d0a44f,MP-1 height,"No MP exists, but 0.00 ft. was populated for f...",473356095043701,,True,0001-01-01T00:00:00.0000000+00:00,ft,0.0,0.05,GNSS3,FromTopToBottom,"No MP exists, but 0.00 ft. was populated for f...",2021-03-02T13:12:52.2229750-06:00,aberg,,
1,00ddcc72af874a04a11e2735b1524b39,MP-1 height,"MP assumed to be the top of casing, not protec...",473358095061401,,True,1984-10-17T00:00:00.0000000-06:00,ft,3.0,,,FromTopToBottom,From GW_MPNT 1,2021-01-12T20:13:22.9820002-06:00,admin,,


In [13]:
df_rp.info #.info is a "method". Same as df.describe. Originally [1134 rows x 17 columns]

<bound method DataFrame.info of                               UniqueId                 Name  \
0     0868303bbc6c441982fff320c2d0a44f          MP-1 height   
1     00ddcc72af874a04a11e2735b1524b39          MP-1 height   
2     b570bd5035cb4d0db9f55cce351f4fc6          MP-1 height   
3     b6a787ea9c1d4625b02dacca2b76a2c6          MP-1 height   
4     4d6975e06cda4e32a5df7224776d21e2          MP-1 height   
...                                ...                  ...   
1129  6c0293e1fd884a97a49cdc79c66961c2          MP-1 height   
1130  ec11dddc0a49444db3b113b7c5270c1a  Generic Zero Height   
1131  f534bdda01e1488d9a9f17eb94ff3db1          MP-1 height   
1132  4e4ffdaf38264739b398c38fc7effae9          MP-1 height   
1133  01134a800425416097a51eb47c8c92bd          MP-1 height   

                                            Description             site  \
0     No MP exists, but 0.00 ft. was populated for f...  473356095043701   
1     MP assumed to be the top of casing, not protec...  47

In [14]:
# Drop the row if the "UniqueId" value is a duplicate. Drop the whole "StandardIdentifier" column
df_rp2 = df_rp.drop_duplicates(subset=['UniqueId']).drop("StandardIdentifier", axis=1)
df_rp2.info  #[573 rows x 16 columns]

<bound method DataFrame.info of                               UniqueId                 Name  \
0     0868303bbc6c441982fff320c2d0a44f          MP-1 height   
1     00ddcc72af874a04a11e2735b1524b39          MP-1 height   
2     b570bd5035cb4d0db9f55cce351f4fc6          MP-1 height   
3     b6a787ea9c1d4625b02dacca2b76a2c6          MP-1 height   
4     4d6975e06cda4e32a5df7224776d21e2          MP-1 height   
...                                ...                  ...   
1129  6c0293e1fd884a97a49cdc79c66961c2          MP-1 height   
1130  ec11dddc0a49444db3b113b7c5270c1a  Generic Zero Height   
1131  f534bdda01e1488d9a9f17eb94ff3db1          MP-1 height   
1132  4e4ffdaf38264739b398c38fc7effae9          MP-1 height   
1133  01134a800425416097a51eb47c8c92bd          MP-1 height   

                                            Description             site  \
0     No MP exists, but 0.00 ft. was populated for f...  473356095043701   
1     MP assumed to be the top of casing, not protec...  47

In [15]:
df_rp_newest =  df_rp2.sort_values('ValidFrom').drop_duplicates('site', keep='last')

In [16]:
df_rp2.to_csv("data_inputs/aquarius/Referencepoints_fromPy.csv", index=False)
df_rp_newest.to_csv('data_inputs/aquarius/Referencepoints_updatedMP_fromPy.csv', index=False)

# MLR data
Using water services websites

Alternatively you can use the USGS website to plug in specific site numbers: https://waterservices.usgs.gov/rest/Site-Test-Tool.html

In [17]:
#METHOD A : Opening a .TXT file:
f = open(r'../local_access_db/Bmj_STAIDs_20221014.txt') #opens the file
sites = f.read() #reads the file
sites = sites.split() #split data into list 
del sites[0] #delete the first item in the list (the title)
sites = ', '.join(map(str,sites)) #create a string consisting of every element with a comma and space inbetween

#Create URL that can be copy and pasted or clicked
#print(f"https://waterservices.usgs.gov/nwis/site/?format=rdb&sites={sites}&siteStatus=all")
url = f"https://waterservices.usgs.gov/nwis/site/?format=rdb&sites={sites}&siteStatus=all"
url = url.replace(" ", "%20")
print(url)

https://waterservices.usgs.gov/nwis/site/?format=rdb&sites=473356095043701,%20473358095061401,%20473404095054101,%20473405095060101,%20473405095060201,%20473408095045601,%20473409095045501,%20473411095062901,%20473412095050801,%20473413095053701,%20473413095053702,%20473414095053601,%20473416095051301,%20473416095052601,%20473416095052801,%20473416095060801,%20473417095051801,%20473417095051802,%20473417095051803,%20473417095051804,%20473417095051805,%20473417095051806,%20473417095052101,%20473417095052601,%20473417095052602,%20473417095052701,%20473417095052801,%20473418095050101,%20473418095050201,%20473418095050301,%20473418095051703,%20473418095052801,%20473418095052802,%20473418095052803,%20473418095052901,%20473418095055001,%20473418095055601,%20473418095055602,%20473418095055603,%20473418095055604,%20473418095055605,%20473419095050401,%20473419095052301,%20473419095052302,%20473419095052303,%20473419095052304,%20473419095052305,%20473419095052306,%20473419095052401,%204734190950

In [5]:
# METHOD B : Getting site list straight from the database: 
## Gbe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_be.accdb;')
Gfe_db = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=P:\0083\analysis\DataCompilation\DataCompilationPy\local_access_db\BemidjiMasterSiteData_fe.accdb;')
c_fe = Gfe_db.cursor()
c_fe.execute('select USGS_siteno from tblSites')
df = pd.read_sql('select USGS_siteno from tblSites', Gfe_db)

#Close database connection and cursor
c_fe.close()
Gfe_db.close()



In [6]:
# METHOD B CONTINUED: Drop none values, create a list out of the column, and create a string out of the column
df_sites = df.dropna()
sites = df_sites['USGS_siteno'].tolist()
sites = ', '.join(map(str,sites))

#Create URL that can be copy and pasted/clicked : can also be accessed through https://waterservices.usgs.gov/rest/Site-Test-Tool.html
url = f"https://waterservices.usgs.gov/nwis/site/?format=rdb&sites={sites}&siteOutput=expanded&siteStatus=all"
url = url.replace(" ", "%20")
print(url)

https://waterservices.usgs.gov/nwis/site/?format=rdb&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473412095050801,%20473416095051301,%20473431095052801,%20473356095043701,%20473419095052504,%20473408095045601,%20473426095051001,%20473426095051002,%20473426095051003,%20473425095052706,%20473425095052707,%20473425095052708,%20473425095052709,%20473425095052710,%20473425095052711,%20473425095052712,%20473425095052713,%20473425095052714,%20473426095052422,%20473426095052423,%20473426095052424,%20473426095052425,%20473426095052426,%20473426095052427,%20473426095052428,%20473425095052603,%20473426095052604,%20473426095052605,%20473426095052606,%20473425095052606,%20473425095052607,%20473425095052608,%20473425095052609,%20473425095052502,%20473425095052503,%20473426095052429,%20473426095052430,%20473426095052446,%20473426095052447,%20473426095052615,%20473426095052616,%20473426095052617,%20473426095052618,%20473426095052619,%20473426095052620,%204734260950

## Beautiful Soup?

In [20]:
from bs4 import BeautifulSoup
import urllib

# Read the webpage
r = urllib.request.urlopen(url).read()

# Parse it into components (title, tables, etc., with the Python module lxml) (turned out to be less useful since all the data is in one section)
soup = BeautifulSoup(r, "lxml")

In [21]:
string = soup.string #made one giant string
lines = string.split("\n")# split the string into seperate lines using \n

In [22]:
# creating the dataframe
n=1
for x in lines:  # looping through every element in the list "line"
    words = x.split("\t") # split the line into works
    if words[0] == "agency_cd": # get column names from the line that starts with agency_cd
        col_names=words
    if words[0] == "USGS": 
        if n == 1:
            dfmlr = pd.DataFrame([words], columns=col_names) # use the first line to create a df
            n += 1
        else: 
            dfmlr.loc[len(dfmlr)] = words  # second line and so on gets added to the df

In [23]:
dfmlr.to_csv("data_inputs/MLR/MLR_fromPy.csv", index=False)

# Water Level Retrieval using the Water Services website

# Inital tests

In [24]:
# sites have already been retrieved in the code before and stored in the variable called "sites"
sites

'473429095051006, 473424095052912, 473424095052906, 473423095052902, 473412095050801, 473416095051301, 473431095052801, 473356095043701, 473419095052504, 473408095045601, 473426095051001, 473426095051002, 473426095051003, 473425095052706, 473425095052707, 473425095052708, 473425095052709, 473425095052710, 473425095052711, 473425095052712, 473425095052713, 473425095052714, 473426095052422, 473426095052423, 473426095052424, 473426095052425, 473426095052426, 473426095052427, 473426095052428, 473425095052603, 473426095052604, 473426095052605, 473426095052606, 473425095052606, 473425095052607, 473425095052608, 473425095052609, 473425095052502, 473425095052503, 473426095052429, 473426095052430, 473426095052446, 473426095052447, 473426095052615, 473426095052616, 473426095052617, 473426095052618, 473426095052619, 473426095052620, 473426095052621, 473423095053403, 473423095053404, 473423095053406, 473423095053407, 473423095053408, 473423095053409, 473426095052319, 473426095052320, 4734260950523

In [None]:
import urllib.request
import xml.etree.ElementTree as ET

In [35]:

# URL of the XML file
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473412095050801,%20473416095051301,%20473431095052801,%20473356095043701,%20473419095052504,%20473408095045601,%20473426095051001,%20473426095051002,%20473426095051003,%20473425095052706,%20473425095052707,%20473425095052708,%20473425095052709,%20473425095052710,%20473425095052711,%20473425095052712,%20473425095052713,%20473425095052714,%20473426095052422,%20473426095052423,%20473426095052424,%20473426095052425,%20473426095052426,%20473426095052427,%20473426095052428,%20473425095052603,%20473426095052604,%20473426095052605,%20473426095052606,%20473425095052606,%20473425095052607,%20473425095052608,%20473425095052609,%20473425095052502,%20473425095052503,%20473426095052429,%20473426095052430,%20473426095052446,%20473426095052447,%20473426095052615,%20473426095052616,%20473426095052617,%20473426095052618,%20473426095052619,%20473426095052620,%20473426095052621,%20473423095053403,%20473423095053404,%20473423095053406,%20473423095053407,%20473423095053408,%20473423095053409,%20473426095052319,%20473426095052320,%20473426095052322,%20473426095052323,%20473426095052324,%20473426095052337,%20473427095052210,%20473427095052211,%20473427095052212,%20473427095052213,%20473427095052214,%20473427095052215,%20473426095052325,%20473426095052326,%20473426095052327,%20473426095052328,%20473426095052329,%20473426095052330,%20473426095052331,%20473426095052332,%20473426095052333,%20473426095052334,%20473426095052335,%20473426095052336,%20473426095052420,%20473426095052421,%20473426095052448,%20473426095052449,%20473426095052450&siteStatus=all"

# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)

# Access the XML data
for child in root:
    print(child.tag, child.attrib)

{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.net/gml/3.2}featureMember {}
{http://www.opengis.

In [26]:
import xml.etree.ElementTree as ET

# Parse the XML data
xml_data = """<root>
                 <person id="1">
                     <name>John</name>
                     <age>30</age>
                 </person>
                 <person id="2">
                     <name>Jane</name>
                     <age>25</age>
                 </person>
             </root>"""

root = ET.fromstring(xml_data)

# Print out the tag names of each child element
for child in root:
    print(child.tag)


person
person


In [29]:
# URL of the XML file
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473412095050801,%20473416095051301,%20473431095052801,%20473356095043701,%20473419095052504,%20473408095045601,%20473426095051001,%20473426095051002,%20473426095051003,%20473425095052706,%20473425095052707,%20473425095052708,%20473425095052709,%20473425095052710,%20473425095052711,%20473425095052712,%20473425095052713,%20473425095052714,%20473426095052422,%20473426095052423,%20473426095052424,%20473426095052425,%20473426095052426,%20473426095052427,%20473426095052428,%20473425095052603,%20473426095052604,%20473426095052605,%20473426095052606,%20473425095052606,%20473425095052607,%20473425095052608,%20473425095052609,%20473425095052502,%20473425095052503,%20473426095052429,%20473426095052430,%20473426095052446,%20473426095052447,%20473426095052615,%20473426095052616,%20473426095052617,%20473426095052618,%20473426095052619,%20473426095052620,%20473426095052621,%20473423095053403,%20473423095053404,%20473423095053406,%20473423095053407,%20473423095053408,%20473423095053409,%20473426095052319,%20473426095052320,%20473426095052322,%20473426095052323,%20473426095052324,%20473426095052337,%20473427095052210,%20473427095052211,%20473427095052212,%20473427095052213,%20473427095052214,%20473427095052215,%20473426095052325,%20473426095052326,%20473426095052327,%20473426095052328,%20473426095052329,%20473426095052330,%20473426095052331,%20473426095052332,%20473426095052333,%20473426095052334,%20473426095052335,%20473426095052336,%20473426095052420,%20473426095052421,%20473426095052448,%20473426095052449,%20473426095052450&siteStatus=all"

# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)

# Access the XML data, including the namespace URL
ns = {'gml': 'http://www.opengis.net/gml/3.2'}
for child in root.findall('gml:featureMember', ns):
    print(child.tag)

{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember


In [37]:
# URL of the XML file
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473412095050801,%20473416095051301,%20473431095052801,%20473356095043701,%20473419095052504,%20473408095045601,%20473426095051001,%20473426095051002,%20473426095051003,%20473425095052706,%20473425095052707,%20473425095052708,%20473425095052709,%20473425095052710,%20473425095052711,%20473425095052712,%20473425095052713,%20473425095052714,%20473426095052422,%20473426095052423,%20473426095052424,%20473426095052425,%20473426095052426,%20473426095052427,%20473426095052428,%20473425095052603,%20473426095052604,%20473426095052605,%20473426095052606,%20473425095052606,%20473425095052607,%20473425095052608,%20473425095052609,%20473425095052502,%20473425095052503,%20473426095052429,%20473426095052430,%20473426095052446,%20473426095052447,%20473426095052615,%20473426095052616,%20473426095052617,%20473426095052618,%20473426095052619,%20473426095052620,%20473426095052621,%20473423095053403,%20473423095053404,%20473423095053406,%20473423095053407,%20473423095053408,%20473423095053409,%20473426095052319,%20473426095052320,%20473426095052322,%20473426095052323,%20473426095052324,%20473426095052337,%20473427095052210,%20473427095052211,%20473427095052212,%20473427095052213,%20473427095052214,%20473427095052215,%20473426095052325,%20473426095052326,%20473426095052327,%20473426095052328,%20473426095052329,%20473426095052330,%20473426095052331,%20473426095052332,%20473426095052333,%20473426095052334,%20473426095052335,%20473426095052336,%20473426095052420,%20473426095052421,%20473426095052448,%20473426095052449,%20473426095052450&siteStatus=all"

# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)

# Access the XML data, including the namespace URL
ns = {
    "gml": "http://www.opengis.net/gml/3.2",
 #   "wfs": "http://www.opengis.net/wfs/2.0",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "wml2": "http://www.opengis.net/waterml/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
    "om": "http://www.opengis.net/om/2.0",
    "sa": "http://www.opengis.net/sampling/2.0",
    "sams": "http://www.opengis.net/samplingSpatial/2.0",
    "swe": "http://www.opengis.net/swe/2.0"
}
for child in root.findall(".//{http://www.opengis.net/gml/3.2}featureMember", ns):
    print(child.tag)
ET.dump(root)

{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
{http://www.opengis.net/gml/3.2}featureMember
<ns0:FeatureCollection xmlns:ns0="

In [46]:
# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)

# Access the XML data, including the namespace URL
ns = {
    "gml": "http://www.opengis.net/gml/3.2",
 #   "wfs": "http://www.opengis.net/wfs/2.0",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "wml2": "http://www.opengis.net/waterml/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
    "om": "http://www.opengis.net/om/2.0",
    "sa": "http://www.opengis.net/sampling/2.0",
    "sams": "http://www.opengis.net/samplingSpatial/2.0",
    "swe": "http://www.opengis.net/swe/2.0"
}
for child in root.findall("gml:featureMember", ns):
    name = child.find('gml:id', ns)
    print(name.text)
#ET.dump(root)

AttributeError: 'NoneType' object has no attribute 'text'

In [63]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# define URL
#url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906&siteStatus=all"

# send GET request to URL and get response
response = requests.get(url)

# parse response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'xml')

# find all "gml:TimePeriod" elements in the XML
time_periods = soup.find_all("gml:TimePeriod")

# create empty list to hold dictionaries of data
time_periods_list = []

# loop through each "gml:TimePeriod" element and extract data
for time_period in time_periods:
    time_period_dict = {}
    time_period_dict['begin_position'] = time_period.find("gml:beginPosition").text
    time_period_dict['end_position'] = time_period.find("gml:endPosition").text
    time_periods_list.append(time_period_dict)

# create Pandas dataframe from list of dictionaries
df = pd.DataFrame(time_periods_list)
print(df)

            begin_position            end_position
0   2022-08-25T14:08:00UTC  2022-08-25T14:08:00UTC
1   2022-08-25T14:08:00UTC  2022-08-25T14:08:00UTC
2   2022-08-25T14:41:00UTC  2022-08-25T14:41:00UTC
3   2022-08-25T14:41:00UTC  2022-08-25T14:41:00UTC
4   2022-08-25T15:06:00UTC  2022-08-25T15:06:00UTC
5   2022-08-25T15:06:00UTC  2022-08-25T15:06:00UTC
6   2022-08-25T15:22:00UTC  2022-08-25T15:22:00UTC
7   2022-08-25T15:22:00UTC  2022-08-25T15:22:00UTC
8   2022-08-24T11:50:00UTC  2022-08-24T11:50:00UTC
9   2022-08-24T11:50:00UTC  2022-08-24T11:50:00UTC
10  2022-08-17T17:03:00UTC  2022-08-17T17:03:00UTC
11  2022-08-17T17:03:00UTC  2022-08-17T17:03:00UTC
12  2022-08-17T17:03:00UTC  2022-08-17T17:03:00UTC
13  2022-08-26T17:58:00UTC  2022-08-26T17:58:00UTC
14  2022-08-26T17:58:00UTC  2022-08-26T17:58:00UTC
15  2022-08-26T17:58:00UTC  2022-08-26T17:58:00UTC
16  2022-08-26T18:00:00UTC  2022-08-26T18:00:00UTC
17  2022-08-26T18:00:00UTC  2022-08-26T18:00:00UTC
18  2022-08-26T18:00:00UTC  202

In [66]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# define URL
#url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906&siteStatus=all"

# send GET request to URL and get response
response = requests.get(url)

# parse response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'xml')

# find all "om:OM_Observation" elements in the XML
id = soup.find_all("om:OM_Observation")

# create empty list to hold dictionaries of data
time_periods_list = []

# loop through each "om:OM_Observation" element and extract data
for time_period in time_periods:
    time_period_dict = {}
    time_period_dict['ID'] = time_period.find("gml:id").text
    time_period_dict['end_position'] = time_period.find("gml:endPosition").text
    time_periods_list.append(time_period_dict)

# create Pandas dataframe from list of dictionaries
df = pd.DataFrame(time_periods_list)
print(df)

AttributeError: 'NoneType' object has no attribute 'text'

In [67]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# define URL
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906&siteStatus=all"

# send GET request to URL and get response
response = requests.get(url)

# parse response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'xml')

# find all "gml:TimePeriod" elements in the XML
time_periods = soup.find_all("gml:TimePeriod")

# create empty list to hold dictionaries of data
time_periods_list = []

# loop through each "gml:TimePeriod" element and extract data
for time_period in time_periods:
    time_period_dict = {}
    time_period_dict['begin_position'] = time_period.find("gml:beginPosition").text
    time_period_dict['end_position'] = time_period.find("gml:endPosition").text
    time_period_dict['id'] = time_period.get("gml:id")
    time_periods_list.append(time_period_dict)

# create Pandas dataframe from list of dictionaries
df = pd.DataFrame(time_periods_list)
print(df)


           begin_position            end_position  \
0  2022-08-20T18:50:00UTC  2022-08-20T18:50:00UTC   
1  2022-08-20T18:50:00UTC  2022-08-20T18:50:00UTC   
2  2022-08-20T18:50:00UTC  2022-08-20T18:50:00UTC   

                                               id  
0  sample_time.USGS.473429095051006.62610.1.00011  
1  sample_time.USGS.473429095051006.62611.2.00011  
2  sample_time.USGS.473429095051006.72019.3.00011  


In [69]:
df

Unnamed: 0,begin_position,end_position,id
0,2022-08-20T18:50:00UTC,2022-08-20T18:50:00UTC,sample_time.USGS.473429095051006.62610.1.00011
1,2022-08-20T18:50:00UTC,2022-08-20T18:50:00UTC,sample_time.USGS.473429095051006.62611.2.00011
2,2022-08-20T18:50:00UTC,2022-08-20T18:50:00UTC,sample_time.USGS.473429095051006.72019.3.00011


# Test Making dataframe from XML
https://www.youtube.com/watch?v=GFpBYaTJ1uQ


In [98]:
# URL of the XML file
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473427095052214,%20473427095052215,%20473426095052325,%20473426095052326,%20473426095052327,%20473426095052328,%20473426095052329,%20473426095052330,%20473426095052331,%20473426095052332,%20473426095052333,%20473426095052334,%20473426095052335,%20473426095052336,%20473426095052420,%20473426095052421,%20473426095052448,%20473426095052449,%20473426095052450&siteStatus=all"

# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)
root.attrib # gives value in side root

# get child element attributes

{'{http://www.opengis.net/gml/3.2}id': 'USGS.waterservices',
 '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.opengis.net/waterml/2.0 http://schemas.opengis.net/waterml/2.0/waterml2.xsd'}

In [101]:
# URL of the XML file
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473427095052214,%20473427095052215,%20473426095052325,%20473426095052326,%20473426095052327,%20473426095052328,%20473426095052329,%20473426095052330,%20473426095052331,%20473426095052332,%20473426095052333,%20473426095052334,%20473426095052335,%20473426095052336,%20473426095052420,%20473426095052421,%20473426095052448,%20473426095052449,%20473426095052450&siteStatus=all"

# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

def transform_xml(xml_doc):
    attr = xml_doc.attrib
    for xml in xml_doc.iter('gml:featureMember'):
        dict = attr.copy()
        dict.update(xml.attrib)

        yield dict # generates the object I think

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)

trans = transform_xml(root) # is a generator object

list(trans)
#tr_df = pd.DataFrame()

# get child element attributes

[]

In [106]:
# URL of the XML file
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473427095052214,%20473427095052215,%20473426095052325,%20473426095052326,%20473426095052327,%20473426095052328,%20473426095052329,%20473426095052330,%20473426095052331,%20473426095052332,%20473426095052333,%20473426095052334,%20473426095052335,%20473426095052336,%20473426095052420,%20473426095052421,%20473426095052448,%20473426095052449,%20473426095052450&siteStatus=all"

# Read the XML data from the URL
xml_data = urllib.request.urlopen(url).read()

def transform_xml(xml_doc):
    attr = xml_doc.attrib
    for xml in xml_doc.iter('gml:identifier'):
        dict = attr.copy()
        dict.update(xml.attrib)

        yield dict # generates the object I think

# Parse the XML data using ElementTree
root = ET.fromstring(xml_data)

trans = transform_xml(root) # is a generator object

list(trans)
#tr_df = pd.DataFrame()

# get child element attributes

[]

# New test...just print out the stuff

In [108]:
ET.dump(root)

<ns0:FeatureCollection xmlns:ns0="http://www.opengis.net/gml/3.2" xmlns:ns2="http://www.opengis.net/waterml/2.0" xmlns:ns3="http://www.w3.org/1999/xlink" xmlns:ns4="http://www.opengis.net/om/2.0" xmlns:ns5="http://www.opengis.net/sampling/2.0" xmlns:ns6="http://www.opengis.net/samplingSpatial/2.0" xmlns:ns7="http://www.opengis.net/swe/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ns0:id="USGS.waterservices" xsi:schemaLocation="http://www.opengis.net/waterml/2.0 http://schemas.opengis.net/waterml/2.0/waterml2.xsd"><ns0:featureMember><ns2:Collection ns0:id="C.USGS.473423095052902"><ns0:identifier codeSpace="http://waterservices.usgs.gov/nwis/gw">USGS.473423095052902</ns0:identifier><ns0:name codeSpace="http://waterservices.usgs.gov/nwis/gw">Timeseries collected at 1802      Bemidji Toxics Research Site  0000786140</ns0:name><ns2:metadata><ns2:DocumentMetadata ns0:id="doc.USGS.MP.USGS.473423095052902"><ns0:metaDataProperty about="contact" ns3:href="http://waterservices.usgs.g

In [110]:
xml_data.content

AttributeError: 'bytes' object has no attribute 'content'

In [139]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# define URL
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906&siteStatus=all"

# send GET request to URL and get response
response = requests.get(url)

# parse response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'xml')

# find all "gml:TimePeriod" elements in the XML
time_periods = soup.find_all("gml:TimePeriod")

# create empty list to hold dictionaries of data
time_periods_list = []

# loop through each "gml:TimePeriod" element and extract data
for time_period in time_periods:
    time_period_dict = {}
    time_period_dict['begin_position'] = time_period.find("gml:beginPosition").text
    time_period_dict['end_position'] = time_period.find("gml:endPosition").text
    time_period_dict['id'] = time_period.get("gml:id")
    time_periods_list.append(time_period_dict)

# create Pandas dataframe from list of dictionaries
df = pd.DataFrame(time_periods_list)
print(df)

           begin_position            end_position  \
0  2022-08-20T18:50:00UTC  2022-08-20T18:50:00UTC   
1  2022-08-20T18:50:00UTC  2022-08-20T18:50:00UTC   
2  2022-08-20T18:50:00UTC  2022-08-20T18:50:00UTC   

                                               id  
0  sample_time.USGS.473429095051006.62610.1.00011  
1  sample_time.USGS.473429095051006.62611.2.00011  
2  sample_time.USGS.473429095051006.72019.3.00011  


In [113]:
xml_tag = soup.find("gml:TimePeriod")
xml_tag

<gml:TimePeriod gml:id="sample_time.USGS.473429095051006.62610.1.00011"><gml:beginPosition>2022-08-20T18:50:00UTC</gml:beginPosition><gml:endPosition>2022-08-20T18:50:00UTC</gml:endPosition></gml:TimePeriod>

In [114]:
xml_tag.text

'2022-08-20T18:50:00UTC2022-08-20T18:50:00UTC'

In [117]:
xml_tag = soup.find("om:OM_Observation")
#print(xml_tag, '\n\n', xml_tag.text)
xml_tag = soup.find("om:phenomenonTime")
print(xml_tag, '\n\n', xml_tag.text)

<om:phenomenonTime><gml:TimePeriod gml:id="sample_time.USGS.473429095051006.62610.1.00011"><gml:beginPosition>2022-08-20T18:50:00UTC</gml:beginPosition><gml:endPosition>2022-08-20T18:50:00UTC</gml:endPosition></gml:TimePeriod></om:phenomenonTime> 

 2022-08-20T18:50:00UTC2022-08-20T18:50:00UTC


In [119]:
type(xml_tag)
print(xml_tag.name)

phenomenonTime


In [124]:
# soup is the xml document
print(soup.title)
print(soup.p)
print(soup.a)

None
None
None


In [132]:
# find all the tags in the document 
tags = soup.find_all()

# Iterate over the tags and print out the tag name
for tag in tags:
    print(tag.name)

Collection
identifier
name
metadata
DocumentMetadata
metaDataProperty
generationDate
version
observationMember
OM_Observation
phenomenonTime
TimePeriod
beginPosition
endPosition
resultTime
TimeInstant
timePosition
procedure
ObservationProcess
processType
parameter
NamedValue
name
value
observedProperty
featureOfInterest
MonitoringPoint
descriptionReference
sampledFeature
shape
Point
pos
result
MeasurementTimeseries
defaultPointMetadata
DefaultTVPMeasurementMetadata
qualifier
Category
value
qualifier
Category
value
qualifier
Category
value
qualifier
Category
value
qualifier
Category
value
qualifier
Category
value
qualifier
Category
value
uom
interpolationType
point
MeasurementTVP
time
value
metadata
TVPMeasurementMetadata
qualifier
Category
value
qualifier
Category
value
qualifier
Category
qualifier
Category
value
qualifier
Category
value
observationMember
OM_Observation
phenomenonTime
TimePeriod
beginPosition
endPosition
resultTime
TimeInstant
timePosition
procedure
ObservationProcess


In [135]:
# close but we need namespaces too
# Iterate over the tags and print out the whole tag including the namespace
for tag in tags:
    print(tag.prettify())

<wml2:Collection gml:id="C.USGS.473429095051006" xmlns:gml="http://www.opengis.net/gml/3.2" xmlns:om="http://www.opengis.net/om/2.0" xmlns:sa="http://www.opengis.net/sampling/2.0" xmlns:sams="http://www.opengis.net/samplingSpatial/2.0" xmlns:swe="http://www.opengis.net/swe/2.0" xmlns:wml2="http://www.opengis.net/waterml/2.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.opengis.net/waterml/2.0 http://schemas.opengis.net/waterml/2.0/waterml2.xsd">
 <gml:identifier codeSpace="http://waterservices.usgs.gov/nwis/gw">
  USGS.473429095051006
 </gml:identifier>
 <gml:name codeSpace="http://waterservices.usgs.gov/nwis/gw">
  Timeseries collected at 1217E     147N35W02DCBB  06
 </gml:name>
 <wml2:metadata>
  <wml2:DocumentMetadata gml:id="doc.USGS.MP.USGS.473429095051006">
   <gml:metaDataProperty about="contact" xlink:href="http://waterservices.usgs.gov"/>
   <wml2:generationDate>
    2023-02-22T11:33:22.475-05:0

In [138]:
# Access the XML data, including the namespace URL
ns = {
    "gml": "http://www.opengis.net/gml/3.2",
 #   "wfs": "http://www.opengis.net/wfs/2.0",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "wml2": "http://www.opengis.net/waterml/2.0",
    "xlink": "http://www.w3.org/1999/xlink",
    "om": "http://www.opengis.net/om/2.0",
    "sa": "http://www.opengis.net/sampling/2.0",
    "sams": "http://www.opengis.net/samplingSpatial/2.0",
    "swe": "http://www.opengis.net/swe/2.0"
}
for child in root.findall("gml:featureMember", ns):
    print(child.tag)

# Method A trial - Using XML...KEEP THIS

In [140]:
#import requests
#import pandas as pd
#from bs4 import BeautifulSoup

# define URL
url = "https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906&siteStatus=all"

# send GET request to URL and get response
response = requests.get(url)

# parse response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'xml')

# find all "gml:TimePeriod" elements in the XML
time_periods = soup.find_all("gml:TimePeriod")

# create empty list to hold dictionaries of data
time_periods_list = []

# loop through each "gml:TimePeriod" element and extract data
for time_period in time_periods:
    time_period_dict = {}
    time_period_dict['begin_position'] = time_period.find("gml:beginPosition").text
    time_period_dict['end_position'] = time_period.find("gml:endPosition").text
    time_period_dict['id'] = time_period.get("gml:id")
    time_periods_list.append(time_period_dict)

# create Pandas dataframe from list of dictionaries
df = pd.DataFrame(time_periods_list)
df

Unnamed: 0,begin_position,end_position,id
0,2022-08-20T18:50:00UTC,2022-08-20T18:50:00UTC,sample_time.USGS.473429095051006.62610.1.00011
1,2022-08-20T18:50:00UTC,2022-08-20T18:50:00UTC,sample_time.USGS.473429095051006.62611.2.00011
2,2022-08-20T18:50:00UTC,2022-08-20T18:50:00UTC,sample_time.USGS.473429095051006.72019.3.00011


In [143]:
## Recreate the URL 

# Using the same "sites" variable created before

#Create URL that can be copy and pasted/clicked : can also be accessed through https://waterservices.usgs.gov/rest/Site-Test-Tool.html
url = f"https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites={sites}&siteStatus=all"
url = url.replace(" ", "%20")
print(url)

https://waterservices.usgs.gov/nwis/gwlevels/?format=waterml,2.0&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473412095050801,%20473416095051301,%20473431095052801,%20473356095043701,%20473419095052504,%20473408095045601,%20473426095051001,%20473426095051002,%20473426095051003,%20473425095052706,%20473425095052707,%20473425095052708,%20473425095052709,%20473425095052710,%20473425095052711,%20473425095052712,%20473425095052713,%20473425095052714,%20473426095052422,%20473426095052423,%20473426095052424,%20473426095052425,%20473426095052426,%20473426095052427,%20473426095052428,%20473425095052603,%20473426095052604,%20473426095052605,%20473426095052606,%20473425095052606,%20473425095052607,%20473425095052608,%20473425095052609,%20473425095052502,%20473425095052503,%20473426095052429,%20473426095052430,%20473426095052446,%20473426095052447,%20473426095052615,%20473426095052616,%20473426095052617,%20473426095052618,%20473426095052619,%20473426095052620,%

In [144]:
# send GET request to URL and get response
response = requests.get(url)

# parse response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'xml')

# find all "gml:TimePeriod" elements in the XML
time_periods = soup.find_all("gml:TimePeriod")

# create empty list to hold dictionaries of data
time_periods_list = []

# loop through each "gml:TimePeriod" element and extract data
for time_period in time_periods:
    time_period_dict = {}
    time_period_dict['begin_position'] = time_period.find("gml:beginPosition").text
    time_period_dict['end_position'] = time_period.find("gml:endPosition").text
    time_period_dict['id'] = time_period.get("gml:id")
    time_periods_list.append(time_period_dict)

# create Pandas dataframe from list of dictionaries
df = pd.DataFrame(time_periods_list)
df

Unnamed: 0,begin_position,end_position,id
0,2022-08-25T14:08:00UTC,2022-08-25T14:08:00UTC,sample_time.USGS.473356095043701.62611.879.00011
1,2022-08-25T14:08:00UTC,2022-08-25T14:08:00UTC,sample_time.USGS.473356095043701.72019.880.00011
2,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473358095061401.62610.307.00011
3,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473358095061401.62611.308.00011
4,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473358095061401.72019.309.00011
...,...,...,...
907,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473440095063001.62610.790.00011
908,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473440095063001.62611.791.00011
909,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473440095063001.72019.792.00011
910,2022-08-25T15:42:00UTC,2022-08-25T15:42:00UTC,sample_time.USGS.473503095044501.62611.891.00011


In [150]:
df['id'].str.split('.', expand=True)

Unnamed: 0,0,1,2,3,4,5
0,sample_time,USGS,473356095043701,62611,879,00011
1,sample_time,USGS,473356095043701,72019,880,00011
2,sample_time,USGS,473358095061401,62610,307,00011
3,sample_time,USGS,473358095061401,62611,308,00011
4,sample_time,USGS,473358095061401,72019,309,00011
...,...,...,...,...,...,...
907,sample_time,USGS,473440095063001,62610,790,00011
908,sample_time,USGS,473440095063001,62611,791,00011
909,sample_time,USGS,473440095063001,72019,792,00011
910,sample_time,USGS,473503095044501,62611,891,00011


In [152]:
# Split id column
df[["delete_me", "agency", "siteno", "metadata1", "metadata2", "metadata3"]] = df["id"].str.split(".", expand=True)

df


Unnamed: 0,begin_position,end_position,id,delete_me,agency,siteno,number1,number2,number3,metadata1,metadata2,metadata3
0,2022-08-25T14:08:00UTC,2022-08-25T14:08:00UTC,sample_time.USGS.473356095043701.62611.879.00011,sample_time,USGS,473356095043701,62611,879,00011,62611,879,00011
1,2022-08-25T14:08:00UTC,2022-08-25T14:08:00UTC,sample_time.USGS.473356095043701.72019.880.00011,sample_time,USGS,473356095043701,72019,880,00011,72019,880,00011
2,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473358095061401.62610.307.00011,sample_time,USGS,473358095061401,62610,307,00011,62610,307,00011
3,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473358095061401.62611.308.00011,sample_time,USGS,473358095061401,62611,308,00011,62611,308,00011
4,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473358095061401.72019.309.00011,sample_time,USGS,473358095061401,72019,309,00011,72019,309,00011
...,...,...,...,...,...,...,...,...,...,...,...,...
907,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473440095063001.62610.790.00011,sample_time,USGS,473440095063001,62610,790,00011,62610,790,00011
908,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473440095063001.62611.791.00011,sample_time,USGS,473440095063001,62611,791,00011,62611,791,00011
909,1992-06-16T12:00:00UTC,1992-06-16T12:00:00UTC,sample_time.USGS.473440095063001.72019.792.00011,sample_time,USGS,473440095063001,72019,792,00011,72019,792,00011
910,2022-08-25T15:42:00UTC,2022-08-25T15:42:00UTC,sample_time.USGS.473503095044501.62611.891.00011,sample_time,USGS,473503095044501,62611,891,00011,62611,891,00011


# BACK TO TAB DELINEATED

In [7]:
# water level meta data 
## Recreate the URL 

# Using the same "sites" variable created before and new URL link...
url = f"https://waterservices.usgs.gov/nwis/gwlevels/?format=rdb&sites={sites}&siteStatus=all"
url = url.replace(" ", "%20")
print(url)

https://waterservices.usgs.gov/nwis/gwlevels/?format=rdb&sites=473429095051006,%20473424095052912,%20473424095052906,%20473423095052902,%20473412095050801,%20473416095051301,%20473431095052801,%20473356095043701,%20473419095052504,%20473408095045601,%20473426095051001,%20473426095051002,%20473426095051003,%20473425095052706,%20473425095052707,%20473425095052708,%20473425095052709,%20473425095052710,%20473425095052711,%20473425095052712,%20473425095052713,%20473425095052714,%20473426095052422,%20473426095052423,%20473426095052424,%20473426095052425,%20473426095052426,%20473426095052427,%20473426095052428,%20473425095052603,%20473426095052604,%20473426095052605,%20473426095052606,%20473425095052606,%20473425095052607,%20473425095052608,%20473425095052609,%20473425095052502,%20473425095052503,%20473426095052429,%20473426095052430,%20473426095052446,%20473426095052447,%20473426095052615,%20473426095052616,%20473426095052617,%20473426095052618,%20473426095052619,%20473426095052620,%20473426

In [8]:
from bs4 import BeautifulSoup
import urllib

# Read the webpage
r = urllib.request.urlopen(url).read()

# Parse it into components (title, tables, etc., with the Python module lxml) (turned out to be less useful since all the data is in one section)
soup = BeautifulSoup(r, "lxml")

string = soup.string #made one giant string
lines = string.split("\n")# split the string into seperate lines using \n

In [9]:
# creating the water level dataframe
n=1
for x in lines:  # looping through every element in the list "line"
    words = x.split("\t") # split the line into words
    if words[0] == "agency_cd": # get column names from the line that starts with agency_cd
        col_names=words
    if words[0] == "USGS": 
        if n == 1:
            dfwl = pd.DataFrame([words], columns=col_names) # use the first line to create a df
            n += 1
        else: 
            dfwl.loc[len(dfwl)] = words  # second line and so on gets added to the df

In [10]:
dfwl

Unnamed: 0,agency_cd,site_no,site_tp_cd,lev_dt,lev_tm,lev_tz_cd,lev_va,sl_lev_va,sl_datum_cd,lev_status_cd,lev_agency_cd,lev_dt_acy_cd,lev_acy_cd,lev_src_cd,lev_meth_cd,lev_age_cd
0,USGS,473356095043701,ST,2022-08-25,14:08,UTC,-1.06,,,,USGS,m,,S,D,P
1,USGS,473358095061401,GW,1992-06-16,,UTC,0.00,,,1,USGS,D,,S,V,A
2,USGS,473404095054101,GW,2000-10-17,,UTC,10.33,,,1,USGS,D,,S,V,A
3,USGS,473405095060101,GW,2016-08-11,14:30,UTC,9.42,,,1,USGS,m,,S,V,A
4,USGS,473405095060201,GW,2016-08-11,14:29,UTC,9.39,,,1,USGS,m,,S,V,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,USGS,473435095061501,GW,2022-10-28,15:09,UTC,23.33,,,1,USGS,m,,S,V,P
306,USGS,473437095052401,GW,2022-10-28,15:28,UTC,35.00,,,1,USGS,m,,S,V,P
307,USGS,473439095063001,LK,2022-08-25,16:17,UTC,-0.92,,,,USGS,m,,S,D,P
308,USGS,473440095063001,GW,1992-06-16,,UTC,1.60,,,1,USGS,D,,S,V,A


In [11]:
dfwl.to_csv("../create_master_oil_levels/data_inputs/NWISwaterLevel_fromPy.csv", index=False)

: 

# Test Graphviz

In [89]:
#C:\Users\bmilinic\OneDrive - DOI\Documents\USGS\gap_sediment
os.chdir('C:/Users/bmilinic/OneDrive - DOI/Documents/Python')
#pip install graphviz

# interesting note, new package had to be installed in the Python folder rather than the Bemidji folder (ie os.chdir('C:/Users/bmilinic/OneDrive - DOI/Documents/Python/Bemidji'))

In [91]:
conda install python-graphviz

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: c:\Users\bmilinic\Anaconda3

  added / updated specs:
    - python-graphviz


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    cairo-1.16.0               |       he04af86_2         1.5 MB
    conda-23.1.0               |   py39haa95532_0         946 KB
    expat-2.4.9                |       h6c2663c_0         207 KB
    fribidi-1.0.10             |       h62dcd97_0          63 KB
    getopt-win32-0.1           |       h2bbff1b_0          19 KB
    glib-2.69.1                |       h5dc1a3c_1         1.6 MB
    graphite2-1.3.14           |       hd77b12b_1          91 KB
    graphviz-2.50.0            |       hdb8b0d4_0         903 KB
    gts-0.7.6                  |       h63ab5a1_3         181 KB
    harfbuzz-4.3.0             |    

In [169]:
import graphviz
from xml.etree import ElementTree

In [173]:
# example data
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<graph>
    <node id="1" label="Grass" />
    <node id="2" label="Rabbit" />
    <node id="3" label="Fox" />
    <edge source="1" target="2" />
    <edge source="2" target="3" />
</graph>
'''

# Remove any leading or trailing whitespace from the string
xml = xml.strip()

# Parse the XML file
tree = ElementTree.fromstring(xml)

# Load the XML file
graph = graphviz.Digraph()
graph.format = 'png'
graph.engine = 'dot'
graph.graph_attr['rankdir'] = 'LR'
graph.graph_attr['size'] = '8,5'

# Add the nodes to the graph
for node in tree.findall('node'):
    graph.node(node.get('id'), node.get('label'))

# Add the edges to the graph
for edge in tree.findall('edge'):
    graph.edge(edge.get('source'), edge.get('target'))

# Render the graph and save it to a file
graph.render('foodweb', view=True)


'foodweb.png'

In [176]:
import graphviz
from xml.etree import ElementTree

# Define the XML data as a string
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<root label="Root">
    <child label="Child 1" />
    <child label="Child 2" />
    <child label="Child 3" />
</root>
'''

# Remove any leading or trailing whitespace from the string
xml = xml.strip()

# Parse the XML string into an ElementTree object
tree = ElementTree.fromstring(xml)

# Create a Graphviz digraph object
dot = graphviz.Digraph()
dot.format = 'png'
dot.engine = 'dot'

# Add the root node to the graph
root = tree.attrib['label']
dot.node(root, root)

# Add the child nodes to the graph, with edges pointing from the root to the children
for child in tree.findall('child'):
    label = child.attrib['label']
    dot.node(label, label)
    dot.edge(root, label)

# Render the graph and save it to a file
dot.render('root_children', view=True)

'root_children.png'

In [177]:
import graphviz
from xml.etree import ElementTree

# Define the XML data as a string
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<root label="Root">
    <child label="Child 1">
        <grandchild label="Grandchild 1.1" />
        <grandchild label="Grandchild 1.2" />
    </child>
    <child label="Child 2">
        <grandchild label="Grandchild 2.1" />
        <grandchild label="Grandchild 2.2" />
    </child>
</root>
'''

# Remove any leading or trailing whitespace from the string
xml = xml.strip()

# Parse the XML string into an ElementTree object
tree = ElementTree.fromstring(xml)

# Create a Graphviz digraph object
dot = graphviz.Digraph()
dot.format = 'png'
dot.engine = 'dot'

# Add the root node to the graph
root = tree.attrib['label']
dot.node(root, root)

# Add the child nodes to the graph, with edges pointing from the root to the children
for child in tree.findall('child'):
    child_label = child.attrib['label']
    dot.node(child_label, child_label)
    dot.edge(root, child_label)

    # Add the grandchild nodes to the graph, with edges pointing from the child to the grandchildren
    for grandchild in child.findall('grandchild'):
        grandchild_label = grandchild.attrib['label']
        dot.node(grandchild_label, grandchild_label)
        dot.edge(child_label, grandchild_label)

# Render the graph and save it to a file
dot.render('root_children_grandchildren', view=True)

'root_children_grandchildren.png'

In [179]:
import graphviz
from xml.etree import ElementTree

# Define the XML data as a string
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<root label="Root">
    <child label="Child 1">
        <grandchild label="Grandchild 1.1" />
        <grandchild label="Grandchild 1.2" />
    </child>
    <child label="Child 2">
        <grandchild label="Grandchild 2.1" />
        <grandchild label="Grandchild 2.2" />
    </child>
</root>
'''

# Remove any leading or trailing whitespace from the string
xml = xml.strip()

# Parse the XML string into an ElementTree object
tree = ElementTree.fromstring(xml)

# Create a Graphviz digraph object
dot = graphviz.Digraph()
dot.format = 'png'
dot.engine = 'dot'
dot.attr('node', shape='box', style='rounded', fontname='Helvetica', fontsize='12')
dot.attr('edge', color='#888888', penwidth='1')

# Add the root node to the graph
root = tree.attrib['label']
dot.node(root, root, style='rounded,filled', fillcolor='#F4D03F', fontcolor='#424242')

# Add the child nodes to the graph, with edges pointing from the root to the children
for child in tree.findall('child'):
    child_label = child.attrib['label']
    dot.node(child_label, child_label, style='rounded', fillcolor='#3498DB', fontcolor='white')
    dot.edge(root, child_label, color='#D7DBDD')

    # Add the grandchild nodes to the graph, with edges pointing from the child to the grandchildren
    for grandchild in child.findall('grandchild'):
        grandchild_label = grandchild.attrib['label']
        dot.node(grandchild_label, grandchild_label, shape='diamond', fillcolor='#E74C3C', fontcolor='white')
        dot.edge(child_label, grandchild_label, color='#7F8C8D', penwidth='2')

# Render the graph and save it to a file
dot.render('root_children_grandchildren', view=True)


'root_children_grandchildren.png'

In [181]:
import graphviz
from xml.etree import ElementTree

# Define the XML data as a string
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<root label="Root">
    <child label="Child 1">
        <grandchild label="Grandchild 1.1" />
        <grandchild label="Grandchild 1.2" />
    </child>
    <child label="Child 2">
        <grandchild label="Grandchild 2.1" />
        <grandchild label="Grandchild 2.2" />
    </child>
</root>
'''

# Remove any leading or trailing whitespace from the string
xml = xml.strip()

# Parse the XML string into an ElementTree object
tree = ElementTree.fromstring(xml)

# Create a Graphviz digraph object
dot = graphviz.Digraph()
dot.format = 'png'

# Add the root node to the graph
root = tree.attrib['label']
dot.node(root, label=root, color='black', style='filled', fillcolor='#F4D03F')

# Add the child nodes to the graph, with edges pointing from the root to the children
for child in tree.findall('child'):
    child_label = child.attrib['label']
    dot.node(child_label, label=child_label, color='black', style='filled', fillcolor='#3498DB')
    dot.edge(root, child_label, color='#888888')

    # Add the grandchild nodes to the graph, with edges pointing from the child to the grandchildren
    for grandchild in child.findall('grandchild'):
        grandchild_label = grandchild.attrib['label']
        dot.node(grandchild_label, label=grandchild_label, color='black', style='filled', fillcolor='#E74C3C')
        dot.edge(child_label, grandchild_label, color='#888888')

# Render the graph and save it to a file
dot.render('root_children_grandchildren', view=True)


'root_children_grandchildren.png'

In [182]:
import graphviz
from xml.etree import ElementTree

# Define the XML data as a string
xml = '''
<?xml version="1.0" encoding="UTF-8"?>
<root label="Root">
    <child label="Child 1">
        <grandchild label="Grandchild 1.1" />
        <grandchild label="Grandchild 1.2" />
    </child>
    <child label="Child 2">
        <grandchild label="Grandchild 2.1" />
        <grandchild label="Grandchild 2.2" />
    </child>
</root>
'''

# Remove any leading or trailing whitespace from the string
xml = xml.strip()

# Parse the XML string into an ElementTree object
tree = ElementTree.fromstring(xml)

# Create a Graphviz digraph object
dot = graphviz.Digraph()
dot.format = 'png'

# Add the root node to the graph
root = tree.tag
dot.node(root, label=root, color='black', style='filled', fillcolor='#F4D03F')

# Add the child nodes to the graph, with edges pointing from the root to the children
for child in tree.findall('child'):
    child_label = child.tag
    dot.node(child_label, label=child_label, color='black', style='filled', fillcolor='#3498DB')
    dot.edge(root, child_label, color='#888888')

    # Add the grandchild nodes to the graph, with edges pointing from the child to the grandchildren
    for grandchild in child.findall('grandchild'):
        grandchild_label = grandchild.tag
        dot.node(grandchild_label, label=grandchild_label, color='black', style='filled', fillcolor='#E74C3C')
        dot.edge(child_label, grandchild_label, color='#888888')

# Render the graph and save it to a file
dot.render('root_children_grandchildren', view=True)


'root_children_grandchildren.png'

# Converting to DataTime

## BMJ conversion

In [None]:
f = r'data_inputs/gwsi_old/bmj_mpnt_fromPy.csv'
df_mp = pd.read_csv(f) 

In [None]:
# Fill in any empty spaces for the start date with an early time (1980s)
df_mp['NWIS_MP_BeginDate'] = df_mp['NWIS_MP_BeginDate'].replace(np.nan, 19800101.0, regex=True) 

# Fill in any empty spaces for the end dates with today's date
date = float(datetime.today().strftime('%Y%m%d'))
df_mp['NWIS_MP_EndDate'] = df_mp['NWIS_MP_EndDate'].replace(np.nan, date, regex=True) 

In [None]:
# Convert both date columns into DateTimes
df_mp['NWIS_MP_BeginDate'] = pd.to_datetime(df_mp['NWIS_MP_BeginDate'], format= '%Y%m%d') 
df_mp['NWIS_MP_EndDate'] = pd.to_datetime(df_mp['NWIS_MP_EndDate'], format= '%Y%m%d') 

## Aquarius (referencepoints)

In [None]:
f = r'data_inputs/aquarius/Referencepoints_fromPy.csv'
df_rp2 = pd.read_csv(f)

In [None]:
print(df_rp2['AppliedTime'])

In [None]:
# Replace year 1 DateTime values with 1980
df_rp2['ValidFrom'] = df_rp2['ValidFrom'].replace('0001-01-01T00:00:00.0000000+00:00', '1980-01-01T00:00:00.0000000+00:00')
#df_rp2['AppliedTime'] = df_rp2['AppliedTime'].replace('0001-01-01T00:00:00.0000000+00:00', '1980-01-01T00:00:00.0000000+00:00')

In [None]:
# Convert both date columns into DateTimes
df_rp2['ValidFrom'] = pd.to_datetime(df_rp2['ValidFrom'], utc=True, format= '%Y-%m-%dT%H:%M:%S.%f%z') 
df_rp2['AppliedTime'] = pd.to_datetime(df_rp2['AppliedTime'], utc= True, format= '%Y-%m-%dT%H:%M:%S.%f%z') 

In [None]:
import pandas as pd
#string
df = pd.DataFrame({'DOB': {0: '26/1/2016', 1: '26/1/2016'}})
print (df)

#datetime format
df['DOB'] = pd.to_datetime(df.DOB)
print (df)

#change the datetimeformat
df['DOB1'] = df['DOB'].dt.strftime('%m/%d/%Y')
print (df)

df['DOB2'] = df['DOB'].dt.strftime('%Y%m%d')
print (df)

## SQL alchemy Access

In [None]:
import sqlalchemy_access