In [1]:
from tabula import read_pdf
from tabulate import tabulate
import pandas as pd
import numpy as np

Using tabula to extract the tables from the pdf, but they are spread across different pages, so this will gather a list of dfs instead of a single df

In [2]:
# Extract every table in the PDF; tabula returns one DataFrame per detected table.
# A forward slash is used because "\m" is an invalid escape sequence in a normal
# string literal and a backslash separator only resolves on Windows.
df_list = read_pdf("Data/mcls_dlrs_phgs.pdf", pages='all')


The codes represent the following: 
- MCL - Maximum contaminant level
- DLR - detection limits for reporting
- PHG - public health goals (often smells, tastes... )
- MCLG - maximum contaminant level goal

In [3]:
len(df_list)

15

There are 15 dataframes extracted from the pdf

In [4]:
df_list[0].head()

Unnamed: 0,State Regulated,State,State.1,State PHG,State.2,Federal,Federal.1
0,Inorganic Chemical,MCL,DLR,,Date of,MCL,MCLG
1,Contaminant,,,,PHG,,
2,Aluminum,1,0.05,0.6,2001,--,--
3,Antimony,0.006,0.006,0.001,2016,0.006,0.006
4,Arsenic,0.010,0.002,4e-06,2004,0.010,zero


In [141]:
df_list[1].head()

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,"Chromium,\rHexavalent - 0.01-\rmg/L MCL & 0.00...",--,--,2e-05,2011,--,--
1,Cyanide,0.15,0.1,0.15,1997,0.2,0.2
2,Fluoride,2,0.1,1.0,1997,4.0,4.0
3,Mercury (inorganic),0.002,0.001,0.0012,1999\r(rev2005)*,0.002,0.002
4,Nickel,0.1,0.01,0.012,2001,--,--


In [144]:
df_list[11].head()

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Federal\rMCLG
0,Polychlorinated\rbiphenyls (PCBs),0.0005,0.0005,9e-05,2007,0.0005,,zero
1,Simazine,0.004,0.001,0.004,2001,0.004,,0.004
2,Thiobencarb,0.07,0.001,0.042,2016,--,,--
3,Toxaphene,0.003,0.001,3e-05,2003,0.003,,zero
4,"1,2,3-\rTrichloropropane",0.00000\r5,0.00000\r5,7e-07,2009,--,,--


Notice that tables 1 and 13 have the titles and subtitles concatenated into the header, whereas table 0 has a second heading row, which does occur in the other tables as well.
I don't really care about the classification of the chemicals, so I'm going to remove the subheadings and change all of the headers to be the same: 

- Contaminant, 
- State_MCL, 
- State_DLR, (remove) 
- State_PHG, 
- PHG_Date, (remove)
- Federal_MCL, 
- Federal_MCLG (remove)

In [5]:
# Standardise the first seven column labels of every extracted table and turn
# the '--' placeholders into NaN. A few tables were extracted with an 8th
# column somewhere in the middle; that extra label is left as-is here and the
# affected tables are realigned once the naming is consistent.
standard_names = ("Contaminant", "State_MCL", "State_DLR", "State_PHG",
                  "PHG_Date", "Federal_MCL", "Federal_MCLG")

for df in df_list:
    # Map by position, because the labels tabula produced differ per table.
    relabel = {df.columns[position]: new_name
               for position, new_name in enumerate(standard_names)}
    df.rename(columns=relabel, inplace=True)
    df.replace('--', np.nan, inplace=True)


In [None]:
# contaminants = pd.concat(df_list, ignore_index=True)

In [None]:
# contaminants

In [None]:
# contaminants.drop(columns=[
#                     "State_DLR",
#                     "PHG_Date",
#                     "Federal_MCLG", 
#                     "Federal\rMCLG"], inplace=True)


After filtering the contaminants df in Views, I determined that removing every row with NaN in State_MCL eliminates the leftover rows that were created, as a byproduct of the tabulation, from cells that contained multiple lines.

In [None]:
# contaminants_filtered = contaminants.dropna(subset=['State_MCL'], how='all')

In [None]:
# contaminants_filtered = contaminants_filtered.loc[contaminants_filtered.State_MCL != "MCL"]

In [None]:
# contaminants_filtered.reset_index(drop=True, inplace=True)

In [None]:
# contaminants_filtered

There are several rows that need to be removed that were subtitles of the different tables - I filtered the State_MCL column for all containing "MCL", as these were the headers.  This was verified with the Federal and PHG columns - all yielded the same 4 rows
- 0 Inorganic Chemical
- 20 Copper and Lead
- 23 Radionuclides
- 107 Disinfection



At this point, I'm noticing that some of the columns for the Federal MCL have years in them, and it isn't just from one table, so the tables will sadly have to be cleaned individually. 

---

In [6]:
import copy
# Independent snapshot of the renamed tables, taken before the per-table
# clean-up cells below start modifying df_list.
# NOTE(review): df_list_mod is not referenced again in the cells visible here —
# confirm it is still needed.
df_list_mod = copy.deepcopy(df_list)

In [7]:
# Table 0: drop the rows that have no State_MCL value (heading fragments and
# continuation lines of multi-line cells) and the repeated 'MCL' heading row,
# then tag every remaining row with the default unit.
# The chain ends in .assign(), which returns a fresh DataFrame, so the later
# .loc assignments on df_list[0] do not raise SettingWithCopyWarning the way
# writing a column onto a .loc slice does.
df_list[0] = (
    df_list[0]
    .dropna(subset=['State_MCL'], how='all')
    .loc[lambda table: table.State_MCL != 'MCL']
    .assign(Units='mg/L')  # All units are mg/L unless otherwise specified
)


This next box fixes the specific rows

In [8]:
# Manual fixes for table 0. Row labels are the original extraction index
# (the index is not reset after filtering).
df_list[0].loc[4, ["Federal_MCLG"]] = [0]  # Arsenic: the string 'zero' becomes the number 0
df_list[0].loc[5, ["Contaminant", "State_MCL", "State_DLR", "State_PHG", "PHG_Date", "Federal_MCL", "Federal_MCLG", "Units"]] = [
    'Asbestos', 7.0, 0.2, 7.0, 2003, 7.0, 7.0, 'MFL']  # Strips the 'MFL' suffix from each value and records it in Units instead
# Shortens the long contaminant text to 'Chromium, Total' and replaces the
# 'withdrawn' PHG entry with NaN
df_list[0].loc[12, ["Contaminant", "State_PHG"]] = ['Chromium, Total', np.nan]



In [9]:
df_list[0]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
2,Aluminum,1.0,0.05,0.6,2001,,,mg/L
3,Antimony,0.006,0.006,0.001,2016,0.006,0.006,mg/L
4,Arsenic,0.01,0.002,4e-06,2004,0.01,0.0,mg/L
5,Asbestos,7.0,0.2,7.0,2003,7.0,7.0,MFL
9,Barium,1.0,0.1,2.0,2003,2.0,2.0,mg/L
10,Beryllium,0.004,0.001,0.001,2003,0.004,0.004,mg/L
11,Cadmium,0.005,0.001,4e-05,2006,0.005,0.005,mg/L
12,"Chromium, Total",0.05,0.01,,1999,0.1,0.1,mg/L


In [10]:
n = 1
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]['Units'] = 'mg/L'
# df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
1,Cyanide,0.15,0.1,0.15,1997,0.2,0.2,mg/L
2,Fluoride,2,0.1,1,1997,4.0,4.0,mg/L
3,Mercury (inorganic),0.002,0.001,0.0012,1999\r(rev2005)*,0.002,0.002,mg/L
4,Nickel,0.1,0.01,0.012,2001,,,mg/L
5,"Nitrate (as nitrogen,\rN)",10 as N,0.4,45 as NO3\r(=10 as N),2018,10.0,10.0,mg/L
6,Nitrite (as N),1 as N,0.4,1 as N,2018,1.0,1.0,mg/L
7,Nitrate + Nitrite (as\rN),10 as N,,10 as N,2018,,,mg/L
8,Perchlorate,0.006,0.002,0.001,2015,,,mg/L
9,Selenium,0.05,0.005,0.03,2010,0.05,0.05,mg/L
10,Thallium,0.002,0.001,0.0001,1999\r(rev2004),0.002,0.0005,mg/L


In [11]:

# Manual fixes for table n (n was set to 1 in the previous cell).
# NOTE(review): the preview above starts at row 1, so row 0 may already have
# been dropped; assigning to a missing label with .loc appends a new row whose
# other columns are NaN — confirm row 0 still exists at this point.
df_list[n].loc[0, ["Contaminant"]] = ['Chromium, Hexavalent']
# Keep only the revision year of "1999\r(rev2005)*".
df_list[n].loc[3, ["Contaminant", "PHG_Date"]] = ['Mercury', 2005]
# Move the "as N" qualifiers out of the numeric columns and into Units.
df_list[n].loc[5, ["Contaminant", "State_MCL", "State_PHG", "Units"]] = ['Nitrate', 10, 45, '10 as N mg/L']
df_list[n].loc[6, ["Contaminant", "State_MCL", "State_PHG", "Units"]] = ['Nitrite', 1, 1, '1 as N mg/L']
df_list[n].loc[7, ["Contaminant", "State_MCL", "State_PHG", "Units"]] = ['Nitrate + Nitrite', 10, 10, '10 as N mg/L']
# Keep only the revision year of "1999\r(rev2004)".
df_list[n].loc[10, ["PHG_Date"]] = [2004]
# df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
1,Cyanide,0.15,0.1,0.15,1997.0,0.2,0.2,mg/L
2,Fluoride,2.0,0.1,1.0,1997.0,4.0,4.0,mg/L
3,Mercury,0.002,0.001,0.0012,2005.0,0.002,0.002,mg/L
4,Nickel,0.1,0.01,0.012,2001.0,,,mg/L
5,Nitrate,10.0,0.4,45.0,2018.0,10.0,10.0,10 as N mg/L
6,Nitrite,1.0,0.4,1.0,2018.0,1.0,1.0,1 as N mg/L
7,Nitrate + Nitrite,10.0,,10.0,2018.0,,,10 as N mg/L
8,Perchlorate,0.006,0.002,0.001,2015.0,,,mg/L
9,Selenium,0.05,0.005,0.03,2010.0,0.05,0.05,mg/L
10,Thallium,0.002,0.001,0.0001,2004.0,0.002,0.0005,mg/L


In [12]:
n = 2
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]['Units'] = 'mg/L'
# df_list[n]


In [13]:
# Table n (n == 2 from the previous cell): store Lead's federal MCLG as a number.
# NOTE(review): the preview below still contains the two heading rows and has no
# Units column, because the filter lines in the previous cell are commented out.
df_list[n].loc[3, ["Federal_MCLG"]] = [0.0]
df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Copper and Lead,MCL,DLR,PHG,Date of,MCL,MCLG
1,Contaminant,,,,PHG,,
2,Copper,1.3,0.05,0.3,2008,1.3,1.3
3,Lead,0.015,0.005,0.0002,2009,0.015,0.0


In [147]:
n = 3
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'mrem/yr']
# df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
2,Gross alpha particle,15,3.0,none,,15,zero
7,Gross beta particle,4,4.0,none,,4,zero
14,Radium-226 + Radium-,5,,,,5,zero


In [148]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 3, radionuclides): tidy the names, replace the textual
# 'none' / 'zero' entries with NaN / 0.0 and record the unit per row.
# The Units column is not created beforehand (line above is commented out);
# it comes into existence through these .loc assignments.
df_list[n].loc[2, ["Contaminant", "State_PHG", "PHG_Date","Federal_MCLG", "Units"]] = ["Gross Alpha Particle", np.nan, np.nan, 0.0, 'pCi/L']
# NOTE(review): the previous cell filters a 'mrem/yr' row out of this table, which
# suggests the gross beta limit is expressed in mrem/yr rather than pCi/L — confirm.
df_list[n].loc[7, ["Contaminant", "State_PHG", "PHG_Date", "Federal_MCLG", "Units"]] = ["Gross Beta Particle", np.nan, np.nan, 0.0, 'pCi/L']
# The name was cut off at "Radium-226 + Radium-" by the extraction.
df_list[n].loc[14, ["Contaminant", "State_DLR", "State_PHG", "PHG_Date", "Federal_MCLG", "Units"]] = [
    'Radium-226 + Radium-228', np.nan, np.nan, np.nan, 0.0, 'pCi/L']
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
2,Gross Alpha Particle,15,3.0,,,15,0.0,pCi/L
7,Gross Beta Particle,4,4.0,,,4,0.0,pCi/L
14,Radium-226 + Radium-228,5,,,,5,0.0,pCi/L


In [None]:
# df_list[n]['Units'] = 'mg/L'
# df_list[n].loc[5, ["Contaminant", "State_MCL", "State_DLR", "State_PHG", "PHG_Date", "Federal_MCL", "Federal_MCLG", "Units"]] = []
# df_list[n]


In [149]:
n = 4
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Federal\rMCLG
0,Strontium-90,8,2,0.35,,2006,,
1,Tritium,"""20,000""","""1,000""",400.0,,2006,,
2,Uranium,20,1,0.43,,2001,30 μg/L,zero


Note: In the table above, an extra (empty) column was inserted where PHG_Date should be, so all of the following values are shifted one column to the right. This has to be adjusted.

In [150]:
# Table n (n == 4) was extracted with an empty column in the PHG_Date position,
# pushing the dates and both federal columns one place to the right and leaving
# the last column with the raw label 'Federal\rMCLG'.
# The realignment is only applied while that stray label is still present:
# running it a second time would otherwise drop the real PHG_Date column.
if 'Federal\rMCLG' in df_list[n].columns:
    df_list[n].drop(columns='PHG_Date', inplace=True)
    df_list[n].rename(columns={'Federal_MCL': 'PHG_Date',
                               'Federal_MCLG': 'Federal_MCL',
                               'Federal\rMCLG': 'Federal_MCLG'}, inplace=True)
df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Strontium-90,8,2,0.35,2006,,
1,Tritium,"""20,000""","""1,000""",400.0,2006,,
2,Uranium,20,1,0.43,2001,30 μg/L,zero


In [152]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 4): record units per row and make the values numeric.
# The Units column is created by these .loc assignments (line above is commented out).
df_list[n].loc[0, ["Federal_MCL", "Federal_MCLG", "Units"]] = [np.nan, np.nan, 'pCi/L']
# Tritium: replaces the quoted, comma-separated strings ("20,000", "1,000") with numbers.
df_list[n].loc[1, ["State_MCL", "State_DLR", "State_PHG", "Federal_MCL", "Federal_MCLG", "Units"]] = [20000, 1000, 400, np.nan, np.nan, 'pCi/L']
# Uranium: the federal MCL is printed as "30 μg/L"; keep the number and note the
# differing unit in the Units text. 'zero' becomes 0.0.
df_list[n].loc[2, ["Federal_MCL", "Federal_MCLG", 'Units']] = [30, 0.0, 'pCi/L (ug/L for Federal_MCL)']
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Strontium-90,8,2,0.35,2006,,,pCi/L
1,Tritium,20000,1000,400.0,2006,,,pCi/L
2,Uranium,20,1,0.43,2001,30.0,0.0,pCi/L (ug/L for Federal_MCL)


Now the above matches the others

In [153]:
n = 5
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Benzene,0.001,0.0005,0.00015,2001,0.005,zero
1,Carbon tetrachloride,0.0005,0.0005,0.0001,2000,0.005,zero
2,"1,2-Dichlorobenzene",0.6,0.0005,0.6,1997\r(rev2009),0.6,0.6
3,"1,4-Dichlorobenzene (p-\rDCB)",0.005,0.0005,0.006,1997,0.075,0.075
4,"1,1-Dichloroethane\r(1,1-DCA)",0.005,0.0005,0.003,2003,,
5,"1,2-Dichloroethane\r(1,2-DCA)",0.0005,0.0005,0.0004,1999\r(rev2005),0.005,zero
6,"1,1-Dichloroethylene\r(1,1-DCE)",0.006,0.0005,0.01,1999,0.007,0.007


In [154]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 5): 'zero' -> 0.0, revised PHG dates -> revision year,
# and contaminant names split across lines ("\r") -> single-line names.
df_list[n].loc[0, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[1, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[2, ["PHG_Date"]] = [2009]
# NOTE(review): unlike the other names in this cell there is no space before "(p-DCB)" — confirm intended.
df_list[n].loc[3, ["Contaminant"]] = ['1,4-Dichlorobenzene(p-DCB)']
df_list[n].loc[4, ["Contaminant", "Federal_MCL", "Federal_MCLG"]] = ['1,1-Dichloroethane (1,1-DCA)', np.nan, np.nan]
df_list[n].loc[5, ["Contaminant", "PHG_Date", "Federal_MCLG"]] = ['1,2-Dichloroethane (1,2-DCA)', 2005, 0.0]
df_list[n].loc[6, ["Contaminant"]] = ['1,1-Dichloroethylene (1,1-DCE)']
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Benzene,0.001,0.0005,0.00015,2001,0.005,0.0,mg/L
1,Carbon tetrachloride,0.0005,0.0005,0.0001,2000,0.005,0.0,mg/L
2,"1,2-Dichlorobenzene",0.6,0.0005,0.6,2009,0.6,0.6,mg/L
3,"1,4-Dichlorobenzene(p-DCB)",0.005,0.0005,0.006,1997,0.075,0.075,mg/L
4,"1,1-Dichloroethane (1,1-DCA)",0.005,0.0005,0.003,2003,,,mg/L
5,"1,2-Dichloroethane (1,2-DCA)",0.0005,0.0005,0.0004,2005,0.005,0.0,mg/L
6,"1,1-Dichloroethylene (1,1-DCE)",0.006,0.0005,0.01,1999,0.007,0.007,mg/L


In [155]:
n = 6
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,"cis-1,2-Dichloroethylene",0.006,0.0005,0.013,2018,0.07,0.07
1,"trans-1,2-\rDichloroethylene",0.01,0.0005,0.05,2018,0.1,0.1
2,Dichloromethane\r(Methylene chloride),0.005,0.0005,0.004,2000,0.005,zero
3,"1,2-Dichloropropane",0.005,0.0005,0.0005,1999,0.005,zero
4,"1,3-Dichloropropene",0.0005,0.0005,0.0002,1999\r(rev2006),,
5,Ethylbenzene,0.3,0.0005,0.3,1997,0.7,0.7
6,Methyl tertiary butyl\rether (MTBE),0.013,0.003,0.013,1999,,
7,Monochlorobenzene,0.07,0.0005,0.07,2014,0.1,0.1
8,Styrene,0.1,0.0005,0.0005,2010,0.1,0.1
9,"1,1,2,2-\rTetrachloroethane",0.001,0.0005,0.0001,2003,0.1,0.1


In [156]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 6): join names that were split across lines, turn 'zero' into 0.0
# and keep only the revision year of revised PHG dates.
df_list[n].loc[1, ["Contaminant"]] = ['trans-1,2-Dichloroethylene']
df_list[n].loc[2, ["Contaminant", "Federal_MCLG"]] = ['Dichloromethane (Methylene chloride)', 0.0]
df_list[n].loc[3, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[4, ["PHG_Date", "Federal_MCL", "Federal_MCLG"]] = [2006, np.nan, np.nan]
df_list[n].loc[6, ["Contaminant", "Federal_MCL", "Federal_MCLG"]] = [
    'Methyl tertiary butyl ether (MTBE)', np.nan, np.nan]
df_list[n].loc[9, ["Contaminant"]] = ['1,1,2,2-Tetrachloroethane']
# NOTE(review): row 10 does not appear in the previews around this cell; if the
# label is missing, .loc appends a new row that is NaN everywhere except these
# two columns — confirm the row exists.
df_list[n].loc[10, ["Contaminant", "Federal_MCLG"]] = ['Tetrachloroethylene (PCE)', 0.0]


df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,"cis-1,2-Dichloroethylene",0.006,0.0005,0.013,2018,0.07,0.07,mg/L
1,"trans-1,2-Dichloroethylene",0.01,0.0005,0.05,2018,0.1,0.1,mg/L
2,Dichloromethane (Methylene chloride),0.005,0.0005,0.004,2000,0.005,0.0,mg/L
3,"1,2-Dichloropropane",0.005,0.0005,0.0005,1999,0.005,0.0,mg/L
4,"1,3-Dichloropropene",0.0005,0.0005,0.0002,2006,,,mg/L
5,Ethylbenzene,0.3,0.0005,0.3,1997,0.7,0.7,mg/L
6,Methyl tertiary butyl ether (MTBE),0.013,0.003,0.013,1999,,,mg/L
7,Monochlorobenzene,0.07,0.0005,0.07,2014,0.1,0.1,mg/L
8,Styrene,0.1,0.0005,0.0005,2010,0.1,0.1,mg/L
9,"1,1,2,2-Tetrachloroethane",0.001,0.0005,0.0001,2003,0.1,0.1,mg/L


In [157]:
n = 7
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Federal\rMCLG
0,"1,1,1-Trichloroethane\r(1,1,1-TCA)",0.2,,0.0005,1.0,2006,0.2,0.2
1,"1,1,2-Trichloroethane\r(1,1,2-TCA)",0.005,,0.0005,0.0003,2006,0.005,0.003
2,Trichloroethylene (TCE),0.005,,0.0005,0.0017,2009,0.005,zero
3,Trichlorofluoromethane\r(Freon 11),0.15,,0.005,1.3,2014,,
4,"""1,1,2-Trichloro-1,2,2-\rTrifluoroethane (Freo...",1.2,,0.01,4.0,1997\r(rev2011),,
5,Vinyl chloride,0.0005,,0.0005,5e-05,2000,0.002,zero
6,Xylenes,1.75,,0.0005,1.8,1997,10.0,10


In [158]:
# Table n (n == 7) was extracted with an empty column in the State_DLR position,
# so every value from State_DLR onwards sits one column too far to the right and
# the last column still carries the raw label 'Federal\rMCLG'.
# Only realign while that stray label is present: a second run would otherwise
# drop the real State_DLR column and shift the data again.
if 'Federal\rMCLG' in df_list[n].columns:
    df_list[n].drop(columns='State_DLR', inplace=True)
    df_list[n].rename(columns={'State_PHG': 'State_DLR',
                               'PHG_Date': 'State_PHG',
                               'Federal_MCL': 'PHG_Date',
                               'Federal_MCLG': 'Federal_MCL',
                               'Federal\rMCLG': 'Federal_MCLG'}, inplace=True)


In [159]:
df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,"1,1,1-Trichloroethane\r(1,1,1-TCA)",0.2,0.0005,1.0,2006,0.2,0.2
1,"1,1,2-Trichloroethane\r(1,1,2-TCA)",0.005,0.0005,0.0003,2006,0.005,0.003
2,Trichloroethylene (TCE),0.005,0.0005,0.0017,2009,0.005,zero
3,Trichlorofluoromethane\r(Freon 11),0.15,0.005,1.3,2014,,
4,"""1,1,2-Trichloro-1,2,2-\rTrifluoroethane (Freo...",1.2,0.01,4.0,1997\r(rev2011),,
5,Vinyl chloride,0.0005,0.0005,5e-05,2000,0.002,zero
6,Xylenes,1.75,0.0005,1.8,1997,10.0,10


In [None]:
# df_list[n]['Units'] = 'mg/L'
# df_list[n].loc[5, ["Contaminant", "State_MCL", "State_DLR",
#                    "State_PHG", "PHG_Date", "Federal_MCL", "Federal_MCLG"]] = []
# df_list[n]


In [161]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 7, after realignment): single-line names, 'zero' -> 0.0,
# revised PHG date -> revision year.
df_list[n].loc[0, ["Contaminant"]] = ['1,1,1-Trichloroethane (1,1,1-TCA)']
df_list[n].loc[1, ["Contaminant"]] = ['1,1,2-Trichloroethane (1,1,2-TCA)']
df_list[n].loc[2, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[3, ["Contaminant", "Federal_MCL", "Federal_MCLG"]] = [
    'Trichlorofluoromethane (Freon 11)', np.nan, np.nan]
# The extracted name is truncated/quoted; the PHG date reads "1997\r(rev2011)".
df_list[n].loc[4, ["Contaminant", "PHG_Date", "Federal_MCL", "Federal_MCLG"]] = [
    '1,1,2-Trichloro-1,2,2-Trifluoroethane (Freon 113)', 2011, np.nan, np.nan]
df_list[n].loc[5, ["Federal_MCLG"]] = [0.0]


df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,"1,1,1-Trichloroethane (1,1,1-TCA)",0.2,0.0005,1.0,2006,0.2,0.2,mg/L
1,"1,1,2-Trichloroethane (1,1,2-TCA)",0.005,0.0005,0.0003,2006,0.005,0.003,mg/L
2,Trichloroethylene (TCE),0.005,0.0005,0.0017,2009,0.005,0.0,mg/L
3,Trichlorofluoromethane (Freon 11),0.15,0.005,1.3,2014,,,mg/L
4,"1,1,2-Trichloro-1,2,2-Trifluoroethane (Freon 113)",1.2,0.01,4.0,2011,,,mg/L
5,Vinyl chloride,0.0005,0.0005,5e-05,2000,0.002,0.0,mg/L
6,Xylenes,1.75,0.0005,1.8,1997,10.0,10.0,mg/L


This table has the same issue as table 4, but here State_DLR is the column that needs to be removed.

In [162]:
n = 8
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Alachlor,0.002,0.001,0.004,1997,0.002,zero
1,Atrazine,0.001,0.0005,0.00015,1999,0.003,0.003
2,Bentazon,0.018,0.002,0.2,1999\r(rev2009),,


In [163]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 8): Alachlor's 'zero' MCLG -> 0.0; Bentazon's "1999\r(rev2009)" -> 2009.
df_list[n].loc[0, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[2, ["PHG_Date"]] = [2009]
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Alachlor,0.002,0.001,0.004,1997,0.002,0.0,mg/L
1,Atrazine,0.001,0.0005,0.00015,1999,0.003,0.003,mg/L
2,Bentazon,0.018,0.002,0.2,2009,,,mg/L


In [164]:
n = 9
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Benzo(a)pyrene,0.0002,0.0001,7e-06,2010,0.0002,zero
1,Carbofuran,0.018,0.005,0.0007,2016,0.04,0.04
2,Chlordane,0.0001,0.0001,3e-05,1997\r(rev2006),0.002,zero
3,Dalapon,0.2,0.01,0.79,1997\r(rev2009),0.2,0.2
4,"1,2-Dibromo-3-\rchloropropane\r(DBCP)",0.0002,1e-05,3e-06,2020,0.0002,zero
5,"2,4-\rDichlorophenoxyaceti\rc acid (2,4-D)",0.07,0.01,0.02,2009,0.07,0.07
6,Di(2-\rethylhexyl)adipate,0.4,0.005,0.2,2003,0.4,0.4
7,Di(2-\rethylhexyl)phthalate\r(DEHP),0.004,0.003,0.012,1997,0.006,zero
8,Dinoseb,0.007,0.002,0.014,1997\r(rev2010),0.007,0.007
9,Diquat,0.02,0.004,0.006,2016,0.02,0.02


In [165]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 9): 'zero' -> 0.0, revised PHG dates -> revision year, and names
# that were broken across lines ("\r") rebuilt as single-line names.
df_list[n].loc[0, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[2, ["PHG_Date", "Federal_MCLG"]] = [2006, 0.0]
df_list[n].loc[3, ["PHG_Date"]] = [2009]
df_list[n].loc[4, ["Contaminant", "Federal_MCLG"]] = [
    '1,2-Dibromo-3-chloropropane (DBCP)', 0.0]
# The line break fell inside the word ("Dichlorophenoxyaceti\rc"), so it is joined without a space.
df_list[n].loc[5, ["Contaminant"]] = [
    '2,4-Dichlorophenoxyacetic acid (2,4-D)']
df_list[n].loc[6, ["Contaminant"]] = ['Di(2-ethylhexyl)adipate']
df_list[n].loc[7, ["Contaminant", "Federal_MCLG"]] = [
    'Di(2-ethylhexyl)phthalate (DEHP)', 0.0]
df_list[n].loc[8, ["PHG_Date"]] = [2010]
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Benzo(a)pyrene,0.0002,0.0001,7e-06,2010,0.0002,0.0,mg/L
1,Carbofuran,0.018,0.005,0.0007,2016,0.04,0.04,mg/L
2,Chlordane,0.0001,0.0001,3e-05,2006,0.002,0.0,mg/L
3,Dalapon,0.2,0.01,0.79,2009,0.2,0.2,mg/L
4,"1,2-Dibromo-3-chloropropane (DBCP)",0.0002,1e-05,3e-06,2020,0.0002,0.0,mg/L
5,"2,4-Dichlorophenoxyacetic acid (2,4-D)",0.07,0.01,0.02,2009,0.07,0.07,mg/L
6,Di(2-ethylhexyl)adipate,0.4,0.005,0.2,2003,0.4,0.4,mg/L
7,Di(2-ethylhexyl)phthalate (DEHP),0.004,0.003,0.012,1997,0.006,0.0,mg/L
8,Dinoseb,0.007,0.002,0.014,2010,0.007,0.007,mg/L
9,Diquat,0.02,0.004,0.006,2016,0.02,0.02,mg/L


In [166]:
n = 10
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Endrin,0.002,0.0001,0.0003,2016,0.002,0.002
1,Ethylene dibromide\r(EDB),5e-05,2e-05,1e-05,2003,0.0000\r5,zero
2,Glyphosate,0.7,0.025,0.9,2007,0.7,0.7
3,Heptachlor,1e-05,1e-05,8e-06,1999,0.0004,zero
4,Heptachlor epoxide,1e-05,1e-05,6e-06,1999,0.0002,zero
5,Hexachlorobenzene,0.001,0.0005,3e-05,2003,0.001,zero
6,Hexachlorocyclopent\radiene,0.05,0.001,0.002,2014,0.05,0.05
7,Lindane,0.0002,0.0002,3.2e-05,1999\r(rev2005),0.0002,0.0002
8,Methoxychlor,0.03,0.01,9e-05,2010,0.04,0.04
9,Molinate,0.02,0.002,0.001,2008,,


In [167]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 10): single-line names, 'zero' -> 0.0, revised PHG date -> revision year.
# Ethylene dibromide: the federal MCL was split across lines ("0.0000\r5").
df_list[n].loc[1, ["Contaminant", "Federal_MCL", "Federal_MCLG"]] = [
    'Ethylene dibromide (EDB)', 0.00005, 0.0]
df_list[n].loc[3, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[4, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[5, ["Federal_MCLG"]] = [0.0]
df_list[n].loc[6, ["Contaminant"]] = ['Hexachlorocyclopentadiene']
df_list[n].loc[7, ["PHG_Date"]] = [2005]
# NOTE(review): row 11 does not appear in the previews around this cell; if the
# label is missing, .loc appends a new row containing only this value — confirm
# the row exists.
df_list[n].loc[11, ["Federal_MCLG"]] = [0.0]
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Endrin,0.002,0.0001,0.0003,2016,0.002,0.002,mg/L
1,Ethylene dibromide (EDB),5e-05,2e-05,1e-05,2003,5e-05,0.0,mg/L
2,Glyphosate,0.7,0.025,0.9,2007,0.7,0.7,mg/L
3,Heptachlor,1e-05,1e-05,8e-06,1999,0.0004,0.0,mg/L
4,Heptachlor epoxide,1e-05,1e-05,6e-06,1999,0.0002,0.0,mg/L
5,Hexachlorobenzene,0.001,0.0005,3e-05,2003,0.001,0.0,mg/L
6,Hexachlorocyclopentadiene,0.05,0.001,0.002,2014,0.05,0.05,mg/L
7,Lindane,0.0002,0.0002,3.2e-05,2005,0.0002,0.0002,mg/L
8,Methoxychlor,0.03,0.01,9e-05,2010,0.04,0.04,mg/L
9,Molinate,0.02,0.002,0.001,2008,,,mg/L


In [168]:
n = 11
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Federal\rMCLG
0,Polychlorinated\rbiphenyls (PCBs),0.0005,0.0005,0.00009,2007,0.0005,,zero
1,Simazine,0.004,0.001,0.004,2001,0.004,,0.004
2,Thiobencarb,0.07,0.001,0.042,2016,,,
3,Toxaphene,0.003,0.001,0.00003,2003,0.003,,zero
4,"1,2,3-\rTrichloropropane",0.00000\r5,0.00000\r5,0.0000007,2009,,,
5,"2,3,7,8-TCDD\r(dioxin)",3x10-8,5x10-9,5x10-11,2010,3x10-8,,zero
6,"2,4,5-TP (Silvex)",0.05,0.001,0.003,2014,0.05,,0.05


The table above is shifted as well, but in a different place from the previously shifted tables.

In [169]:
# Table n (n == 11) was extracted with an empty Federal_MCLG column, with the
# real goal values in an extra column labelled 'Federal\rMCLG'.
# Only swap them while the stray label is present: a second run would
# otherwise drop the real Federal_MCLG data.
if 'Federal\rMCLG' in df_list[n].columns:
    df_list[n].drop(columns='Federal_MCLG', inplace=True)
    df_list[n].rename(columns={'Federal\rMCLG': 'Federal_MCLG'}, inplace=True)
df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Polychlorinated\rbiphenyls (PCBs),0.0005,0.0005,0.00009,2007,0.0005,zero
1,Simazine,0.004,0.001,0.004,2001,0.004,0.004
2,Thiobencarb,0.07,0.001,0.042,2016,,
3,Toxaphene,0.003,0.001,0.00003,2003,0.003,zero
4,"1,2,3-\rTrichloropropane",0.00000\r5,0.00000\r5,0.0000007,2009,,
5,"2,3,7,8-TCDD\r(dioxin)",3x10-8,5x10-9,5x10-11,2010,3x10-8,zero
6,"2,4,5-TP (Silvex)",0.05,0.001,0.003,2014,0.05,0.05


In [170]:
# Table n (n == 11): default unit for every row, then per-row fixes.
df_list[n]['Units'] = 'mg/L'
df_list[n].loc[0, ["Contaminant", "Federal_MCLG"]] = [
    'Polychlorinated biphenyls (PCBs)', 0.0]
df_list[n].loc[3, ["Federal_MCLG"]] = [0.0]
# The limits were split across lines ("0.00000\r5").
df_list[n].loc[4, ["Contaminant", "State_MCL", "State_DLR"]] = [
    '1,2,3-Trichloropropane', 0.000005, 0.000005]
# Dioxin limits are printed as "3x10-8" etc.; store them as floats.
# NOTE(review): the preview below shows these as 0.0 — presumably display
# rounding rather than lost precision; confirm.
df_list[n].loc[5, ["Contaminant", "State_MCL", "State_DLR", "State_PHG", "Federal_MCL", "Federal_MCLG"]] = [
    '2,3,7,8-TCDD (dioxin)', 3.0e-8, 5.0e-9, 5.0e-11, 3.0e-8, 0.0]
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Polychlorinated biphenyls (PCBs),0.0005,0.0005,9e-05,2007,0.0005,0.0,mg/L
1,Simazine,0.004,0.001,0.004,2001,0.004,0.004,mg/L
2,Thiobencarb,0.07,0.001,0.042,2016,,,mg/L
3,Toxaphene,0.003,0.001,3e-05,2003,0.003,0.0,mg/L
4,"1,2,3-Trichloropropane",5e-06,5e-06,7e-07,2009,,,mg/L
5,"2,3,7,8-TCDD (dioxin)",0.0,0.0,0.0,2010,0.0,0.0,mg/L
6,"2,4,5-TP (Silvex)",0.05,0.001,0.003,2014,0.05,0.05,mg/L


In [171]:
n = 12
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Total Trihalomethanes,0.08,,,,0.08,


In [172]:
# df_list[n]['Units'] = 'mg/L'
df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
0,Total Trihalomethanes,0.08,,,,0.08,,mg/L


In [173]:
n = 13
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
2,Haloacetic Acids (five)\r(HAA5),0.06,,,,0.06,
8,Bromate,0.01,0.0050**,0.0001,2009.0,0.01,zero
9,Chlorite,1.0,0.020,0.05,2009.0,1.0,0.8


In [175]:
# df_list[n]['Units'] = 'mg/L'
# Table n (n == 13): single-line name for HAA5; Bromate's DLR "0.0050**" loses
# its footnote marker and its 'zero' MCLG becomes 0.0.
# NOTE(review): the preview below contains an extra row labelled 4 that holds only
# Federal_MCLG and Units. Nothing in this cell writes to row 4, so it looks like
# leftover state from an earlier .loc assignment to a missing label — confirm and
# remove it before concatenating.
df_list[n].loc[2, ["Contaminant"]] = ['Haloacetic Acids (five) (HAA5)']
df_list[n].loc[8, ["State_DLR", "Federal_MCLG"]] = [0.0050, 0.0]
df_list[n]


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units
2,Haloacetic Acids (five) (HAA5),0.06,,,,0.06,,mg/L
8,Bromate,0.01,0.005,0.0001,2009.0,0.01,0.0,mg/L
9,Chlorite,1.0,0.02,0.05,2009.0,1.0,0.8,mg/L
4,,,,,,,0.0,mg/L


Table 14 contains an unregulated contaminant, so it was excluded, since it has no state regulatory value.

In [177]:
n = 14
# df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)
# df_list[n] = df_list[n].loc[df_list[n].State_MCL != 'MCL']
# df_list[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_list[n].dropna(subset=['State_MCL'], how='all', inplace=True)


Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Units


Changes that were made to each of the tables: 
- All `\r` (carriage return) characters were either replaced with a space or removed outright, depending on the chemical name
- Names spread over multiple lines were reduced to a single line, and chemical names were fixed so that the full contaminant name is recognized
- Units were moved into an additional column, both to show where they differ and so that the values in the measurable columns can be converted to numeric data types
- Many of the zero values were entered as the string zero. These all had to be changed to 0.0
- All of the '--' were changed to NaN values using numpy np.nan




Now that all of the rows have the correct columns with all the garbage removed, we can concatenate the tables

In [185]:
contaminants = pd.concat(df_list, ignore_index=True)


In [186]:
contaminants

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG,Federal\rMCLG
0,Aluminum,1,0.05,0.6,2001,--,--,
1,Antimony,0.006,0.006,0.001,2016,0.006,0.006,
2,Arsenic,0.010,0.002,0.000004,2004,0.010,zero,
3,Asbestos (MFL =,7 MFL,0.2 MFL,7 MFL,2003,7 MFL,7 MFL,
4,Barium,1,0.1,2,2003,2,2,
...,...,...,...,...,...,...,...,...
99,Monobromoacetic Acid,--,0.0010,0.025,2022,--,--,
100,Dibromoacetic Acid,--,0.0010,0.00003,2022,--,--,
101,Bromate,0.010,0.0050**,0.0001,2009,0.01,zero,
102,Chlorite,1.0,0.020,0.05,2009,1,0.8,


So the table now only has the values of the actual contaminants. Next steps: 

1. Per the documentation, all values are in mg/L unless otherwise noted, this is NOT the case with the actual measurements from the Water Data
2. There are problematic rows that have different units: 
    - Asbestos (MFL=million fibers per liter; for fibers >10 microns long)
    - Nitrate, Nitrite, and Nitrate + Nitrite have weird conversion factors - need to coordinate these with the database
    - Gross beta particle activity - mrem/yr
    - Tritium is in quotation marks with comma separators
    - Uranium is in ug/L

In [213]:
def Decontaminate(df_list):
    """Standardise, filter and concatenate the tables extracted from the PDF.

    For every DataFrame in ``df_list`` the first seven columns are renamed to
    the standard labels and rows without a ``State_MCL`` value are dropped
    (both in place, as before). Rows whose ``State_MCL`` is one of the heading
    markers ``'MCL'`` / ``'mrem/yr'`` are then excluded from the result.

    The previous version wrote ``df = df.loc[...]`` inside the loop, which only
    rebound the loop variable, so the heading rows were still present in the
    concatenated output. The filtered frames are now collected and
    concatenated explicitly.

    Tables extracted with an eighth, shifted column are not realigned here.

    NOTE: a later cell in this notebook defines ``Decontaminate(filename)``;
    once that cell has run it shadows this function.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Tables as returned by tabula, each with at least seven columns.

    Returns
    -------
    pandas.DataFrame
        All remaining rows with a fresh 0..n-1 index; an empty frame with the
        standard columns when ``df_list`` is empty.
    """
    column_names = ["Contaminant", "State_MCL", "State_DLR", "State_PHG",
                    "PHG_Date", "Federal_MCL", "Federal_MCLG"]
    heading_markers = ['MCL', 'mrem/yr']

    cleaned = []
    for df in df_list:
        # Rename by position: the labels tabula produced differ per table.
        df.rename(columns={df.columns[i]: name
                           for i, name in enumerate(column_names)},
                  inplace=True)
        df.dropna(subset=['State_MCL'], how='all', inplace=True)
        cleaned.append(df.loc[~df.State_MCL.isin(heading_markers)])

    if not cleaned:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=column_names)
    return pd.concat(cleaned, ignore_index=True)


In [320]:
def Decontaminate_Labels(df_list):
    """Rename the first seven columns of every table to the standard labels.

    Columns are matched by position, not by their current name, and each
    DataFrame is modified in place. Columns beyond the seventh keep their
    label.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Tables with at least seven columns each.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in.
    """
    standard_names = ("Contaminant", "State_MCL", "State_DLR", "State_PHG",
                      "PHG_Date", "Federal_MCL", "Federal_MCLG")
    for table in df_list:
        mapping = {}
        for position, label in enumerate(standard_names):
            mapping[table.columns[position]] = label
        table.rename(columns=mapping, inplace=True)
    return df_list
        

def Decontaminate_Nulls(df_list):
    """Replace the ``'--'`` placeholder with NaN in every table of ``df_list``.

    The previous version called ``DataFrame.replace`` without keeping its
    result, so the tables were never changed. Each list entry is now replaced
    by the cleaned frame.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Tables to clean; the list is updated in place.

    Returns
    -------
    list of pandas.DataFrame
        The same list object, returned for parity with the other
        ``Decontaminate_*`` helpers (callers that ignore the return value are
        unaffected).
    """
    import numpy as np
    for n in range(len(df_list)):
        df_list[n] = df_list[n].replace('--', np.nan)
    return df_list


def Decontaminate_Rows(df_list):
    """Remove heading and continuation rows from every table in ``df_list``.

    A row is removed when its ``State_MCL`` is missing or equals one of the
    heading markers ``'MCL'`` / ``'mrem/yr'``. Original index labels are kept.

    Each list entry is replaced by an independent copy of the filtered frame.
    Storing the bare ``.loc`` slice (as before) made later in-place edits and
    repeated calls emit ``SettingWithCopyWarning``.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Tables that already have a ``State_MCL`` column; the list is updated
        in place.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in.
    """
    heading_markers = ['MCL', 'mrem/yr']
    for n in range(len(df_list)):
        with_value = df_list[n].dropna(subset=['State_MCL'], how='all')
        keep = ~with_value.State_MCL.isin(heading_markers)
        df_list[n] = with_value.loc[keep].copy()
    return df_list


def Decontaminate_Lists(df_list):
    """Realign the tables that were extracted with an extra, shifted column.

    Tables at positions 4, 7 and 11 come back with eight columns: one empty
    column in the middle and a trailing column labelled ``'Federal\\rMCLG'``.
    For each of them the empty column is dropped and the following columns are
    renamed so the data sits under the standard labels again. The frames are
    modified in place.

    A table is only realigned while the stray ``'Federal\\rMCLG'`` column is
    still present. Without that check a second call would drop a column that
    now holds real data. Positions beyond the end of ``df_list`` are skipped.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Tables whose first seven columns already carry the standard labels.

    Returns
    -------
    list of pandas.DataFrame
        The same list object that was passed in.
    """
    stray_label = 'Federal\rMCLG'
    # table position -> (empty column to drop, renames that shift the rest back)
    realignments = {
        4: ('PHG_Date', {'Federal_MCL': 'PHG_Date',
                         'Federal_MCLG': 'Federal_MCL',
                         stray_label: 'Federal_MCLG'}),
        7: ('State_DLR', {'State_PHG': 'State_DLR',
                          'PHG_Date': 'State_PHG',
                          'Federal_MCL': 'PHG_Date',
                          'Federal_MCLG': 'Federal_MCL',
                          stray_label: 'Federal_MCLG'}),
        11: ('Federal_MCLG', {stray_label: 'Federal_MCLG'}),
    }
    for n, (empty_column, renames) in realignments.items():
        if n >= len(df_list) or stray_label not in df_list[n].columns:
            continue  # table absent or already realigned
        df_list[n].drop(columns=empty_column, inplace=True)
        df_list[n].rename(columns=renames, inplace=True)
    return df_list


def Decontaminate(filename):
    """Read every table from the PDF at ``filename`` and return one cleaned frame.

    The extracted tables are passed, in order, through
    ``Decontaminate_Labels``, ``Decontaminate_Rows`` and
    ``Decontaminate_Lists`` (each updates the list in place) and are then
    concatenated with a fresh index.

    NOTE(review): ``Decontaminate_Nulls`` is not part of this pipeline, so the
    ``'--'`` placeholders are returned unchanged, as the output below shows.
    This definition also reuses the name of the earlier
    ``Decontaminate(df_list)`` and replaces it once this cell runs.

    Parameters
    ----------
    filename : str
        Path of the PDF handed to ``read_pdf``.

    Returns
    -------
    pandas.DataFrame
        All cleaned tables stacked into a single frame.
    """
    tables = read_pdf(filename, pages='all')
    for stage in (Decontaminate_Labels, Decontaminate_Rows, Decontaminate_Lists):
        stage(tables)
    return pd.concat(tables, ignore_index=True)


In [314]:
test = Decontaminate_Labels(df_list)

In [315]:
test[0]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Inorganic Chemical,MCL,DLR,,Date of,MCL,MCLG
1,Contaminant,,,,PHG,,
2,Aluminum,1,0.05,0.6,2001,--,--
3,Antimony,0.006,0.006,0.001,2016,0.006,0.006
4,Arsenic,0.010,0.002,0.000004,2004,0.010,zero
5,Asbestos (MFL =,7 MFL,0.2 MFL,7 MFL,2003,7 MFL,7 MFL
6,million fibers per liter;,,,,,,
7,for fibers >10,,,,,,
8,microns long),,,,,,
9,Barium,1,0.1,2,2003,2,2


In [321]:
test2=Decontaminate_Rows(test)

In [322]:
test2[14]

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
3,N-Nitrosodimethylamine,--,--,3e-06,2006,--,--


In [313]:
# Forward slash instead of "Data\mcls...": "\m" is an invalid escape sequence in
# a normal string literal and a backslash separator only resolves on Windows.
filename = "Data/mcls_dlrs_phgs.pdf"
df_list = read_pdf(filename, pages='all')


In [323]:
Decontaminate(filename)

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Aluminum,1,0.05,0.6,2001,--,--
1,Antimony,0.006,0.006,0.001,2016,0.006,0.006
2,Arsenic,0.010,0.002,0.000004,2004,0.010,zero
3,Asbestos (MFL =,7 MFL,0.2 MFL,7 MFL,2003,7 MFL,7 MFL
4,Barium,1,0.1,2,2003,2,2
...,...,...,...,...,...,...,...
99,Monobromoacetic Acid,--,0.0010,0.025,2022,--,--
100,Dibromoacetic Acid,--,0.0010,0.00003,2022,--,--
101,Bromate,0.010,0.0050**,0.0001,2009,0.01,zero
102,Chlorite,1.0,0.020,0.05,2009,1,0.8


In [324]:
from Decontaminate import Decontaminate

In [325]:
Decontaminate(filename)

Unnamed: 0,Contaminant,State_MCL,State_DLR,State_PHG,PHG_Date,Federal_MCL,Federal_MCLG
0,Aluminum,1,0.05,0.6,2001,--,--
1,Antimony,0.006,0.006,0.001,2016,0.006,0.006
2,Arsenic,0.010,0.002,0.000004,2004,0.010,zero
3,Asbestos (MFL =,7 MFL,0.2 MFL,7 MFL,2003,7 MFL,7 MFL
4,Barium,1,0.1,2,2003,2,2
...,...,...,...,...,...,...,...
99,Monobromoacetic Acid,--,0.0010,0.025,2022,--,--
100,Dibromoacetic Acid,--,0.0010,0.00003,2022,--,--
101,Bromate,0.010,0.0050**,0.0001,2009,0.01,zero
102,Chlorite,1.0,0.020,0.05,2009,1,0.8
