In [2]:
!cd paylesshealth/ && dolt sql -q "select * from hospitals where cdm_indirect_url is not null;" -r csv > all_nulls.csv

In [1]:
import pandas as pd
# df = pd.read_csv('paylesshealth/hospitals.csv')
df = pd.read_csv('./paylesshealth-1/all_nulls.csv')
# df = df.head(100)

In [2]:
import aiohttp
import asyncio
import re
import cgi
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup

In [3]:
def create_headers(url):
    headers= {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
    "Host": urlparse(url).netloc,
    "Referer": urlparse(url).netloc
    }
    return headers

In [7]:
async def checker(session, url):
    find_strings = ["price transparency", "standard charges", "pricing", "billing", "prices", "price listing", "charges", "cms", "transparency"]
    url_string = ["price-transparency", "standard-charges", "pricing", "billing", "prices", "price-list", "price-listing", "charges", "cms", "transparency"]
    if pd.isna(url):
        return False
    
    if "/../" in url or "|" in url: # aiohttp is kinda not smart
        return False

    headers = create_headers(url)
    
    try:
        async with session.get(url, allow_redirects=True, ssl=False, raise_for_status=True, timeout=100, headers=headers) as r:
            if r:
                r = await r.text()
                soup = BeautifulSoup(r, "html.parser")
                
                r = r.lower()
                # Check for common price transparency names
                if any(string in r for string in find_strings):
                    matching_str = [string for string in find_strings if string in r]

                    possible_indirects = []
                    for t_url in soup.find_all('a'):
                        t_url = t_url.get("href")
                        
                        if t_url:
                            if any(string in t_url for string in url_string):
                                possible_indirects.append(t_url)

                    if not possible_indirects:
                        possible_indirects = pd.NA
                    
                    return possible_indirects, matching_str
                
                else:
                    return pd.NA, pd.NA
                    
    except aiohttp.ClientResponseError as e:
#         print("ERROR", e, url)
        return pd.NA, pd.NA

    except asyncio.TimeoutError:
        return pd.NA, pd.NA

    except aiohttp.ClientConnectionError:
        return pd.NA, pd.NA    
    
    except KeyboardInterrupt:
        import os; os.exit()
    except Exception as e:
        print(e)
        return pd.NA, pd.NA

In [5]:
from tqdm.asyncio import tqdm

async def main():
    async with aiohttp.ClientSession(raise_for_status = True) as session:
        df[["possible_indirect", "found_string"]] = await tqdm.gather(*(checker(session, url) for url in df["homepage"]), leave=True, position=0)
        return df

In [8]:
t = await main()

 46%|████▌     | 823/1795 [00:56<01:38,  9.82it/s]

'utf-8' codec can't decode byte 0x97 in position 79817: invalid start byte


100%|██████████| 1795/1795 [01:41<00:00, 17.70it/s] 
  return asarray(a).ndim


In [9]:
t

Unnamed: 0,ccn,name,state,city,homepage,cdm_indirect_url,cdm_url,possible_indirect,found_string
0,154064,ASSURANCE HEALTH PSYCHIATRIC HOSPITAL,IN,INDIANAPOLIS,http://assurancehealthsystem.com/,,,,
1,364056,ASSURANCE HEALTH CINCINNATI LLC,OH,CINCINNATI,http://assurancehealthsystem.com/,,,,
2,364059,ASSURANCE HEALTH HUDSON LLC,OH,HUDSON,http://assurancehealthsystem.com/,,,,
3,364066,ASSURANCE HEALTH TOLEDO LLC,OH,SYLVANIA,http://assurancehealthsystem.com/,,,,
4,050842,CENTRAL VALLEY SURGICAL CENTER LLC,CA,BAKERSFIELD,http://centralvalleysc.com/,,,,
...,...,...,...,...,...,...,...,...,...
1790,270091,YELLOWSTONE SURGERY CENTER LLC,MT,BILLINGS,https://yellowstonesurgerycenter.com/,,,[https://yellowstonesurgerycenter.com/about-ye...,"[pricing, billing, charges, cms]"
1791,270092,YELLOWSTONE SURGERY CENTER WEST,MT,BILLINGS,https://yellowstonesurgerycenter.com/,,,[https://yellowstonesurgerycenter.com/about-ye...,"[pricing, billing, charges, cms]"
1792,180154,"PINEVILLE COMMUNITY HEALTH CENTER, INC",KY,PINEVILLE,https://yourhometownhospital.org/,,,,
1793,250169,COMPREHENSIVE PAIN MANAGEMENT LLC,MS,TUPELO,https://yourpaincenter.com/,,,,


In [19]:
t.dropna(subset=["found_string"], inplace=True)
t

Unnamed: 0,ccn,name,state,city,homepage,cdm_indirect_url,cdm_url,possible_indirect,found_string
9,400007,RYDER MEMORIAL HOSPITAL INC,PR,HUMACAO,http://hryder.org/,,,,"[cms, transparency]"
10,394033,MONTGOMERY COUNTY EMERGENCY SERVICE,PA,NORRISTOWN,http://mces.org/WordPress/,,,[https://www.ibxtpa.com/transparency-in-covera...,"[cms, transparency]"
11,371306,MERCY HEALTH LOVE COUNTY,OK,MARIETTA,http://mercyhealthlovecounty.com/,,,,"[pricing, billing, prices]"
14,194079,"RED RIVER BEHAVIORAL CENTER, LLC",LA,BOSSIER CITY,http://redriverbc.com/,,,,[pricing]
17,111300,SOUTHWEST GEORGIA REGIONAL MEDICAL CENTER,GA,CUTHBERT,http://southwestgeorgiaregionalmedicalcenter.c...,,,,"[cms, transparency]"
...,...,...,...,...,...,...,...,...,...
1786,521347,WESTERN WISCONSIN HEALTH,WI,BALDWIN,https://www.wwhealth.org/,,,[https://www.wwhealth.org/your-visit/price-tra...,"[price transparency, charges, transparency]"
1787,361329,WYANDOT MEMORIAL HOSPITAL,OH,UPPER SANDUSKY,https://www.wyandotmemorial.org/,,,[https://www.wyandotmemorial.org/pricing/],"[pricing, transparency]"
1789,281336,YORK GENERAL HOSPITAL,NE,YORK,https://www.yorkgeneral.org/,,,,"[price transparency, transparency]"
1790,270091,YELLOWSTONE SURGERY CENTER LLC,MT,BILLINGS,https://yellowstonesurgerycenter.com/,,,[https://yellowstonesurgerycenter.com/about-ye...,"[pricing, billing, charges, cms]"


In [20]:
t.to_csv("possible_indirects.csv", index=False)