Goals
1. Function that takes a URL as its argument
2. Use urllib to get the response from the URL, ignoring SSL certificate errors
3. Use Beautiful Soup to read the content via the HTML parser
4. Next, loop through the elements of the HTML, using a regex to find strings that match "City, StateAbbrev ZipCode"
5. Then retrieve the parent of that tag and match the full street address using a regex
6. Once found, append it to a list, preceded by the URL

In [1]:
import re
import ssl
import urllib
import urllib.error
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [3]:
# Load the 'DATA' sheet of the market-analysis workbook into a DataFrame.
cpas = pd.read_excel('./US_Firms_Market_Analysis_2023.xlsx',sheet_name='DATA')

In [5]:
# Drop, in place, every row whose 'URL' cell is missing.
cpas.dropna(subset=['URL'], inplace=True)

## Script to grab url from dataset

In [6]:
# Pull the first "<optional scheme>host.com" token out of every distinct URL cell.
# Compiled once, outside the loop.
url_pattern = re.compile(r'((?:https?://)?[^\s()]+\.com)')

urls = []
for url in cpas['URL'].unique():
    # Coerce to str before matching: re.search raises TypeError on a
    # non-string cell, and the original only applied str() in its length guard.
    match = url_pattern.search(str(url))
    # Cells without a ".com" token (including empty strings) are skipped.
    if match:
        urls.append(match.group(1))

## Adding https:// if it isn't already present, then appending /contact

In [7]:
# Turn every extracted host into a contact-page URL.
# The new values are collected in a separate list and written back with slice
# assignment, rather than calling remove()/insert() on `urls` while iterating
# over it (which also cost an O(n) index() lookup per element).
contact_urls = []
for url in urls:
    # Test the prefix, not a substring, so a scheme appearing later in the
    # string is not mistaken for the URL's own scheme.
    if url.startswith(('https://', 'http://')):
        # Scheme already present: only add the contact path.
        contact_urls.append(url + '/contact')
    else:
        # No scheme: default to https.
        contact_urls.append('https://' + url + '/contact/')
    # NOTE(review): the two branches append different suffixes ('/contact' vs
    # '/contact/'); kept exactly as before -- confirm whether that is intended.

# Slice assignment keeps `urls` the same list object, as the in-place edit did.
urls[:] = contact_urls
        

### Using the old regex, which matches City, StateAbbrev ZipCode

# Matches "City, ST 12345": a run of letters, a comma, a two-letter upper-case
# state abbreviation and a 5-digit ZIP that is not followed by another digit.
# - Raw string, so \s and \d are regex escapes instead of invalid string escapes.
# - [A-Za-z] instead of [A-z], which also matches the punctuation [ \ ] ^ _ `.
# Compiled once, outside the loop.
pattern = re.compile(r'[A-Za-z]+,\s[A-Z]{2}\s\d{5}(?!\d)')

addresses = []
for url in urls[2:5]:
    # verify=False: TLS certificates are not validated for these requests.
    response = requests.get(url, verify=False)
    soup = bs(response.content, 'html.parser')

    # Every text node that contains a "City, ST ZIP" string.
    for tag in soup.find_all(string=pattern):
        print(tag)
        print('\n'*2)
        # Keep the whole parent element's text, with newlines and
        # non-breaking spaces flattened to plain spaces.
        addresses.append([url, tag.parent.get_text().replace('\n', ' ').replace("\xa0", ' ')])

print(addresses)

### Loop through all urls
1. Search for a string that matches the pattern City, StateAbbrev ZipCode
2. Then search the parent of that tag for the full street address
    - also cleans up a few artifacts such as \n and \xa0
3. Check whether a match was found and, if so, append the URL and street address to a list called addresses
4. Otherwise, skip it and continue with the next iteration of the loop

In [349]:
# Turn off hostname checking and certificate validation, so TLS certificates
# are not verified for any request made with this context.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# "City, ST 12345" -- locates the text nodes that look like the tail of an address.
# Raw string so \s and \d are regex escapes; [A-Za-z] replaces [A-z], which
# also matched the punctuation that sits between 'Z' and 'a'.
pattern = re.compile(r'[A-Za-z]+,\s[A-Z]{2}\s\d{5}(?!\d)')

# Full street address: a 1-5 digit number, one to three whitespace-separated
# words, a street-type keyword, then everything up to the last 5-digit run.
# (\S* is what the former [A-z\S]* class already reduced to.)
street_pattern = re.compile(
    r'\d{1,5}\s(?:\S*\s){1,3}'
    r'(?:Avenue|Lane|Road|Boulevard|Drive|Floor|Street|Ave|Dr|Rd|Blvd|Ln|St|Suite).*\d{5}'
)

## Read in the html
addresses = []
for url in urls[1:20]:

    ## urllib parameters; the user-agent header may need to be updated by looking at network in dev tools on the website
    req = urllib.request.Request(url)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57')

    # Only the network call sits inside the try, so a failure on one site is
    # reported and the loop moves on to the next URL.
    try:
        # The with-block closes the HTTP response once the body has been read.
        with urllib.request.urlopen(req, context=ctx) as resp:
            # errors='replace' stops a page containing stray non-UTF-8 bytes
            # from aborting the whole run with UnicodeDecodeError.
            r = resp.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as err:
        print(url, err.code)
        continue
    except urllib.error.URLError as err:
        # Failures that carry no HTTP status code; report the reason instead.
        print(url, err.reason)
        continue

    ## Keep a copy of the html on disk (test.html is overwritten on every iteration)
    with open("test.html", 'w', encoding="utf-8") as f:
        f.write(r)

    ## Have Beautiful Soup parse the html
    soup = bs(r, 'html.parser')

    # For each text node with a city-like string, look for a full street
    # address in the parent element's text and record it with its url.
    for tag in soup.find_all(string=pattern):
        match = street_pattern.search(
            tag.parent.get_text().replace('\n', ' ').replace("\xa0", ' '))
        if match:
            print(tag, match.group(0))
            addresses.append([url, match.group(0)])



Richardson, TX 75080 2425 N Central Expressway, Suite 200 Richardson, TX 75080

Boston, MA 02110 160 Federal Street, 16th Floor Boston, MA 02110

Waltham, MA 02452 465 Waverly Oaks Road, Waltham, MA 02452

Westborough, MA 01581 50 Washington Street Westborough, MA 01581
 Floor, New York, NY 10019 1740 Broadway 15th Floor, New York, NY 10019
New York, NY 10019 1740 Broadway 15th FloorNew York, NY 10019
Beverly Hills, CA 90211 8383 Wilshire Blvd Ste 800Beverly Hills, CA 90211

New York, NY 10019 1740 Broadway 15th Floor New York, NY 10019

Campbell, CA 95008 1901 S. Bascom Ave Suite 105 Campbell, CA 95008
© 2023 Abbott, Stringham & Lynch - Silicon Valley CPA Firm - 1901 South Bascom Avenue, Suite 105, Campbell, CA 95008  2023 Abbott, Stringham & Lynch - Silicon Valley CPA Firm - 1901 South Bascom Avenue, Suite 105, Campbell, CA 95008

Edina, MN 55436 5201 Eden Avenue, Suite 250 Edina, MN 55436

Mankato, MN 56001 100 Warren Street, Suite 600 Mankato, MN 56001

Scottsdale, AZ 85260 14500 

In [350]:
# Show the [url, street address] pairs collected by the scraping loop.
addresses

[['https://agllp-cpa.com/contact/',
  '2425 N Central Expressway, Suite 200 Richardson, TX 75080'],
 ['https://www.aafcpa.com/contact/',
  '160 Federal Street, 16th Floor Boston, MA 02110'],
 ['https://www.aafcpa.com/contact/',
  '465 Waverly Oaks Road, Waltham, MA 02452'],
 ['https://www.aafcpa.com/contact/',
  '50 Washington Street Westborough, MA 01581'],
 ['https://www.abfinwright.com/contact/',
  '1740 Broadway 15th Floor, New York, NY 10019'],
 ['https://www.abfinwright.com/contact/',
  '1740 Broadway 15th FloorNew York, NY 10019'],
 ['https://www.abfinwright.com/contact/',
  '8383 Wilshire Blvd Ste 800Beverly Hills, CA 90211'],
 ['https://www.abfinwright.com/contact/',
  '1740 Broadway 15th Floor New York, NY 10019'],
 ['http://www.aslcpa.com/contact',
  '1901 S. Bascom Ave Suite 105 Campbell, CA 95008'],
 ['http://www.aslcpa.com/contact',
  '2023 Abbott, Stringham & Lynch - Silicon Valley CPA Firm - 1901 South Bascom Avenue, Suite 105, Campbell, CA 95008'],
 ['http://www.aemcpa

In [310]:
# View the collected pairs as a pandas Series (one [url, address] list per row).
pd.Series(addresses)

0     [https://agllp-cpa.com/contact/, 2425 N Centra...
1     [https://www.aafcpa.com/contact/, 160 Federal ...
2     [https://www.aafcpa.com/contact/, 465 Waverly ...
3     [https://www.aafcpa.com/contact/, 50 Washingto...
4     [https://www.abfinwright.com/contact/, 1740 Br...
5     [https://www.abfinwright.com/contact/, 1740 Br...
6     [https://www.abfinwright.com/contact/, 8383 Wi...
7     [https://www.abfinwright.com/contact/, 1740 Br...
8     [http://www.aslcpa.com/contact, 1901 S. Bascom...
9     [http://www.aslcpa.com/contact, 2023 Abbott, S...
10    [http://www.aemcpas.com/contact, 5201 Eden Ave...
11    [http://www.aemcpas.com/contact, 100 Warren St...
12    [http://www.aemcpas.com/contact, 14500 N North...
13    [http://www.aemcpas.com/contact, 5201 Eden Ave...
14    [http://www.aemcpas.com/contact, 100 Warren St...
15    [http://www.aemcpas.com/contact, 14500 N North...
16    [https://www.abitos.com/contact/, 255 Aragon A...
17    [https://www.abitos.com/contact/, 20803 Bi

## Troubleshooting edge cases

### AGL LLP
- Needed to use urllib because of security defenses on this website

In [303]:
import urllib
import ssl

In [317]:
# One-off fetch of a single contact page so its HTML can be inspected.
# Hostname checking and certificate validation are both turned off.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

req = urllib.request.Request('http://www.acuitycpas.com/contact')
req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57')

# The with-block closes the HTTP response once the body has been read;
# errors='replace' keeps stray non-UTF-8 bytes from raising UnicodeDecodeError.
with urllib.request.urlopen(req, context=ctx) as resp:
    r = resp.read().decode('utf-8', errors='replace')

# Keep a copy of the page on disk.
with open("test.html", 'w', encoding="utf-8") as f:
    f.write(r)

soup = bs(r, 'html.parser')



In [318]:
# Display the parsed document to inspect the HTML that was actually returned.
soup


<!DOCTYPE html>

<html class="no-js" lang="en-US">
<head>
<meta charset="utf-8"/>
<script type="text/javascript">
var gform;gform||(document.addEventListener("gform_main_scripts_loaded",function(){gform.scriptsLoaded=!0}),window.addEventListener("DOMContentLoaded",function(){gform.domLoaded=!0}),gform={domLoaded:!1,scriptsLoaded:!1,initializeOnLoaded:function(o){gform.domLoaded&&gform.scriptsLoaded?o():!gform.domLoaded&&gform.scriptsLoaded?window.addEventListener("DOMContentLoaded",o):document.addEventListener("gform_main_scripts_loaded",o)},hooks:{action:{},filter:{}},addAction:function(o,n,r,t){gform.addHook("action",o,n,r,t)},addFilter:function(o,n,r,t){gform.addHook("filter",o,n,r,t)},doAction:function(o){gform.doHook("action",o,arguments)},applyFilters:function(o){return gform.doHook("filter",o,arguments)},removeAction:function(o,n){gform.removeHook("action",o,n)},removeFilter:function(o,n,r){gform.removeHook("filter",o,n,r)},addHook:function(o,n,r,t,i){null==gform.hooks[o][n]&&(

In [295]:
# Text of `tag`'s parent element with newlines and non-breaking spaces replaced by spaces.
# NOTE(review): `tag` is whatever an earlier cell's loop left behind -- confirm which page it came from.
tag.parent.get_text().replace('\n', ' ').replace("\xa0", ' ')

'AAFCPAs Westborough 50 Washington Street Westborough, MA 01581'

In [306]:
# Full text matched by the current `match` object.
# NOTE(review): `match` is left over from an earlier cell -- confirm which regex/page produced it.
match.group(0)

'2425 N Central Expressway, Suite 200 Richardson, TX 75080'

###  adamsbrowncpa.com
This URL doesn't actually have locations on its contact page, just a contact form; a different URL lists the locations. I'm still not sure why I get a 404 error in my loop but not here.

In [347]:
# Request for the contact page, carrying a desktop-browser user-agent header.
req = urllib.request.Request(
    'https://www.adamsbrowncpa.com/contact/',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
    },
)

In [348]:
# Probe the prepared request and report whether it succeeds or returns an HTTP error.
# Hostname checking and certificate validation are both turned off.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

try:
    response = urllib.request.urlopen(req, context=ctx)
except urllib.error.HTTPError as err:
    print(err.code)
    print('except')
else:
    # Only success/failure matters here, so release the connection right away
    # instead of leaving the response open.
    response.close()
    print('try')

try
