# Scrape data from several chambers of commerce

- Black Chamber of Commerce
- Asian Chamber of Commerce
- Hispanic Chamber of Commerce

In [2]:
# libraries
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup
import json
import re

## 1. Black Chamber of Commerce

In [3]:
# URL to scrape (Northern Virginia Black Chamber of Commerce member directory)
url = 'https://business.northernvirginiabcc.org/directory/FindStartsWith?term=%23%21'

# Send a GET request to the URL.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# us from parsing (and saving) an HTTP error page as if it were the directory.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
directory_div = soup.find('div', {'class': 'row gz-cards gz-directory-cards'})
if directory_div is None:
    # Fail with an explicit message instead of an AttributeError on None below.
    raise ValueError(
        f"Directory container not found at {url}; the page layout may have changed.")

# Keep a copy of the raw HTML next to the CSV.
# The encoding is explicit so the write does not depend on the platform's
# default locale encoding, and the context manager guarantees the file is closed.
with open("../../data/listings/chamber_of_commerce/clean/black_chamber.txt", "w",
          encoding="utf-8") as f:
    f.write(response.text)
filename = '../../data/listings/chamber_of_commerce/clean/black_chamber.csv'


with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['company_name', 'email', 'phone'])
    nested_divs = directory_div.find_all(
        'div', {'class': 'gz-directory-card'})
    # Number of member cards found on the page
    print(len(nested_divs))

    for div_element in nested_divs:
        name_element = div_element.find(
            'h5', {'class': 'gz-card-title'})
        name = name_element.get_text(strip=True)
        # Only the company name is extracted here; email and phone are
        # written as empty placeholders to keep the column layout.
        email = ''
        phone = ''
        writer.writerow([name, email, phone])

print("SUCCESS!")

371
SUCCESS!


In [4]:
# Load the Black Chamber listing from its CSV file and preview the first rows
black_chamber = pd.read_csv(
    '../../data/listings/chamber_of_commerce/clean/black_chamber.csv'
)
black_chamber.head(n=5)

Unnamed: 0,company_name,email,phone
0,Wilson Dental of McLean,,
1,Beyond Accounting & Tax,,
2,Ward Avenue Style Parlor,,
3,PointShift,,
4,Loice Mae's Kitchen,,


## 2. Asian Chamber of Commerce

In [5]:
# URL to scrape (Asian American Chamber of Commerce member list)
# NOTE(review): the path ends in "searchalpha/a", which looks like the
# alphabetical listing for the letter "a" only -- confirm whether the other
# letters should be scraped as well.
aacc_url = 'https://business.asian-americanchamber.org/list/searchalpha/a?o=&'
# Send a GET request to the URL.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# us from parsing (and saving) an HTTP error page as if it were the listing.
response = requests.get(aacc_url, timeout=30)
response.raise_for_status()
page = BeautifulSoup(response.text, 'html.parser')
allresults_div = page.find_all('div', {'class': 'gz-list-card-wrapper col-sm-6 col-md-4'})

# Keep a copy of the raw HTML next to the CSV.
# The encoding is explicit so the write does not depend on the platform's
# default locale encoding, and the context manager guarantees the file is closed.
with open("../../data/listings/chamber_of_commerce/clean/asian_chamber.txt", "w",
          encoding="utf-8") as f:
    f.write(response.text)
file_name = '../../data/listings/chamber_of_commerce/clean/asian_chamber.csv'

with open(file_name, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # column titles in csv
    writer.writerow(['company_name', 'address', 'phone'])
    name = ''
    address = ''
    phone = ''
    for element in allresults_div:
        # top of card: the company name sits in a span inside the card header
        name_element = element.find('div', {'class': 'card-header'})
        name = name_element.find('span', {'class': 'gz-img-placeholder'}).get_text(strip=True)

        # bottom of card: list holding the optional address and phone entries
        tele_addy = element.find('div', {'class': 'card-body gz-results-card-body'})
        tel_add = tele_addy.find('ul', {'class': 'list-group list-group-flush'})

        # get the address (not every card has one)
        address_card = tel_add.find('li', {'class': 'list-group-item gz-card-address'})
        if address_card is None:
            address = ''
        else:
            address_element = address_card.find('a', {'class': 'card-link'})
            # the address is split over several spans; join their texts with commas
            address = ', '.join(
                span.get_text(strip=True) for span in address_element.find_all('span'))

        # get the phone (not every card has one)
        phone_card = tel_add.find('li', {'class': 'list-group-item gz-card-phone'})
        if phone_card is not None:
            phone = phone_card.find('span').get_text(strip=True)
        else:
            phone = ''
        writer.writerow([name, address, phone])

print("SUCCESS!")

# Concern: this code treats childrenlist as a list of lists instead of a list of strings --
# it does not take each child as a string; it only lists the strings of each child at their span location.
# (for Trinity's branch)

SUCCESS!


In [6]:
# Load the Asian Chamber listing from its CSV file, report the row count
# and preview the first rows
asian_chamber = pd.read_csv(
    '../../data/listings/chamber_of_commerce/clean/asian_chamber.csv'
)
print(asian_chamber.shape[0])
asian_chamber.head(n=5)

47


Unnamed: 0,company_name,address,phone
0,AACC - Test Account,"Tysons Corner, VA, 22182",
1,"AASA, Inc.","1640 Boro Place, 503, Mclean, VA, 22102",(703) 444-6170
2,Abberly Avera Apartment Homes by HHHu...,"11601 Hokie Stone Loop, Manassas, VA, 20109",(571) 379-4342
3,Absolute Thai,"1381 Beverly Road, Mclean, VA, 22101",(703) 847-1111
4,Ackerman Security Systems,"P.O Box 1172, Sterling, VA, 20167",(571) 577-1214


## 3. Hispanic Chamber of Commerce

In [7]:
# URL of the webpage
url = "https://docu.team/mms.php?association=410#"

# Send a GET request to the URL (the timeout keeps the cell from hanging forever)
response = requests.get(url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # The member data is not in the HTML itself: the page's script holds the
    # location of a JSON file in a single-quoted assignment of the form
    #     var jsonfile = '<url>'
    # The captured group must stop at the closing single quote, hence [^']+.
    pattern = r"var jsonfile\s*=\s*'([^']+)'"
    match = re.search(pattern, response.text)

    if match:
        # Extracted JSON file URL
        json_url = match.group(1)

        # Send a GET request to the JSON file URL
        json_response = requests.get(json_url, timeout=30)

        # Check if the request was successful (status code 200)
        if json_response.status_code == 200:
            # Parse the JSON response (a mapping whose values are member records)
            json_data = json.loads(json_response.text)

            filename = '../../data/listings/chamber_of_commerce/clean/hispanic_chamber.csv'
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow(['company_name', 'owner_name', 'phone', 'email', 'address'])

                # Only the records are needed; the mapping keys are not used.
                for data in json_data.values():
                    # Extract relevant information from JSON
                    business_name = data.get('business_name', '')
                    first_name = data.get('first_name', '')
                    last_name = data.get('last_name', '')
                    phone = data.get('phone', '')
                    address_1 = data.get('address_1', '')
                    address_2 = data.get('address_2', '')
                    city = data.get('city', '')
                    state = data.get('state_province', '')
                    zip_code = data.get('zip_postal', '')
                    email = data.get('email', '')
                    # NOTE: the separators are always emitted, so a record with
                    # no address fields still yields a string such as ", , ".
                    address = f'{address_1} {address_2}, {city}, {state} {zip_code}'
                    full_name = f'{first_name} {last_name}'

                    writer.writerow([business_name, full_name, phone, email, address])

            print(f"Data successfully saved to {filename}")
        else:
            print("Error: Failed to retrieve the JSON file.")
    else:
        print("Error: JSON file URL not found.")
else:
    print("Error: Failed to retrieve the webpage.")

Data successfully saved to ../../data/listings/chamber_of_commerce/clean/hispanic_chamber.csv


In [8]:
# Load the Hispanic Chamber listing from its CSV file, report the row count
# and preview the first rows
hispanic_chamber = pd.read_csv(
    '../../data/listings/chamber_of_commerce/clean/hispanic_chamber.csv'
)
print(hispanic_chamber.shape[0])
hispanic_chamber.head(n=5)

58


Unnamed: 0,company_name,owner_name,phone,email,address
0,Abberly Avera Apartment Homes,Neha Lorenzo,(571) 379-4342,abberlyaverateam@hhhunt.com,"11601 Hokie Stone Loop , Manassas, VA 20109"
1,Amazon,Patrick Phillippi,,merylrob@amazon.com,", ,"
2,Apollonia Business Solutions LLC,William Barker,5717231374,wbarker@apolloniacorp.com,"3033 Wilson Blvd Suite 700, Arlington, VA 22201"
3,AQUAS Inc,Carmen Larsen,(301) 654-4000,clarsen@aquasinc.com,"10400 Connecticut Avenue Suite 310, Kensington..."
4,Arepa Zone LLC,Gabriela Febres,7032616456,info@arepazone.com,"3160 Spring Street Unit B, Fairfax, VA 22031"


## 4. Combine all chamber data

In [9]:
# Bring the three listings to one shared schema and stack them.
# Shared variables: company_name, address, owner_name, source
shared_columns = ['company_name', 'address', 'owner_name', 'source']

# Columns a listing does not have are added as empty strings; every listing
# is tagged with the chamber it came from, then cut down to the shared schema.
black_chamber = black_chamber.assign(
    owner_name='',
    address='',
    source='black chamber of commerce',
)[shared_columns]
asian_chamber = asian_chamber.assign(
    owner_name='',
    source='asian chamber of commerce',
)[shared_columns]
hispanic_chamber = hispanic_chamber.assign(
    source='hispanic chamber of commerce',
)[shared_columns]

# Stack the three listings under a fresh 0..n-1 index
chamber_data = pd.concat(
    [black_chamber, asian_chamber, hispanic_chamber],
    ignore_index=True,
    sort=False,
)
chamber_data.head()

Unnamed: 0,company_name,address,owner_name,source
0,Wilson Dental of McLean,,,black chamber of commerce
1,Beyond Accounting & Tax,,,black chamber of commerce
2,Ward Avenue Style Parlor,,,black chamber of commerce
3,PointShift,,,black chamber of commerce
4,Loice Mae's Kitchen,,,black chamber of commerce


In [14]:
# Write the combined listing to disk
chamber_data.to_csv(
    '../../data/listings/chamber_of_commerce/clean/chamber.csv'
)

# Describe the data: total rows versus distinct company names
# (dropna=False so a missing name is counted as one distinct value)
print('Number of companies reported : ', chamber_data.shape[0])
print('Number of unique company name: ', chamber_data['company_name'].nunique(dropna=False))

Number of companies reported :  476
Number of unique company name:  467


In [20]:
# Flag every repeated company name (the first occurrence of a name is not
# flagged) and show the flagged rows
test = chamber_data['company_name'].duplicated(keep='first')
chamber_data.loc[test]

Unnamed: 0,company_name,address,owner_name,source
379,Ad-centive Marketing,"6124 Rockwell Court, Burke, VA, 22015",,asian chamber of commerce
380,Adobe,"345 Park Avenue, San Jose, California, 95110-2704",,asian chamber of commerce
402,Arlington Economic Development,"Tysons Corner, VA, 22182",,asian chamber of commerce
404,Aronson LLC,"111 Rockville Pike,, Suite 600, Rockville, MD,...",,asian chamber of commerce
407,Asbury Methodist Village,"201 Russell Ave, , MD 20877, Gaithersburg, MD,...",,asian chamber of commerce
417,Audi Field,,,asian chamber of commerce
428,Carefirst BlueCross BlueShield,"3060 Williams Drive Suite 200, Fairfax, VA 22031",Jeanine Finch,hispanic chamber of commerce
451,"McBoyz, LLC","10614 Springmann Drive , Fairfax, VA 22030",Daniel McGuire,hispanic chamber of commerce
462,Sandy Spring Bank,"1356 Chain Bridge Road , McLean, VA 20101",Rosemary Flores Troche,hispanic chamber of commerce


In [19]:
# Show every row listed under 'Adobe' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Adobe')]

Unnamed: 0,company_name,address,owner_name,source
166,Adobe,,,black chamber of commerce
380,Adobe,"345 Park Avenue, San Jose, California, 95110-2704",,asian chamber of commerce


In [21]:
# Show every row listed under 'Ad-centive Marketing' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Ad-centive Marketing')]

Unnamed: 0,company_name,address,owner_name,source
378,Ad-centive Marketing,"6124 Rockwell Court, Burke, VA, 22015",,asian chamber of commerce
379,Ad-centive Marketing,"6124 Rockwell Court, Burke, VA, 22015",,asian chamber of commerce


In [22]:
# Show every row listed under 'Arlington Economic Development' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Arlington Economic Development')]

Unnamed: 0,company_name,address,owner_name,source
45,Arlington Economic Development,,,black chamber of commerce
402,Arlington Economic Development,"Tysons Corner, VA, 22182",,asian chamber of commerce


In [23]:
# Show every row listed under 'Aronson LLC' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Aronson LLC')]

Unnamed: 0,company_name,address,owner_name,source
403,Aronson LLC,"805 King Farm Blvd, Suite 300, Rockville, MD, ...",,asian chamber of commerce
404,Aronson LLC,"111 Rockville Pike,, Suite 600, Rockville, MD,...",,asian chamber of commerce


In [25]:
# Show every row listed under 'Asbury Methodist Village' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Asbury Methodist Village')]

Unnamed: 0,company_name,address,owner_name,source
406,Asbury Methodist Village,"201 Russell Ave, Gaithersburg, MD, 20877",,asian chamber of commerce
407,Asbury Methodist Village,"201 Russell Ave, , MD 20877, Gaithersburg, MD,...",,asian chamber of commerce


In [26]:
# Show every row listed under 'Audi Field' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Audi Field')]

Unnamed: 0,company_name,address,owner_name,source
416,Audi Field,,,asian chamber of commerce
417,Audi Field,,,asian chamber of commerce


In [27]:
# Show every row listed under 'Carefirst BlueCross BlueShield' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Carefirst BlueCross BlueShield')]

Unnamed: 0,company_name,address,owner_name,source
427,Carefirst BlueCross BlueShield,"3060 Williams Drive Suite 200, Fairfax, VA 22031",,hispanic chamber of commerce
428,Carefirst BlueCross BlueShield,"3060 Williams Drive Suite 200, Fairfax, VA 22031",Jeanine Finch,hispanic chamber of commerce


In [28]:
# Show every row listed under 'McBoyz, LLC' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('McBoyz, LLC')]

Unnamed: 0,company_name,address,owner_name,source
450,"McBoyz, LLC","10614 Springmann Drive , Fairfax, VA 22030",Daniel McGuire,hispanic chamber of commerce
451,"McBoyz, LLC","10614 Springmann Drive , Fairfax, VA 22030",Daniel McGuire,hispanic chamber of commerce


In [29]:
# Show every row listed under 'Sandy Spring Bank' to compare the duplicate entries
chamber_data[chamber_data['company_name'].eq('Sandy Spring Bank')]

Unnamed: 0,company_name,address,owner_name,source
18,Sandy Spring Bank,,,black chamber of commerce
462,Sandy Spring Bank,"1356 Chain Bridge Road , McLean, VA 20101",Rosemary Flores Troche,hispanic chamber of commerce


In [30]:
# Clean the data before saving all chamber of commerce
# - when a company name appears more than once, keep the listing that carries
#   the most information (non-empty address / owner name). Simply keeping the
#   first occurrence would often keep the emptier row, because the listing
#   without addresses is stacked first.
info_columns = ['address', 'owner_name']

# Per-row count of informative fields; NaN and blank strings count as empty
filled_fields = (
    chamber_data[info_columns]
    .apply(lambda col: col.fillna('').astype(str).str.strip().ne(''))
    .sum(axis=1)
)

chamber_data_final = (
    chamber_data.assign(_filled_fields=filled_fields)
    # mergesort is stable: rows with equal scores keep their original order,
    # so ties still resolve to the first occurrence
    .sort_values('_filled_fields', ascending=False, kind='mergesort')
    # the first row per name is now the best-scoring one
    .drop_duplicates('company_name')
    .drop(columns='_filled_fields')
    # restore the original row order
    .sort_index()
)
chamber_data_final.to_csv('../../data/listings/chamber_of_commerce/clean/chamber.csv')