In [7]:
import csv
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

# Summary

This notebook gathers headquarters addresses for businesses using one of two approaches: a lightweight script, or a more comprehensive one.

The lightweight script pulls the list of S&P 500 companies from Wikipedia, and then gets each company's headquarters address from the SEC website. Each SEC filing contains the address for the company.

In the more comprehensive version, addresses are provided for every business which has made a filing with the SEC, totaling over 12,000 businesses.

## Part 1: Light-Weight Version

In [9]:
# Read every HTML table on Wikipedia's S&P 500 page and keep the first one.
# Later cells read its 'CIK' and 'Security' columns.
sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
table = pd.read_html(sp500_url)
df = table[0]

In [18]:
# Spot-check one row of the table by position; in the recorded run, row 474
# was 'Viatris'. Selecting the column first and then the position avoids
# chained indexing on the frame.
df['Security'].iloc[474]

'Viatris'

In [21]:
# Loop through the CIKs and pull each company's address from the SEC website.
#
# For every row of the S&P 500 table this requests the company's detail page,
# extracts the text between "Address:" and "Phone", and stores it in comp_dict
# keyed by the row's 'Security' value. Rows whose request fails, or whose page
# has no address match, are skipped.

# Seconds to wait on a single request; without a timeout one stalled
# connection would hang the whole loop.
REQUEST_TIMEOUT = 30

comp_dict = {}

# Walk CIK and Security together so the name always belongs to the CIK being
# processed. enumerate() advances the progress counter on every path,
# including the skip paths below.
for count, (i, sec) in enumerate(zip(df['CIK'], df['Security'])):
    if count % 50 == 0:
        print('Processing Business Number: ',count+1)

    # CIKs are 10 numbers with leading 0s
    padded_num = str(i).rjust(10, '0')
    new_str = 'https://www.edgarcompany.sec.gov/servlet/CompanyDBSearch?page=detailed&cik='+padded_num+'&main_back=2'

    # The network call is the step that can fail, so it is what the try block
    # guards: report the company and move on to the next one.
    try:
        new_req = requests.get(new_str, timeout=REQUEST_TIMEOUT)
        new_req.raise_for_status()
    except requests.RequestException as err:
        print(sec, i, err)
        continue

    # Replace <BR> tags with spaces before stripping the markup, so the text on
    # either side of a line break is not run together.
    new_req_txt = new_req.text.replace('<BR>',' ')

    soup = BeautifulSoup(new_req_txt, 'html.parser')
    soup_text = soup.get_text().replace('\n','')

    add_search = re.search('Address:(.*)Phone', soup_text)
    if add_search is None:
        # No address found on the page: leave this company out of comp_dict.
        continue
    comp_dict[sec] = add_search.group(1).strip()


Processing Business Number:  1
Processing Business Number:  51
Processing Business Number:  101
Processing Business Number:  151
Processing Business Number:  201
Processing Business Number:  251
Processing Business Number:  301
Processing Business Number:  351
Processing Business Number:  401
Processing Business Number:  451
Processing Business Number:  501


In [24]:
# Build a two-column table (Company Name, Address) from the scraped dictionary.
# The rename is part of the assignment so the column name persists in
# address_df (and in anything written from it), not just in the displayed copy.
address_df = (
    pd.DataFrame.from_dict(comp_dict, orient='index', columns=['Address'])
    .reset_index()
    .rename(columns={'index':'Company Name'})
)
address_df

Unnamed: 0,Company Name,Address
0,3M,"3M CENTER BLDG. 220-13E-26A ST PAUL, MINNESOTA..."
1,A. O. Smith,"11270 WEST PARK PLACE MILWAUKEE, WISCONSIN 53224"
2,Abbott,"100 ABBOTT PARK ROAD ABBOTT PARK, ILLINOIS 600..."
3,AbbVie,"1 NORTH WAUKEGAN ROAD NORTH CHICAGO, ILLINOIS ..."
4,Accenture,1 GRAND CANAL SQUARE GRAND CANAL HARBOUR DUBLI...
...,...,...
497,Yum! Brands,"1441 GARDINER LANE LOUISVILLE, KENTUCKY 40213"
498,Zebra Technologies,"3 OVERLOOK POINT LINCOLNSHIRE, ILLINOIS 60069"
499,Zimmer Biomet,"345 EAST MAIN STREET WARSAW, INDIANA 46580"
500,Zions Bancorporation,ONE SOUTH MAIN STREET 11TH FLOOR SALT LAKE CIT...


In [25]:
# Save the S&P 500 address table to disk. to_csv is called with its defaults,
# so the DataFrame index is written as the first column.
lite_csv_path = 'Company_Addresses_Lite.csv'
address_df.to_csv(lite_csv_path)

## Part 2: Comprehensive Version

Run this to pull addresses for any company which has filed with the SEC. At last run this covered over 12,000 businesses.
Because of that volume it takes 1-2 hours to run, but may be faster on a more powerful machine.

In [None]:
import urllib.request

# Row counter and result store for the comprehensive run.
c = 0
comp_dict = {}

# Contains data for every company which has filed with the SEC
# Format is ticker (e.g. MSFT), CIK
#
# The response is read fully into a list of byte lines and then closed, so the
# connection is not left open and `data` can be iterated more than once (for
# example when the next cell is re-run).
# NOTE(review): SEC guidance asks automated clients to send a descriptive
# User-Agent header; if this request is rejected, build a
# urllib.request.Request with that header - confirm against current guidance.
with urllib.request.urlopen('https://www.sec.gov/include/ticker.txt') as response:
    data = response.readlines()

In [None]:
# Pull address information, similarly to the other method.
# I did attempt to pull phone number, but it is all set to 000-000-0000 on this website

# Seconds to wait on a single request; without a timeout one stalled
# connection would hang a run that already takes 1-2 hours.
REQUEST_TIMEOUT = 30


def _search_field(pattern, text, missing_label, cik):
    """Return the stripped first capture group of `pattern` in `text`.

    If the pattern does not match, print `missing_label` followed by `cik`
    and return an empty string.
    """
    match = re.search(pattern, text)
    if match is None:
        print(missing_label, cik)
        return ''
    return match.group(1).strip()


# enumerate() keeps the row counter correct on the skip paths below.
for c, line in enumerate(data):
    if c % 50 == 0:
        print('Processing line: ',c)

    # Each line is "<ticker>\t<cik>". strip() removes the line ending
    # (including a '\r', if present) before splitting.
    new_line = line.decode('ASCII').strip().split('\t')
    if len(new_line) < 2:
        # Blank or malformed line: there is no CIK to look up.
        continue
    ticker, cik = new_line[0], new_line[1]

    padded_num = str(cik).rjust(10, '0')

    new_str = 'https://www.edgarcompany.sec.gov/servlet/CompanyDBSearch?page=detailed&cik='+padded_num+'&main_back=2'

    # One failed request should not abort the whole run: report it and skip.
    try:
        new_req = requests.get(new_str, timeout=REQUEST_TIMEOUT)
        new_req.raise_for_status()
    except requests.RequestException as err:
        print("Request failed: ", cik, err)
        continue

    # Replace <BR> tags with spaces before stripping the markup, so the text on
    # either side of a line break is not run together.
    new_req_txt = new_req.text.replace('<BR>',' ')
    soup = BeautifulSoup(new_req_txt, 'html.parser')
    soup_text = soup.get_text().replace('\n','')

    # A field that is missing from the page is recorded as an empty string.
    name = _search_field('Company Name:(.*)CIK', soup_text, "No Company Name: ", cik)
    address = _search_field('Address:(.*)Phone', soup_text, "No Address: ", cik)
    phone = _search_field('Phone Number:(.*)State', soup_text, "No Phone #: ", cik)

    comp_dict[c] = {'Name':name,'Address':address,'Phone':phone}

In [None]:
# Write the collected records to disk: a header row, then one row per entry in
# comp_dict, in the dictionary's iteration order.
header = ['Company_Name', 'Address','Phone']
with open('company_addresses_sec.csv','w',newline='') as outputFile:
    writer = csv.writer(outputFile)
    writer.writerow(header)
    writer.writerows(
        [record['Name'], record['Address'], record['Phone']]
        for record in comp_dict.values()
    )