# Scraping Grocery Stores

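This notebook scrapes Yellow Pages search results to collect the names, street addresses, and ZIP codes of grocery stores (Kroger, Fiesta, Randalls, and HEB) in Houston, TX, and exports one CSV file per grocer.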

In [1]:
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import lxml
import pandas as pd
import numpy as np
import re
import time
import os
import csv
import datetime as dt
# Notebook display settings: show up to 150 columns and never truncate cell
# text. `None` is the supported "no limit" value for max_colwidth; the old
# `-1` sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_colwidth', None)

In [2]:
# Column layout of the combined listing table.
cols = [
    "Name",
    "Grocer",
    "Address",
    "Category",
]
# Empty frame that carries only that column layout.
listing_all_tot = pd.DataFrame(data=None, columns=cols)

In [3]:
# Request headers sent with every search-page fetch in this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

## Kroger DF

In [4]:
# Kroger listings: fetch the first 4 Yellow Pages result pages for "kroger" in
# Houston, TX and collect business name, street address and ZIP code into
# `kroger_addresses` (columns: Name, Address, Zip, Store).
#
# Names, addresses and ZIPs are read from three independent tag queries and
# paired up by position, so every page must yield the same number of each.
kroger_search_url = "https://www.yellowpages.com/search?search_terms=kroger&geo_location_terms=Houston%2C%20TX&page="
kroger_pages = []
for page in range(1, 5):
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an error page from being parsed as "no results".
    response = requests.get(kroger_search_url + str(page), headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html5lib")

    addrs = [tag.text for tag in soup.find_all("span", {"class": "street-address"})]
    zips = [tag.text for tag in soup.find_all(itemprop="postalCode")]
    names = [tag.text for tag in soup.find_all("a", {"class": "business-name"})]

    # Fail with a message that says which page is misaligned instead of the
    # generic length error pandas would raise when building the frame.
    if not len(names) == len(addrs) == len(zips):
        raise ValueError(
            "Kroger page %d: got %d names, %d addresses and %d zips; "
            "the scraped fields do not line up"
            % (page, len(names), len(addrs), len(zips))
        )

    addresses = pd.DataFrame({'Name': names, 'Address': addrs, 'Zip': zips})
    addresses['Store'] = "Kroger"
    kroger_pages.append(addresses)

# Concatenate once at the end rather than growing the frame inside the loop.
# Each page keeps its own 0..n-1 index, exactly as before.
kroger_addresses = pd.concat(kroger_pages)

In [5]:
# Sanity check: (rows, columns) of the combined Kroger listings.
kroger_addresses.shape

(114, 4)

In [6]:
# Preview the first rows of the scraped Kroger listings.
kroger_addresses.head()

Unnamed: 0,Address,Name,Store,Zip
0,7747 Kirby Dr,Kroger Pharmacy,Kroger,77030
1,3300 Montrose Blvd,Kroger Pharmacy,Kroger,77006
2,5150 Buffalo Speedway,Kroger,Kroger,77005
3,1352 W 43rd St,Kroger,Kroger,77018
4,3300 Montrose Blvd,Kroger Fresh Fare,Kroger,77006


## Fiesta DF

In [None]:
# Fiesta listings: fetch the first 5 Yellow Pages result pages for
# "fiesta mart" in Houston, TX and collect business name, street address and
# ZIP code into `fiesta_addresses` (columns: Name, Address, Zip, Store).
#
# Names, addresses and ZIPs are read from three independent tag queries and
# paired up by position. The custom filters below are hard-coded exclusions;
# presumably they drop non-grocery hits so the three lists stay the same
# length -- verify them against the live pages.
# NOTE(review): the Store label is lowercase "fiesta" while the other grocers
# use capitalised labels; left unchanged so existing exports keep their values.
fiesta_search_url = "https://www.yellowpages.com/search?search_terms=fiesta+mart&geo_location_terms=Houston%2C+TX&page="
fiesta_pages = []
for page in range(1, 6):
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an error page from being parsed as "no results".
    response = requests.get(fiesta_search_url + str(page), headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html5lib")

    addrs = [tag.text for tag in soup.find_all("span", {"class": "street-address"})]

    # custom filter: skip any ZIP containing 77207
    zips = [
        tag.text
        for tag in soup.find_all(itemprop="postalCode")
        if '77207' not in tag.text
    ]

    # custom filter: keep names mentioning "Fiesta", minus two known
    # unwanted matches
    names = [
        tag.text
        for tag in soup.find_all("a", {"class": "business-name"})
        if "Fiesta" in tag.text
        and "2016 Ford" not in tag.text
        and "Fiesta Patrias" not in tag.text
    ]

    # Fail with a message that says which page is misaligned instead of the
    # generic length error pandas would raise when building the frame.
    if not len(names) == len(addrs) == len(zips):
        raise ValueError(
            "Fiesta page %d: got %d names, %d addresses and %d zips; "
            "the custom filters no longer line up with the page"
            % (page, len(names), len(addrs), len(zips))
        )

    addresses = pd.DataFrame({'Name': names, 'Address': addrs, 'Zip': zips})
    addresses['Store'] = "fiesta"
    fiesta_pages.append(addresses)

# Concatenate once at the end rather than growing the frame inside the loop.
# Each page keeps its own 0..n-1 index, exactly as before.
fiesta_addresses = pd.concat(fiesta_pages)

## Randalls

In [None]:
# Randalls listings: fetch the first 2 Yellow Pages result pages for
# "randalls" in Houston, TX and collect business name, street address and ZIP
# code into `Randalls_addresses` (columns: Name, Address, Zip, Store).
#
# Names, addresses and ZIPs are read from three independent tag queries and
# paired up by position. The custom filters below are hard-coded exclusions;
# presumably they drop non-grocery hits so the three lists stay the same
# length -- verify them against the live pages.
randalls_search_url = "https://www.yellowpages.com/search?search_terms=randalls&geo_location_terms=Houston%2C+TX&page="
randalls_skip_addrs = ("627 W 19th St", "1015 W 24th St", "1111 Heights Blvd")
randalls_skip_names = (
    "Randall's Executive Transportation",
    "Randalls Executive Transportation Service",
    "Randall Murrow Photography",
)
randalls_pages = []
for page in range(1, 3):
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an error page from being parsed as "no results".
    response = requests.get(randalls_search_url + str(page), headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html5lib")

    # custom filter: skip the excluded street addresses
    addrs = [
        tag.text
        for tag in soup.find_all("span", {"class": "street-address"})
        if not any(skip in tag.text for skip in randalls_skip_addrs)
    ]

    # custom filter: skip any ZIP containing 77008
    zips = [
        tag.text
        for tag in soup.find_all(itemprop="postalCode")
        if '77008' not in tag.text
    ]

    # custom filter: skip the excluded business names
    names = [
        tag.text
        for tag in soup.find_all("a", {"class": "business-name"})
        if not any(skip in tag.text for skip in randalls_skip_names)
    ]

    # Fail with a message that says which page is misaligned instead of the
    # generic length error pandas would raise when building the frame.
    if not len(names) == len(addrs) == len(zips):
        raise ValueError(
            "Randalls page %d: got %d names, %d addresses and %d zips; "
            "the custom filters no longer line up with the page"
            % (page, len(names), len(addrs), len(zips))
        )

    addresses = pd.DataFrame({'Name': names, 'Address': addrs, 'Zip': zips})
    addresses['Store'] = "Randalls"
    randalls_pages.append(addresses)

# Concatenate once at the end rather than growing the frame inside the loop.
# Each page keeps its own 0..n-1 index, exactly as before.
Randalls_addresses = pd.concat(randalls_pages)

## HEB

In [None]:
# HEB listings: fetch the first 7 Yellow Pages result pages for "HEB" in
# Houston, TX and collect business name, street address and ZIP code into
# `HEB_addresses` (columns: Name, Address, Zip, Store).
#
# Names, addresses and ZIPs are read from three independent tag queries and
# paired up by position, so every page must yield the same number of each.
# No custom filters are applied for this grocer.
heb_search_url = "https://www.yellowpages.com/search?search_terms=HEB&geo_location_terms=Houston%2C+TX&page="
heb_pages = []
for page in range(1, 8):
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an error page from being parsed as "no results".
    response = requests.get(heb_search_url + str(page), headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html5lib")

    addrs = [tag.text for tag in soup.find_all("span", {"class": "street-address"})]
    zips = [tag.text for tag in soup.find_all(itemprop="postalCode")]
    names = [tag.text for tag in soup.find_all("a", {"class": "business-name"})]

    # Fail with a message that says which page is misaligned instead of the
    # generic length error pandas would raise when building the frame.
    if not len(names) == len(addrs) == len(zips):
        raise ValueError(
            "HEB page %d: got %d names, %d addresses and %d zips; "
            "the scraped fields do not line up"
            % (page, len(names), len(addrs), len(zips))
        )

    addresses = pd.DataFrame({'Name': names, 'Address': addrs, 'Zip': zips})
    addresses['Store'] = "HEB"
    heb_pages.append(addresses)

# Concatenate once at the end rather than growing the frame inside the loop.
# Each page keeps its own 0..n-1 index, exactly as before.
HEB_addresses = pd.concat(heb_pages)

## Exports

In [None]:
# Output folder for the per-grocer CSV files. File names are appended by plain
# string concatenation, so the trailing slash is required.
# NOTE(review): `dir` shadows the Python builtin and the path is an absolute,
# machine-specific location -- consider a configurable, relative export folder.
dir = "C:/Users/David/Dropbox/Skills/Python/\
Projects/Real_Estate/htx_grocery_scraping/"

# One CSV per grocer, written in the order the frames were built.
kroger_addresses.to_csv(path_or_buf=dir + "kroger.csv")
fiesta_addresses.to_csv(path_or_buf=dir + "fiesta.csv")
Randalls_addresses.to_csv(path_or_buf=dir + "randalls.csv")
HEB_addresses.to_csv(path_or_buf=dir + "heb.csv")