## Base Imports

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import requests

## Notebook and Request Header Settings

In [3]:
# set some display settings for notebooks
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}


## Helper functions for Creating Soups

In [8]:
def build_soups(url):
    """
    Builds BeautifulSoup objects for each page of the Zillow search results.
    """
    response = requests.get(url, headers=req_headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup


## Grab/Create all Soups

In [22]:
base_url = 'https://www.zillow.com/homes/for_rent/'
page_max_range = 10
city = 'Omaha'
soups = []

for i in range(1, page_max_range):
    url = f'{base_url}{city}/{str(i)}_p/'
    soups.append(build_soups(url))

## Grab Data from All Soups and Create Huge Dataframe

In [23]:
def grab_soup_data(soups):
    """
    Grabs all the data from the soup objects and puts it into a list.
    """
    soup_data = []
    for soup in soups:
        # all for loops are pulling the specified variable using beautiful soup and inserting into said variable
        df = pd.DataFrame(
            columns=['prices', 'address', 'beds', 'sqft', 'links'])

        for i in soup:
            address = soup.find_all(class_='list-card-addr')
            price = list(soup.find_all(class_='list-card-price'))
            beds = list(soup.find_all("ul",  class_="list-card-details"))
            sqft = list(soup.find_all("ul",  class_="list-card-details"))
            details = soup.find_all('div', {'class': 'list-card-details'})
            home_type = soup.find_all('div', {'class': 'list-card-footer'})
            last_updated = soup.find_all('div', {'class': 'list-card-top'})
            brokerage = list(soup.find_all(
                class_='list-card-brokerage list-card-img-overlay', text=True))
            link = soup.find_all(class_='list-card-link')

            # create dataframe columns out of variables
            df['prices'] = price
            df['address'] = address
            df['beds'] = beds
            df['sqft'] = sqft

        # create empty url list
        urls = []

        # loop through url, pull the href and strip out the address tag
        for link in soup.find_all("article"):
            href = link.find('a', class_="list-card-link")
            if href:
                addresses = href.get('href')
                urls.append(addresses)

        # import urls into a links column
        df['links'] = urls
        df['links'] = df['links'].astype('str')

        # remove html tags
        df['links'] = df['links'].replace(
            '<a class="list-card-link" href="', ' ', regex=True)
        df['links'] = df['links'].replace(
            '" tabindex="0"></a>', ' ', regex=True)
        soup_data.append(df)
    return soup_data

df = pd.concat(grab_soup_data(soups))

## Cleanup the Huge Dataframe

In [24]:
#convert columns to str
df['prices'] = df['prices'].astype('str')
df['address'] = df['address'].astype('str')
df['beds'] = df['beds'].astype('str')
df['sqft'] = df['sqft'].astype('str')

#remove html tags
df['prices'] = df['prices'].replace('<div class="list-card-price">', ' ', regex=True)
df['address'] = df['address'].replace('<address class="list-card-addr">', ' ', regex=True)
df['prices'] = df['prices'].replace('</div>', ' ', regex=True)
df['address'] = df['address'].replace('</address>', ' ', regex=True)
df['prices'] = df['prices'].str.replace(r'\D', '')

#need to figure out correct html to add to this code from zillow browser
#remove html tags from beds column
df['beds'] = df['beds'].replace('Studio</li><li>', '0 ', regex=True)
df['beds'] = df['beds'].str.replace(r'\D', ' ', regex=True)
df['sqft'] = df['sqft'].replace('Studio</li><li>', '0 ', regex=True)
df['sqft'] = df['sqft'].str.replace(r'\D', ' ', regex=True)


#split beds column into beds, bath and sqft
#will need to concatenate and split up columns after everything is pulled into data frame

#remove commas from sq_feet and convert to float
df.replace(',','', regex=True, inplace=True)

#drop nulls
df = df[(df['prices'] != '') & (df['prices']!= ' ')]

#convert column to float
df['prices'] = df['prices'].astype('float')


#remove spaces from link column
df['links'] = df.links.str.replace(' ','')

print('The column datatypes are:')
print(df.dtypes)
print('The dataframe shape is:', df.shape)

#re arrange the data frame and include sq  - make sure the dataframe has correct vallues
#will need to split zip code into its own column
#find out how to extract zillow html and put it into notebook

df

The column datatypes are:
prices     float64
address     object
beds        object
sqft        object
links       object
dtype: object
The dataframe shape is: (81, 5)


  if sys.path[0] == '':


Unnamed: 0,prices,address,beds,sqft,links
0,1500.0,2510 N 64th St Omaha NE 68104,2 ...,2 ...,https://www.zillow.com/homedetails/2510-N-64th...
1,1600.0,2329 N 143rd Ave Omaha NE 68164,3 ...,3 ...,https://www.zillow.com/homedetails/2329-N-143r...
2,970.0,Broadmoor at Aksarben Village | 2225 S 64th P...,1 ...,1 ...,/b/broadmoor-at-aksarben-village-omaha-ne-5XtPhk/
3,1125.0,The Duke Omaha | 201 N 46th St Omaha NE,1 ...,1 ...,/b/the-duke-omaha-omaha-ne-BLzSVn/
4,1110.0,The Conrad | 3601 Jones St Omaha NE,1 ...,1 ...,/b/the-conrad-omaha-ne-9WHB3w/
5,9541.0,Briar Hills | 17010 Hawthorne Plz Omaha NE,1 ...,1 ...,/b/briar-hills-omaha-ne-5Xs6sD/
6,2493.0,909 Capitol Ave APT 401 Omaha NE 68102,2 ...,2 ...,https://www.zillow.com/homedetails/909-Capitol...
7,9951.0,Wyndham Hill by Broadmoor | 9226 Burt St Omah...,1 ...,1 ...,/b/wyndham-hill-by-broadmoor-omaha-ne-5XtP4L/
8,1250.0,3412 Dodge St #106 Omaha NE 68131,1 ...,1 ...,https://www.zillow.com/homedetails/3412-Dodge-...
0,10001.0,Jones13 | 1301 Jones St Omaha NE,,,/b/jones13-omaha-ne-5ZLdYZ/
