## Obtain Data: Web Scraping and Data formatting

We scraped apartment data from www.apartments.com for 5 cities in the greater Washington, D.C. area (Washington (D.C.), Arlington (VA), Alexandria (VA), Bethesda (MD) and Silver Spring (MD)) and then saved them as individual csv files for further exploration (using Python and pandas).

In [3]:
from bs4 import BeautifulSoup as bs
import requests
import time
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Define a new function to get data

In [20]:
# Listing tiers that appear as "<tier> placard" CSS classes on the result
# <article> tags; results of each page are collected tier by tier in this order.
_PLACARD_TIERS = ('diamond', 'platinum', 'gold', 'silver', 'prosumer', 'basic')

# Output column -> CSS class of the <li> icon that marks the amenity.
# The key order is the order in which the columns are added to each record.
_AMENITY_ICONS = {
    'Gym': 'fitnessIcon',
    'Dogs': 'petIcon',
    'Cats': 'catIcon',
    'AC': 'airConditionerIcon',
    'Washer_Dryer': 'laundryIcon',
    'Dishwasher': 'dishWasherIcon',
    'Parking': 'carIcon',
    'Wheelchair': 'wheelchairIcon',
}

# Placeholder stored when the markup for a field is missing from a listing.
_NOT_SPECIFIED = 'Not specified'


def _first_text(tag, name, css_class, default=None):
    """Return the text of the first matching descendant of tag, or default."""
    match = tag.find(name, {"class": css_class})
    return default if match is None else match.get_text()


def _split_location(name, location):
    """Split location text into an (address, city, state, zip) tuple."""
    if location is None:
        return name, _NOT_SPECIFIED, _NOT_SPECIFIED, _NOT_SPECIFIED

    parts = location.split(',')
    if len(parts) > 2:
        address, city, state_zip = parts[0], parts[1], parts[2]
    else:
        # No street part: the listing name stands in for the address.
        address, city = name, parts[0]
        state_zip = parts[1] if len(parts) > 1 else ''

    # NOTE: the parts are not stripped, so the city keeps any whitespace that
    # followed the comma; existing csv files were produced the same way.
    tokens = state_zip.split()
    state = tokens[0] if tokens else _NOT_SPECIFIED
    zip_code = tokens[1] if len(tokens) > 1 else _NOT_SPECIFIED
    return address, city, state, zip_code


def _parse_unit_label(label):
    """Derive a (style, number of bedrooms) tuple from a generic unit label."""
    if label is None:
        return _NOT_SPECIFIED, 'N/A'

    tokens = label.split()
    if 'Studio' in label:
        style = 'Studio'
        if len(tokens) < 2:
            # A bare "Studio" label is recorded as a single bedroom.
            return style, 1
        # The bedroom count is read from the third token of the label; a
        # two-token label has none, so it falls through to 'N/A'.
        count = tokens[2] if len(tokens) > 2 else None
    else:
        style = 'Apartment'
        count = tokens[0] if tokens else None

    if count is None:
        return style, 'N/A'
    # For a range such as "1-3" keep the part after the first dash.
    if '-' in count:
        count = count.split('-')[1]
    return style, count


def _parse_listing(result):
    """Build the record (a dict of csv columns) for one result <article>."""
    name = _first_text(result, 'a', "placardTitle js-placardTitle",
                       _NOT_SPECIFIED)
    listing = {'Name': name}

    location = _first_text(result, 'div', "location")
    (listing['Address'], listing['City'],
     listing['State'], listing['Zip']) = _split_location(name, location)

    listing['Last update'] = _first_text(
        result, 'span', "listingFreshness", _NOT_SPECIFIED).strip()
    listing['Image count'] = _first_text(
        result, 'span', "js-spnImgCount", _NOT_SPECIFIED)
    listing['Rent'] = _first_text(
        result, 'span', "altRentDisplay", _NOT_SPECIFIED).replace('$', '')

    property_style = _first_text(result, 'span', "unitLabel propertyStyle")
    if property_style is not None:
        listing['Style'], listing['Num bedrooms'] = property_style, 'N/A'
    else:
        listing['Style'], listing['Num bedrooms'] = _parse_unit_label(
            _first_text(result, 'span', "unitLabel"))

    if result.find('ul', {"class": "amenities"}) is not None:
        # An amenity is present when its icon appears in the listing.
        for column, icon in _AMENITY_ICONS.items():
            listing[column] = result.find('li', {"class": icon}) is not None
    else:
        for column in _AMENITY_ICONS:
            listing[column] = 'Not listed'

    return listing


def getData(url, pages=range(1, 29), delay=0.1, timeout=30):
    """
    Scrape apartment listings from consecutive result pages.

    Input:
        url: base URL of a city's results, ending with '/'; the page number
            and a trailing '/' are appended to it for every request.
        pages: iterable of page numbers to fetch (default: 1 through 28).
        delay: seconds to sleep before each request.
        timeout: seconds to wait for a response before requests raises,
            so that a stalled connection cannot block the scrape forever.
    Returns: a list with one dict per listing, holding the keys 'Name',
        'Address', 'City', 'State', 'Zip', 'Last update', 'Image count',
        'Rent', 'Style', 'Num bedrooms' and one key per amenity ('Gym',
        'Dogs', 'Cats', 'AC', 'Washer_Dryer', 'Dishwasher', 'Parking',
        'Wheelchair'). Amenity values are True/False, or 'Not listed' when
        the listing has no amenities list. Fields whose markup is missing
        are filled with 'Not specified' (or 'N/A' for 'Num bedrooms')
        instead of aborting the whole scrape.
    """
    apartments = []

    for page in pages:   # Get results one page at a time
        time.sleep(delay)
        response = requests.get(url + str(page) + '/', timeout=timeout)
        soup = bs(response.text, "html.parser")

        for tier in _PLACARD_TIERS:
            placards = soup.find_all('article', {"class": tier + " placard"})
            for result in placards:
                apartments.append(_parse_listing(result))

    return apartments

### 5 Cities we are targeting to obtain data

In [22]:
# Scrape one city per run. To target a different city, pass one of the
# alternative base URLs listed below instead.
mydata = getData(url='http://www.apartments.com/arlington-va/')
# Base URLs of the five target cities:
#   http://www.apartments.com/washington-dc/
#   http://www.apartments.com/arlington-va/
#   http://www.apartments.com/alexandria-va/
#   http://www.apartments.com/bethesda-md/
#   http://www.apartments.com/silver-spring-md/
# Uncomment to inspect the scraped records:
# print(mydata)

### Save the scraped rent data (here: Arlington) to a csv file

In [23]:
# Column order of the output file; every key of a scraped record must appear
# here, because DictWriter raises ValueError on unexpected keys by default.
fieldnames = ['Name','Address','City','State','Zip','Last update','Image count','Num bedrooms','Rent','Style','Dogs','Cats','Wheelchair','Parking','Gym','Washer_Dryer','AC','Dishwasher']

# Write UTF-8 explicitly: without an encoding, open() uses the platform
# default, while pandas.read_csv reads the file back as UTF-8 by default.
with open('apartments_arli.csv', 'w', newline='', encoding='utf-8') as file1:
    writer = csv.DictWriter(file1, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(mydata)

In [24]:
#url = 'http://www.apartments.com/washington-dc/'
#url = 'http://www.apartments.com/arlington-va/'
#url = 'http://www.apartments.com/alexandria-va/'
#url = 'http://www.apartments.com/bethesda-md/'
#url = 'http://www.apartments.com/silver-spring-md/'

### Preview the csv file

In [41]:
!csvcut apartments_arli.csv | head -3 | csvlook

|---------------+------------------+------------+-------+-------+-------------+-------------+--------------+---------------+--------+------+------+------------+---------+-------+--------------+-------+-------------|
|  Name         | Address          | City       | State | Zip   | Last update | Image count | Num bedrooms | Rent          | Style  | Dogs | Cats | Wheelchair | Parking | Gym   | Washer_Dryer | AC    | Dishwasher  |
|---------------+------------------+------------+-------+-------+-------------+-------------+--------------+---------------+--------+------+------+------------+---------+-------+--------------+-------+-------------|
|  Parc Rosslyn | 1531 N Pierce St |  Arlington | VA    | 22209 | New         | 68          | 3            | 1,698 - 4,010 | Studio | True | True | False      | False   | False | True         | True  | True        |
|  RiverHouse   | 1400 S Joyce St  |  Arlington | VA    | 22202 | 5 hrs       | 59          | 3            | 1,314 - 2,908 | Studio 

### Data cleaning and formatting

Step1: Select and only keep the upper bound of Rent range for consistency 

Step2: "count if" to create a new column - number of amenities

In [51]:
import pandas as pd

# Read the file from the working directory, i.e. the same relative location
# the scraping step wrote it to, instead of a machine-specific absolute path.
df = pd.read_csv('apartments_arli.csv', names = None, index_col=False)

# Step 1: keep only the upper bound of the rent range.
# `n` is passed by keyword because it is keyword-only in pandas 2.x.
# Indexing through `.str` yields NaN for a rent without a '-' (a single price
# or free text) and for a missing rent, rather than failing on ragged rows.
rent_bounds = df['Rent'].str.split('-', n=1)
df1 = pd.DataFrame({'Lower Bound': rent_bounds.str[0],
                    'Upper Rent': rent_bounds.str[1]})
df['Rent'] = df1['Upper Rent']

# Step 2: count the amenities each listing offers.
# The columns are compared as strings because read_csv parses a column that
# holds only True/False as bool, where `== "True"` would never match, while a
# column that also contains 'Not listed' stays text.
amenity_columns = ['Dogs', 'Cats', 'Wheelchair', 'Gym',
                   'Washer_Dryer', 'AC', 'Dishwasher', 'Parking']
criteria_df = df[amenity_columns].astype(str).eq("True")
df['Number of Amenities'] = criteria_df.sum(axis=1)
df.to_csv('rentdata_arli.csv', index = False)

In [52]:
!csvcut rentdata_arli.csv | head -3 | csvlook

|---------------+------------------+------------+-------+-------+-------------+-------------+--------------+--------+--------+------+------+------------+---------+-------+--------------+-------+------------+----------------------|
|  Name         | Address          | City       | State | Zip   | Last update | Image count | Num bedrooms | Rent   | Style  | Dogs | Cats | Wheelchair | Parking | Gym   | Washer_Dryer | AC    | Dishwasher | Number of Amenities  |
|---------------+------------------+------------+-------+-------+-------------+-------------+--------------+--------+--------+------+------+------------+---------+-------+--------------+-------+------------+----------------------|
|  Parc Rosslyn | 1531 N Pierce St |  Arlington | VA    | 22209 | New         | 68          | 3.0          |  4,010 | Studio | True | True | False      | False   | False | True         | True  | True       | 5                    |
|  RiverHouse   | 1400 S Joyce St  |  Arlington | VA    | 22202 | 5 hrs 