### Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd 
import numpy as np
import urllib.parse 

### HTTP Request

#### store website in variable

In [2]:
website = 'https://www.trulia.com/NY/New_York/'
headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.54'})

#### Get Request

In [3]:
response = requests.get(website, headers=headers)

#### Status Code

In [4]:
response

<Response [200]>

### Soup Object 

In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <script>
   window.trulia = window.trulia || {};
          trulia.analytics = trulia.analytics || {};
          trulia.analytics.automaticTrackState = false;
  </script>
  <style data-styled="" data-styled-version="5.1.1">
   .eIkmHi{-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;}/*!sc*/
data-styled.g1[id="MediaBlock__MediaContainer-skmvlj-0"]{content:"eIkmHi,"}/*!sc*/
.dwmuOU{margin:0 8px;}/*!sc*/
.dwmuOU:last-child{margin-right:0;}/*!sc*/
.dwmuOU:first-child{margin-left:0;}/*!sc*/
.BVZzV{-webkit-flex:1 1 0px;-ms-flex:1 1 0px;flex:1 1 0px;width:100%;margin:0 8px;}/*!sc*/
.BVZzV:last-child{margin-right:0;}/*!sc*/
.BVZzV:first-child{margin-left:0;}/*!sc*/
.gnreBg{margin:0 4px;}/*!sc*/
.gnreBg:last-child{margin-right:0;}/*!sc*/
.gnre

### Results

In [7]:
result = soup.find_all('li', {'class' : 'Grid__CellBox-sc-144isrp-0 SearchResultsList__WideCell-b7y9ki-2 jiZmPM'})

In [8]:
len(result)

42

### Get Only the RIght Result

In [9]:
result_update = [i for i in result if i.has_attr('data-testid')]

In [10]:
len(result_update)

40

### Get the Attribute

In [11]:
address = [result.find('div', {'data-testid':'property-address'}).get_text() for result in result_update]
beds = [result.find('div', {'data-testid':'property-beds'}).get_text() for result in result_update]
baths = [result.find('div', {'data-testid':'property-baths'}).get_text() for result in result_update]
prices = [result.find('div', {'data-testid':'property-price'}).get_text() for result in result_update]

### Make a DataFrame

In [13]:
real_estate = pd.DataFrame(columns=['Address', 'Beds', 'Baths', 'Price'])

for i in range (len(address)):
    real_estate=real_estate.append({'Address':address[i], 'Beds':beds[i], 'Baths':baths[i], 'Price':prices[i]}, ignore_index=True)

In [14]:
real_estate

Unnamed: 0,Address,Beds,Baths,Price
0,"469 E 49th St #6, Brooklyn, NY 11203",10bd,6ba,"$1,250,000"
1,"715 Avenue L, Brooklyn, NY 11230",8bd,6ba,"$1,690,000"
2,"223 E 62nd St, New York, NY 10065",5bd,6ba,"$7,995,000"
3,"1824 E 17th St #B2, Brooklyn, NY 11229",2bd,2ba,"$579,000"
4,"52-15 65th Pl #3D, Queens, NY 11378",2bd,1ba,"$469,000"
5,"432 Carlton Ave, Brooklyn, NY 11238",7bd,4ba,"$2,450,000"
6,"1143 Lafayette Ave #3B, Brooklyn, NY 11221",1bd,1ba,"$650,000"
7,"43 Bay 25th St, Brooklyn, NY 11214",6bd,4ba,"$2,299,000"
8,"155 Bay St #5B, Staten Island, NY 10301",2bd,2ba,"$599,000"
9,"18 W 127th St #1, New York, NY 10027",7bd,5ba,"$3,500,000"


### Get 5 Pages

In [24]:
real_estate_new=pd.DataFrame(columns=['Address', 'Beds', 'Baths', 'Area', 'Price'])

address=[]
beds=[]
baths=[]
areas=[]
prices=[]

web = ['https://www.trulia.com/NY/New_York/',
      'https://www.trulia.com/CA/Los_Angeles/',
      'https://www.trulia.com/IL/Chicago/',
      'https://www.trulia.com/AZ/Phoenix/',
      'https://www.trulia.com/NV/Las_Vegas/']

for h in web:
    for i in range (1,21):
        website = requests.get(h + str(i) + '_p/', headers=headers)

        soup = BeautifulSoup(website.content, 'html.parser')

        result = soup.find_all('li', {'class' : 'Grid__CellBox-sc-144isrp-0 SearchResultsList__WideCell-b7y9ki-2 jiZmPM'})

        result_update = [k for k in result if k.has_attr('data-testid')]

        for result in result_update:

            try:
                address.append(result.find('div', {'data-testid':'property-address'}).get_text())
            except:
                address.append('n/a')

            try:
                beds.append(result.find('div', {'data-testid':'property-beds'}).get_text())
            except:
                beds.append('n/a')

            try:
                baths.append(result.find('div', {'data-testid':'property-baths'}).get_text())
            except:
                baths.append('n/a')    

            try:
                areas.append(result.find('div', {'data-testid':'property-floorSpace'}).get_text())
            except:
                areas.append('n/a')  

            try:
                prices.append(result.find('div', {'data-testid':'property-price'}).get_text())
            except:
                prices.append('n/a')      

        for j in range (len(address)):
            real_estate_new=real_estate_new.append({'Address':address[j], 'Beds':beds[j], 'Baths':baths[j], 'Area':areas[j], 'Price':prices[j]}, ignore_index=True)

In [28]:
real_estate_new

Unnamed: 0,Address,Beds,Baths,Area,Price
0,"4-74 48th Ave #9K, Long Island City, NY 11109",1bd,1ba,700 sqft,"$555,000"
1,"469 E 49th St #6, Brooklyn, NY 11203",10bd,6ba,,"$1,250,000"
2,"715 Avenue L, Brooklyn, NY 11230",8bd,6ba,"2,015 sqft","$1,690,000"
3,"223 E 62nd St, New York, NY 10065",5bd,6ba,"3,750 sqft","$7,995,000"
4,"1824 E 17th St #B2, Brooklyn, NY 11229",2bd,2ba,888 sqft,"$579,000"
...,...,...,...,...,...
208145,"Crimson Plan 524 in Watercolor, North Las Vega...",3bd,3ba,"1,366 sqft","$327,990+"
208146,"Crimson Plan 523 in Watercolor, North Las Vega...",3bd,3ba,"1,352 sqft","$327,990+"
208147,"Sienna Plan 512 in Watercolor, North Las Vegas...",3bd,2ba,"1,347 sqft","$311,990+"
208148,"Indigo Plan 544 in Watercolor, North Las Vegas...",3bd,3ba,"2,023 sqft","$426,990+"


In [29]:
# dataframe info
real_estate_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208150 entries, 0 to 208149
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Address  208150 non-null  object
 1   Beds     208150 non-null  object
 2   Baths    208150 non-null  object
 3   Area     208150 non-null  object
 4   Price    208150 non-null  object
dtypes: object(5)
memory usage: 7.9+ MB


In [30]:
#first 5 results
real_estate_new.head()

Unnamed: 0,Address,Beds,Baths,Area,Price
0,"4-74 48th Ave #9K, Long Island City, NY 11109",1bd,1ba,700 sqft,"$555,000"
1,"469 E 49th St #6, Brooklyn, NY 11203",10bd,6ba,,"$1,250,000"
2,"715 Avenue L, Brooklyn, NY 11230",8bd,6ba,"2,015 sqft","$1,690,000"
3,"223 E 62nd St, New York, NY 10065",5bd,6ba,"3,750 sqft","$7,995,000"
4,"1824 E 17th St #B2, Brooklyn, NY 11229",2bd,2ba,888 sqft,"$579,000"


### Data Cleaning

In [31]:
real_estate_new['Beds'] = real_estate_new['Beds'].apply(lambda x: x.strip('bd'))
real_estate_new['Baths'] = real_estate_new['Baths'].apply(lambda x: x.strip('ba'))
real_estate_new['Price'] = real_estate_new['Price'].apply(lambda x: x.strip('$'))
real_estate_new['Price'] = real_estate_new['Price'].apply(lambda x: x.replace(",",""))
real_estate_new['Price'] = real_estate_new['Price'].apply(lambda x: x.replace("+",""))
real_estate_new['Area'] = real_estate_new['Area'].apply(lambda x: x.replace(" sqft",""))

real_estate_new['Street']=real_estate_new['Address'].apply(lambda x: x.split(',')[0])
real_estate_new['District']=real_estate_new['Address'].apply(lambda x: x.split(',')[1])
real_estate_new['City']=real_estate_new['Address'].apply(lambda x: x.split(',')[2].split(' ')[1])
real_estate_new['Zip Code']=real_estate_new['Address'].apply(lambda x: x.split(',')[2].split(' ')[2])

In [40]:
real_estate_new.describe()

Unnamed: 0,Address,Beds,Baths,Area,Price,Street,District,City,Zip Code
count,208150,208150,208150,208150.0,208150,208150,208150,208150,208150
unique,4080,25,19,1919.0,1466,4060,164,7,477
top,"1170 Ocean Pkwy, Brooklyn, NY 11230",3,2,,799000,Address Not Disclosed,Chicago,NY,85339
freq,188,57838,70888,39019.0,2932,899,38193,73534,2250


In [51]:
real_estate_new

Unnamed: 0,Address,Beds,Baths,Area,Price,Street,District,City,Zip Code
0,"4-74 48th Ave #9K, Long Island City, NY 11109",1,1,700,555000,4-74 48th Ave #9K,Long Island City,NY,11109
1,"469 E 49th St #6, Brooklyn, NY 11203",10,6,,1250000,469 E 49th St #6,Brooklyn,NY,11203
2,"715 Avenue L, Brooklyn, NY 11230",8,6,2015,1690000,715 Avenue L,Brooklyn,NY,11230
3,"223 E 62nd St, New York, NY 10065",5,6,3750,7995000,223 E 62nd St,New York,NY,10065
4,"1824 E 17th St #B2, Brooklyn, NY 11229",2,2,888,579000,1824 E 17th St #B2,Brooklyn,NY,11229
...,...,...,...,...,...,...,...,...,...
208145,"Crimson Plan 524 in Watercolor, North Las Vega...",3,3,1366,327990,Crimson Plan 524 in Watercolor,North Las Vegas,NV,89086
208146,"Crimson Plan 523 in Watercolor, North Las Vega...",3,3,1352,327990,Crimson Plan 523 in Watercolor,North Las Vegas,NV,89086
208147,"Sienna Plan 512 in Watercolor, North Las Vegas...",3,2,1347,311990,Sienna Plan 512 in Watercolor,North Las Vegas,NV,89086
208148,"Indigo Plan 544 in Watercolor, North Las Vegas...",3,3,2023,426990,Indigo Plan 544 in Watercolor,North Las Vegas,NV,89086


In [52]:
real_estate_new.to_csv('Dataset Real Estate.csv')