### Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse

### HTTP Request

#### store website in variable

In [2]:
# Trulia search-results page for San Diego, CA; this is the first page requested.
website = "https://www.trulia.com/CA/San_Diego/"

#### Get Request

In [3]:
# Fetch the search-results page. The timeout makes a stalled connection raise
# requests.exceptions.Timeout instead of blocking the notebook indefinitely.
response = requests.get(website, timeout=30)

#### Status Code

In [4]:
# Show the HTTP status code of the response as the cell output.
response.status_code

200

### Soup Object

In [5]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

### Results

In [6]:
# Collect every <li> search-result cell, then display how many were found.
results = soup.find_all(
    name='li',
    attrs={'class': 'SearchResultsList__WideCell-b7y9ki-2'},
)
len(results)

42

### Update Result

We only want to keep the elements that have the attribute 'data-testid'.

In [7]:
# Start with an empty list; it will hold the filtered result cells.
results_update = list()

In [8]:
# Keep only the result cells that carry a 'data-testid' attribute.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not add the same elements a second time.
results_update = [r for r in results if r.has_attr('data-testid')]

In [9]:
# Display how many result cells remain after filtering.
len(results_update)

40

### Concatenate the Two URL Parts to Get the Absolute URL

In [14]:
# Base URL (part 1); the relative listing paths (part 2) are joined onto it
# to form absolute URLs.

url_part_1 = 'https://www.trulia.com'

In [15]:
# Collect the relative link ("part 2") of every property card.
url_part_2 = []

for item in results_update:
    for card in item.find_all('div', {'data-testid': 'property-card-details'}):
        # href=True matches only anchors that carry a link. find() returns None
        # when the card has no such anchor; that card is skipped instead of
        # raising AttributeError or storing None as a "link".
        anchor = card.find('a', href=True)
        if anchor is not None:
            url_part_2.append(anchor['href'])

In [17]:
# Build the absolute URL for every relative listing link.

url_joined = [
    urllib.parse.urljoin(url_part_1, relative_link)
    for relative_link in url_part_2
]

In [18]:
# Display the list of absolute listing URLs as the cell output.
url_joined

['https://www.trulia.com/p/ca/san-diego/3838-martha-st-san-diego-ca-92117--1063392591',
 'https://www.trulia.com/p/ca/san-diego/7055-lisbon-st-san-diego-ca-92114--1010003527',
 'https://www.trulia.com/p/ca/san-diego/1858-ridge-view-dr-san-diego-ca-92105--1061703729',
 'https://www.trulia.com/p/ca/la-jolla/308-vista-de-la-playa-la-jolla-ca-92037--2079474424',
 'https://www.trulia.com/p/ca/la-jolla/5850-camino-de-la-costa-la-jolla-ca-92037--2079478670',
 'https://www.trulia.com/p/ca/san-diego/851-euclid-ave-san-diego-ca-92114--2079824167',
 'https://www.trulia.com/p/ca/san-diego/10925-polaris-dr-san-diego-ca-92126--2079929837',
 'https://www.trulia.com/p/ca/san-diego/5404-chollas-station-rd-san-diego-ca-92105--2079730536',
 'https://www.trulia.com/p/ca/san-diego/3500-jewell-st-san-diego-ca-92109--2079764388',
 'https://www.trulia.com/p/ca/san-diego/4104-mount-alifan-pl-d-san-diego-ca-92111--2079790538',
 'https://www.trulia.com/p/ca/san-diego/6656-reservoir-ln-san-diego-ca-92115--1064638

### Get Data From First Link

#### Store first link in variable

In [20]:
# Take the first absolute listing URL; raises IndexError if url_joined is empty.
first_link = url_joined[0]

#### Get Request & Soup Object

In [22]:
# Fetch the first listing page. The timeout makes a stalled connection raise
# requests.exceptions.Timeout instead of blocking the notebook indefinitely.
response = requests.get(first_link, timeout=30)
response

<Response [200]>

In [23]:
# Parse the listing page bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

### Target Necessary Data

Address

In [25]:
# Text of the <span> tagged as the summary headline (the street address).
soup.find(
    name='span',
    attrs={'data-testid': 'home-details-summary-headline'},
).get_text()

'3838 Martha St'

Bedrooms


In [26]:
# Text of the <li> whose data-testid is 'bed'.
soup.find(name='li', attrs={'data-testid': 'bed'}).get_text()

'3 Beds'

Bathrooms

In [27]:
# Text of the <li> whose data-testid is 'bath'.
soup.find(name='li', attrs={'data-testid': 'bath'}).get_text()

'3 Baths'

Area (sq ft)

In [29]:
# Text of the <li> whose data-testid is 'floor' (the floor area).
soup.find(name='li', attrs={'data-testid': 'floor'}).get_text()

'1,331 sqft'

Year Built

In [31]:
# Locate the <div> whose text is exactly "Year Built", then read the text of
# the next <div> after it. find_next replaces the deprecated findNext alias.
soup.find('div', string="Year Built").find_next('div').get_text()

'1958'

Parking

In [32]:
# Locate the <div> whose text is exactly "Parking", then read the text of the
# next <div> after it. find_next replaces the deprecated findNext alias.
soup.find('div', string="Parking").find_next('div').get_text()

'2 Car Garage'

Price

In [36]:
# Text of the <h3> tagged as the on-market price.
soup.find(
    name='h3',
    attrs={'data-testid': 'on-market-price-details'},
).get_text()

'$859,000'

### Append All Results to Lists with a Loop

In [2]:
# One list per output column. Every listing appends exactly one entry to each
# list (the text found, or '' when the element is absent), so the lists stay
# aligned row-for-row.
address = []
bedrooms = []
bathrooms = []
area = []
year_build = []
parking = []
prices = []

# Pair each column list with a function that locates its element on a parsed
# listing page. When the element is missing, a locator either returns None or
# raises AttributeError (for the two-step label lookups).
field_locators = [
    (address, lambda page: page.find('span', {'data-testid': 'home-details-summary-headline'})),
    (bedrooms, lambda page: page.find('li', {'data-testid': 'bed'})),
    (bathrooms, lambda page: page.find('li', {'data-testid': 'bath'})),
    (area, lambda page: page.find('li', {'data-testid': 'floor'})),
    # Labelled facts: the value is the next <div> after the label <div>.
    (year_build, lambda page: page.find('div', string="Year Built").find_next('div')),
    (parking, lambda page: page.find('div', string="Parking").find_next('div')),
    (prices, lambda page: page.find('h3', {'data-testid': 'on-market-price-details'})),
]

for link in url_joined:
    # timeout: a stalled connection raises instead of hanging the loop forever.
    response = requests.get(link, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    for column, locate in field_locators:
        try:
            column.append(locate(soup).get_text())
        except AttributeError:
            # Only "element not found" (None has no get_text / find_next) is
            # recorded as a blank. Other errors, and Ctrl-C, now propagate
            # instead of being silently swallowed by a bare except.
            column.append('')

NameError: name 'url_joined' is not defined

In [44]:
# Map each output column name to its list of scraped values.
real_estate = dict([
    ('Address', address),
    ('Bedrooms', bedrooms),
    ('Bathrooms', bathrooms),
    ('Area', area),
    ('Year Build', year_build),
    ('Parking', parking),
    ('Price', prices),
])

### Create Pandas DataFrame

In [47]:
# Turn the column-name -> values mapping into a DataFrame (one row per listing).
real_estate = pd.DataFrame(data=real_estate)

### Multiple Pages - San Diego

In [4]:
# empty lists: one per output column. Every listing appends exactly one entry
# to each list (the text found, or '' when absent), so they stay aligned.

address = []
bedrooms = []
bathrooms = []
area = []
year_build = []
parking = []
prices = []

# url_part_1: base URL that the relative listing links are joined onto
url_part_1 = 'https://www.trulia.com'

# Pair each column list with a function that locates its element on a parsed
# listing page. When the element is missing, a locator either returns None or
# raises AttributeError (for the two-step label lookups).
field_locators = [
    (address, lambda page: page.find('span', {'data-testid': 'home-details-summary-headline'})),
    (bedrooms, lambda page: page.find('li', {'data-testid': 'bed'})),
    (bathrooms, lambda page: page.find('li', {'data-testid': 'bath'})),
    (area, lambda page: page.find('li', {'data-testid': 'floor'})),
    # Labelled facts: the value is the next <div> after the label <div>.
    (year_build, lambda page: page.find('div', string="Year Built").find_next('div')),
    (parking, lambda page: page.find('div', string="Parking").find_next('div')),
    (prices, lambda page: page.find('h3', {'data-testid': 'on-market-price-details'})),
]

# Search-results pages 1 through 25.
for i in range(1, 26):

    # website: the original line used ':' (an annotation), which never assigned
    # the URL; it is now stored and reused for the request below.
    website = 'https://www.trulia.com/CA/San_Diego/' + str(i) + '_p/'

    # request (timeout: a stalled connection raises instead of hanging forever)
    response = requests.get(website, timeout=30)

    # soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # result: keep only the cells that carry a 'data-testid' attribute
    results = soup.find_all('li', {'class': 'SearchResultsList__WideCell-b7y9ki-2'})
    results_update = [r for r in results if r.has_attr('data-testid')]

    # relative url
    relative_url = []

    for item in results_update:
        for card in item.find_all('div', {'data-testid': 'property-card-details'}):
            # Skip cards without a linked anchor instead of raising
            # AttributeError or storing None as a "link".
            anchor = card.find('a', href=True)
            if anchor is not None:
                relative_url.append(anchor['href'])

    # create absolute url
    url_joined = [urllib.parse.urljoin(url_part_1, link_2) for link_2 in relative_url]

    # loop through all joined links
    for link in url_joined:
        response = requests.get(link, timeout=30)

        soup = BeautifulSoup(response.content, 'html.parser')

        for column, locate in field_locators:
            try:
                column.append(locate(soup).get_text())
            except AttributeError:
                # Only "element not found" (None has no get_text / find_next)
                # is recorded as a blank. Other errors, and Ctrl-C, now
                # propagate instead of being swallowed by a bare except.
                column.append('')

In [5]:
# Map each output column name to its list of scraped values, in column order.
real_estate_final = dict(zip(
    ('Address', 'Bedrooms', 'Bathrooms', 'Area', 'Year Build', 'Parking', 'Price'),
    (address, bedrooms, bathrooms, area, year_build, parking, prices),
))

In [6]:
# Build the DataFrame and add a constant 'Location' column as the last column.
real_estate_final = pd.DataFrame(real_estate_final).assign(Location='San Diego')


In [8]:
# Write the table to an Excel workbook, leaving out the row index.
real_estate_final.to_excel(
    excel_writer='real_estate_san_diego.xlsx',
    index=False,
)