In [13]:
import requests
import os
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from links import links

In [14]:
# Get the responses from the links
def get_url(url):
    return requests.get(url).text
with ThreadPoolExecutor(max_workers=10) as pool:
    response_list = list(pool.map(get_url,links))

len(response_list)

In [15]:
# Save as HTML files
for i, res in enumerate(response_list):
    with open(f'HTML/{i}.html', 'w') as f:
        f.write(res)

In [16]:
# Load the HTML files into BeautifulSoup objects
documents = []
for i in range(len(response_list)):
    with open(f'HTML/{i}.html', 'r') as f:
        documents.append(BeautifulSoup(f.read(), 'lxml'))
len(documents)

In [17]:
# Read data from HTML files
documents = []
files = sorted([filename for filename in os.listdir('HTML') if filename.endswith('.html')])

for file in files:
    with open(f'HTML/{file}', 'r') as f:
        documents.append(BeautifulSoup(f.read(), 'lxml'))
len(documents)

428

In [29]:
def get_price(doc):
    """
    Get the price
    """
    price = None
    try:
        price = doc.find('span', {'itemprop': 'price'}).text.strip()
    except Exception:
        try:
            price = doc.find('div', {'class': 'ldp-header-price'}).find('div').find_all('span')[1].text.strip()
        except Exception:
            print('No price found')
    return price

In [19]:
def get_main_property_features(doc):
    """
    Get the main property features
    """
    final_features = []
    try:
        features = doc.find('div', {'id': 'load-more-features'}).find_all('div', {'class':'ldp-features-image-tag'})
        for feature in features:
            name = feature.find('h4').text
            elements = feature.find_all('li')
            final_features.append({name: [element.text for element in elements]})
    except Exception:
        print('No features found')

    return final_features

In [20]:
def get_other_features(doc):
    """
    Get the other features
    """
    other_features = None
    try:
        other_features = []
        items = doc.find('div', {'id': 'load-more-features'}).find_all(recursive=False)[1:-2]
        indexes = [i for i, x in enumerate(items) if x.name == 'h4']
        for i in range(1,len(indexes)):
            if i == len(indexes)-1:
                divs = items[indexes[i]:]
            else:
                divs = items[indexes[i-1]:indexes[i]]
            name = divs[0].text
            features = divs[1].find_all('li')
            features = [feature.text for feature in features]
            other_features.append({name: features})
    except Exception:
        print('No other features found')
    return other_features

In [21]:
def get_public_records(doc):
    """
    Get the public records section in the page
    """
    public_records = None
    try:
        items = doc.find('div', {'id': 'ldp-detail-public-records'}).find_all('li')
        if items:
            public_records = {'Public Records': [item.text for item in items]}
    except Exception:
        print('No public records found')
    return public_records

In [22]:
def get_zip_code(doc):
    """
    Get the zip code
    """
    zip_code = None
    try:
        zip_code = doc.find('span', {'itemprop': 'postalCode'}).text.strip()
    except:
        print('No zip code found')
    return zip_code

In [34]:
# Save to CSV file 
failures = 0
zip_codes = []
with open('data.txt', 'w') as f:
    
    f.write('[')
    for i, doc in enumerate(documents):
        feature_data = []
        zip_code = get_zip_code(doc)
        price = get_price(doc)
        features = get_main_property_features(doc)
        other_features = get_other_features(doc)
        public_records = get_public_records(doc)
        zip_codes.append(zip_code)
        # if price:
            # feature_data.append(price)
        if features:
                feature_data.extend(features)
        if other_features:
                feature_data.extend(other_features)
        if public_records:
                feature_data.extend(public_records)
        if None in [price, features, other_features]:
            print(f'No data found for {i}')
            failures += 1
        if i != len(documents)-1:
            f.write(f'{[price, feature_data]},\n')
        else:
            f.write(f'{[price, feature_data]}]')
with open('zip_codes.txt', 'w') as f:
    f.write('[')
    for i, zip_code in enumerate(zip_codes):
        if i != len(zip_codes)-1:
            f.write(f'{zip_code},\n')
        else:
            f.write(f'{zip_code}]')
print(f'Failure rate: {(failures/len(documents)) *100}%')


No data found for 9
No data found for 10
No data found for 89
No data found for 103
No data found for 109
No data found for 116
No data found for 124
No data found for 126
No data found for 131
No data found for 134
No data found for 178
No data found for 219
No data found for 237
No data found for 339
No data found for 388
No data found for 404
Failure rate: 3.7383177570093453%
