In [1]:
import os
import json
import pandas as pd
from parsel import Selector

In [2]:
def extract_data_from_html(filename, pages_dir='./pages'):
    """Extract the key listing attributes from one saved property page.

    The page is searched for an embedded JSON cache in two places, in order:
    the ``script#__NEXT_DATA__`` payload first, then the
    ``script#hdpApolloPreloadedData`` payload.

    Args:
        filename: Name of the HTML file inside ``pages_dir``.
        pages_dir: Directory containing the saved pages. Defaults to
            ``'./pages'``.

    Returns:
        A dict with the keys ``zpid``, ``street_address``, ``bedrooms``,
        ``bathrooms``, ``price``, ``year_built``, ``living_area``,
        ``home_type``, ``lot_size`` and ``lot_area_value``. A field that is
        absent from the page's property record is returned as ``None``.

    Raises:
        ValueError: If the page contains neither embedded cache, or the cache
            holds no property record. The message names the offending file.
    """
    path = os.path.join(pages_dir, filename)

    # Decode explicitly as UTF-8 rather than relying on the platform default,
    # and release the file handle as soon as the markup has been read.
    with open(path, 'r', encoding='utf-8') as f:
        selector = Selector(f.read())

    next_data = selector.css("script#__NEXT_DATA__::text").get()

    if next_data:
        # Option 1: some properties are located in NEXT DATA cache
        page_props = json.loads(next_data)["props"]["pageProps"]
        # gdpClientCache is itself a JSON-encoded string, hence the second parse.
        client_cache = json.loads(page_props["componentProps"]["gdpClientCache"])
        if not client_cache:
            raise ValueError(f"{path}: gdpClientCache contains no entries")
        # The record of interest is the first entry of the cache.
        property_data = next(iter(client_cache.values()))['property']
    else:
        # Option 2: other times it's in Apollo cache
        apollo_data = selector.css("script#hdpApolloPreloadedData::text").get()
        if not apollo_data:
            raise ValueError(
                f"{path}: neither __NEXT_DATA__ nor hdpApolloPreloadedData "
                "script was found"
            )
        # apiCache is a JSON string nested inside the JSON payload.
        api_cache = json.loads(json.loads(apollo_data)["apiCache"])
        property_data = next(
            (v["property"] for k, v in api_cache.items() if "ForSale" in k),
            None,
        )
        if property_data is None:
            raise ValueError(f"{path}: no 'ForSale' entry found in apiCache")

    return {
        'zpid': property_data.get('zpid'),
        'street_address': property_data.get('streetAddress'),
        'bedrooms': property_data.get('bedrooms'),
        'bathrooms': property_data.get('bathrooms'),
        'price': property_data.get('price'),
        'year_built': property_data.get('yearBuilt'),
        'living_area': property_data.get('livingArea'),
        'home_type': property_data.get('homeType'),
        'lot_size': property_data.get('lotSize'),
        'lot_area_value': property_data.get('lotAreaValue')
    }

In [3]:
# os.listdir returns entries in arbitrary order; sort the listing so the row
# order of the resulting table (and the exported CSV) is reproducible.
pages = sorted(os.listdir('./pages'))
# Only saved HTML pages are parsed; any other file in the directory is skipped.
data = [extract_data_from_html(fn) for fn in pages if fn.endswith('.html')]

In [4]:
# Build one table row per extracted page record, then show the table as the
# cell's output.
final = pd.DataFrame(data=data)
final

Unnamed: 0,zpid,street_address,bedrooms,bathrooms,price,year_built,living_area,home_type,lot_size,lot_area_value
0,58161822,88 Skitchewaug St,5.0,2.0,225000,1930.0,2384.0,MULTI_FAMILY,7405.0,7405.00
1,57738930,145 Ridgefield St,4.0,1.5,240300,1920.0,2354.0,SINGLE_FAMILY,8712.0,8712.00
2,58142856,91 Ashford St,3.0,2.5,223900,1971.0,1372.0,SINGLE_FAMILY,5662.0,5662.00
3,57730186,585 Hillside Ave,3.0,1.0,205100,1922.0,1383.0,SINGLE_FAMILY,6098.0,6098.00
4,57730678,20 Hughes St,3.0,1.5,237600,1948.0,1536.0,SINGLE_FAMILY,5227.0,5227.00
...,...,...,...,...,...,...,...,...,...,...
204,57731132,454 W Preston St,4.0,2.0,205000,1925.0,1857.0,SINGLE_FAMILY,3484.0,3484.80
205,58161832,69 Skitchewaug St,3.0,2.0,281400,1950.0,1195.0,SINGLE_FAMILY,9583.0,9583.00
206,57793864,15 Rosemont Ave,4.0,1.5,269400,1940.0,1536.0,SINGLE_FAMILY,7405.0,7405.00
207,59288635,109 Mercer Ave,3.0,1.0,243700,1952.0,1184.0,SINGLE_FAMILY,7200.0,7200.00


In [5]:
# Persist the table to disk without writing the positional index column.
final.to_csv(path_or_buf='final.csv', index=False)
