In [None]:
import json
import requests
import pandas as pd
import time
class RealtorScraper:
    """Fetch realtor.com search-result pages for one state and flatten them.

    The scraper posts a GraphQL search payload to the realtor.com "hulk"
    endpoint once per page in ``[start_page, end_page]`` and turns every
    returned listing into a flat dict suitable for a pandas DataFrame.
    """

    # Endpoint that receives the GraphQL search payload.
    API_URL = "https://www.realtor.com/api/v1/hulk?client_id=rdc-x&schema=vesta"

    # Listings per page; matches the "limit" value baked into _REQUEST_BODY.
    PAGE_SIZE = 42

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # JSON request template (GraphQL query + variables). It is kept as a raw
    # string so the JSON escapes (\n, \") reach json.loads untouched.
    _REQUEST_BODY = r'{"query":"\n\nquery ConsumerSearchMainQuery($query: HomeSearchCriteria!, $limit: Int, $offset: Int, $sort: [SearchAPISort], $client_data: JSON, $geoSupportedSlug: String!, $bucket: SearchAPIBucket, $by_prop_type: [String])\n{\n  home_search: home_search(query: $query,\n    sort: $sort,\n    limit: $limit,\n    offset: $offset,\n    client_data: $client_data,\n    bucket: $bucket,\n  ){\n    count\n    total\n    results {\n      property_id\n      list_price\n      pet_policy{\n        cats\n        dogs\n        text\n        }\n        primary_photo (https: true){\n        href\n      }\n      source {\n        id\n        agents{\n          office_name\n        }\n        type\n        spec_id\n        plan_id\n      }\n      community {\n        property_id\n        description {\n          name\n        }\n        advertisers{\n          office{\n            hours\n            phones {\n              type\n              number\n            }\n          }\n          builder {\n            fulfillment_id\n          }\n        }\n      }\n      products {\n        brand_name\n        products\n      }\n      listing_id\n      matterport\n      virtual_tours{\n        href\n        type\n      }\n      status\n      permalink\n      price_reduced_amount\n      other_listings{rdc {\n      listing_id\n      status\n      listing_key\n      primary\n    }}\n      description{\n        beds\n        baths\n        baths_full\n        baths_half\n        baths_1qtr\n        baths_3qtr\n        garage\n        stories\n        type\n        sub_type\n        lot_sqft\n        sqft\n        year_built\n        sold_price\n        sold_date\n        name\n      }\n      location{\n        street_view_url\n        address{\n          line\n          postal_code\n          state\n          state_code\n          city\n          coordinate {\n            lat\n            lon\n          }\n        }\n        county {\n          name\n          fips_code\n        }\n      }\n      tax_record {\n        public_record_id\n      }\n      lead_attributes {\n        show_contact_an_agent\n        opcity_lead_attributes {\n          cashback_enabled\n          flip_the_market_enabled\n        }\n        lead_type\n      }\n      open_houses {\n        start_date\n        end_date\n        description\n        methods\n        time_zone\n        dst\n      }\n      flags{\n        is_coming_soon\n        is_pending\n        is_foreclosure\n        is_contingent\n        is_new_construction\n        is_new_listing (days: 14)\n        is_price_reduced (days: 30)\n        is_plan\n        is_subdivision\n      }\n      list_date\n      last_update_date\n      coming_soon_date\n      photos(limit: 2, https: true){\n        href\n      }\n      tags\n      branding {\n        type\n        photo\n        name\n      }\n    }\n  }\n  geo(slug_id: $geoSupportedSlug) {\n    parents {\n      geo_type\n      slug_id\n      name\n    }\n    geo_statistics(group_by: property_type) {\n      housing_market {\n        by_prop_type(type: $by_prop_type){\n          type\n           attributes{\n            median_listing_price\n            median_lot_size\n            median_sold_price\n            median_price_per_sqft\n            median_days_on_market\n          }\n        }\n        listing_count\n        median_listing_price\n        median_rent_price\n        median_price_per_sqft\n        median_days_on_market\n        median_sold_price\n        month_to_month {\n          active_listing_count_percent_change\n          median_days_on_market_percent_change\n          median_listing_price_percent_change\n          median_listing_price_sqft_percent_change\n        }\n      }\n    }\n    recommended_cities: recommended(query: {geo_search_type: city, limit: 20}) {\n      geos {\n        ... on City {\n          city\n          state_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n    recommended_neighborhoods: recommended(query: {geo_search_type: neighborhood, limit: 20}) {\n      geos {\n        ... on Neighborhood {\n          neighborhood\n          city\n          state_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n    recommended_counties: recommended(query: {geo_search_type: county, limit: 20}) {\n      geos {\n        ... on HomeCounty {\n          county\n          state_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n    recommended_zips: recommended(query: {geo_search_type: postal_code, limit: 20}) {\n      geos {\n        ... on PostalCode {\n          postal_code\n          geo_type\n          slug_id\n        }\n        geo_statistics(group_by: property_type) {\n          housing_market {\n            by_prop_type(type: [\"home\"]) {\n              type\n              attributes {\n                median_listing_price\n              }\n            }\n            median_listing_price\n          }\n        }\n      }\n    }\n  }\n}","variables":{"query":{"primary":true},"client_data":{"device_data":{"device_type":"web"},"user_data":{"last_view_timestamp":-1}},"limit":42,"offset":42,"sort":[{"field":"list_date","direction":"desc"}], "zohoQuery":{"silo":"search_result_page","filters":{},"page_index":"2"},"geoSupportedSlug":"","by_prop_type":["home"]},"operationName":"ConsumerSearchMainQuery","callfrom":"SRP","nrQueryType":"MAIN_SRP","visitor_id":"eff16470-ceb5-4926-8c0b-6d1779772842","isClient":true,"seoPayload":{"asPath":"/realestateandhomes-search/New-York/pg-2","pageType":{"silo":"search_result_page","status":"for_sale"},"county_needed_for_uniq":false}}'

    def __init__(self, start_page: int, end_page: int, state: str) -> None:
        """Remember the inclusive page range and the state code to query.

        Args:
            start_page: First result page to request.
            end_page: Last result page to request (inclusive).
            state: Value sent as ``variables.query.state_code``.
        """
        self.start_page = start_page
        self.end_page = end_page
        self.state = state

    @staticmethod
    def _offset_for_page(page_number: int) -> int:
        """Return the result offset at which ``page_number`` starts.

        The request template pairs page 2 with offset 42 and limit 42, so
        page ``p`` starts at ``(p - 1) * PAGE_SIZE``. Pages below 1 clamp to 0.
        """
        return max(page_number - 1, 0) * RealtorScraper.PAGE_SIZE

    def _build_payload(self, page_number: int, offset_parameter: int) -> dict:
        """Return a fresh request payload for one page of this state."""
        # Re-parse the template on every call so callers get an independent dict.
        json_body = json.loads(self._REQUEST_BODY)
        json_body["variables"]["query"]["state_code"] = self.state

        # NOTE(review): the template keeps its page index under
        # variables.zohoQuery.page_index and seoPayload is an object there;
        # the two assignments below add a new variables.page_index key and
        # replace seoPayload with a bare int. Kept as-is because the server
        # contract is not visible here -- confirm the intent.
        json_body["variables"]["page_index"] = page_number
        json_body["seoPayload"] = page_number
        json_body["variables"]["offset"] = offset_parameter
        return json_body

    def send_request(self, page_number: int, offset_parameter: int) -> dict:
        """POST the search payload for one page and return the decoded JSON.

        Args:
            page_number: Page index written into the payload.
            offset_parameter: Value sent as the GraphQL ``offset`` variable.

        Returns:
            The response body parsed as JSON.

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-2xx HTTP status.
        """
        response = requests.post(
            url=self.API_URL,
            json=self._build_payload(page_number, offset_parameter),
            headers={"content-type": "application/json"},
            # Without a timeout a stalled connection would block forever.
            timeout=self.REQUEST_TIMEOUT,
        )
        # Surface HTTP errors directly instead of as a later JSON/KeyError.
        response.raise_for_status()
        return response.json()

    def extract_features(self, entry: dict) -> dict:
        """Flatten one search result into a single-level feature dict.

        Sections or fields that are missing or ``None`` yield ``None`` values
        rather than raising. The ``lat``/``lon``, ``county``,
        ``pet_policy_*`` and ``branding`` keys are only added when the
        corresponding section of ``entry`` is non-empty.

        Args:
            entry: One element of ``data.home_search.results``.

        Returns:
            A dict with one key per extracted feature.
        """
        # ``or {}`` also covers sections that are present but null.
        description = entry.get("description") or {}
        location = entry.get("location") or {}
        address = location.get("address") or {}
        flags = entry.get("flags") or {}

        feature_dict = {
            "id": entry.get("property_id"),
            "price": entry.get("list_price"),
            "beds": description.get("beds"),
            "baths": description.get("baths"),
            "garage": description.get("garage"),
            "stories": description.get("stories"),
            "house_type": description.get("type"),
            "lot_sqft": description.get("lot_sqft"),
            "sqft": description.get("sqft"),
            "year_built": description.get("year_built"),
            "address": address.get("line"),
            "postal_code": address.get("postal_code"),
            "state": address.get("state_code"),
            "city": address.get("city"),
            "tags": entry.get("tags"),
            "is_new_construction": flags.get("is_new_construction"),
            "sold_price": description.get("sold_price"),
            "status": entry.get("status"),
            "price_reduced_amount": entry.get("price_reduced_amount"),
        }

        coordinate = address.get("coordinate")
        if coordinate:
            feature_dict["lat"] = coordinate.get("lat")
            feature_dict["lon"] = coordinate.get("lon")

        county = location.get("county")
        if county:
            feature_dict["county"] = county.get("name")

        pet_policy = entry.get("pet_policy")
        if pet_policy:
            feature_dict["pet_policy_dog"] = pet_policy.get("dogs")
            feature_dict["pet_policy_cat"] = pet_policy.get("cats")

        branding = entry.get("branding")
        if branding:
            # Only the first branding record is kept.
            feature_dict["branding"] = branding[0]

        return feature_dict

    def parse_json_data(self) -> list:
        """Request every page in the configured range and flatten the results.

        Returns:
            One feature dict per listing, in the order the pages returned them.
        """
        feature_dict_list = []

        for page_number in range(self.start_page, self.end_page + 1):
            json_data = self.send_request(
                page_number=page_number,
                # Derive the offset from the page itself so a range that
                # starts at page 48 does not silently re-read offset 42.
                offset_parameter=self._offset_for_page(page_number),
            )
            home_search = json_data["data"]["home_search"] or {}
            # A page with null results contributes no rows.
            results = home_search.get("results") or []
            feature_dict_list.extend(
                self.extract_features(entry) for entry in results
            )

        return feature_dict_list

    def create_dataframe(self) -> pd.DataFrame:
        """Return all listings of the configured page range as a DataFrame."""
        return pd.DataFrame(self.parse_json_data())

def run(start: int, end: int, sort: str):
    """Scrape pages ``start``..``end`` for every US state and save one CSV.

    Each state is scraped independently; a failure in one state is printed
    and does not stop the others. The combined table is written to
    ``./all_states_page_{start}_to_{end}_sorted_{sort}.csv``.

    Args:
        start: First result page to request for each state.
        end: Last result page to request for each state (inclusive).
        sort: Label used only in the output file name; it is not sent with
            the request.

    Returns:
        The concatenated DataFrame, or ``None`` when no state produced any
        rows (in which case no CSV is written).
    """
    states = [
        "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN",
        "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV",
        "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN",
        "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
    ]
    all_dfs = []
    empty_states = []

    for i, state in enumerate(states, 1):
        try:
            scraper = RealtorScraper(start_page=start, end_page=end, state=state)
            df = scraper.create_dataframe()
        except Exception as e:
            # Best effort: report the failing state and keep going.
            print(f"Start page {start}: Exception for {state}: {e}")
        else:
            if df.empty:
                empty_states.append(state)
            else:
                all_dfs.append(df)
            # Progress is only reported for states that completed.
            if i % 10 == 0:
                print(f"Start page {start}: Retrieved data for {i} states.")

    final_df = None
    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        print("No data available for any state.")

    if empty_states:
        print("Empty states:", empty_states)
    else:
        print("No states were empty.")

    # Previously this ran unconditionally and hit an unbound ``final_df``
    # whenever every state failed or came back empty.
    if final_df is not None:
        final_df.to_csv(f'./all_states_page_{start}_to_{end}_sorted_{sort}.csv')

    return final_df

if __name__ == "__main__":
    # Scrape in batches of six pages (48-53, then 54-59) and pause five
    # minutes after each batch.
    pages_per_batch = 6
    pause_seconds = 300
    for first_page in range(48, 55, pages_per_batch):
        last_page = first_page + pages_per_batch - 1
        run(start=first_page, end=last_page, sort="listdate")
        time.sleep(pause_seconds)


Start page 48: Retrieved data for 10 states.
Start page 48: Retrieved data for 20 states.
Start page 48: Retrieved data for 30 states.
Start page 48: Retrieved data for 40 states.
Start page 48: Retrieved data for 50 states.
No states were empty.
Start page 54: Retrieved data for 10 states.
Start page 54: Retrieved data for 20 states.
Start page 54: Retrieved data for 30 states.
Start page 54: Retrieved data for 40 states.
Start page 54: Retrieved data for 50 states.
No states were empty.


In [None]:
# Show the combined listings table as the cell output.
# NOTE(review): no visible cell binds `final_df` at notebook scope, and the
# "Unnamed: 0" column in the output suggests it was read back from a saved
# CSV in a cell not shown here -- confirm where it is defined.
final_df

Unnamed: 0,id,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,...,is_new_construction,sold_price,status,price_reduced_amount,lat,lon,county,branding,pet_policy_dog,pet_policy_cat
0,9151520261,529900.0,6.0,4.0,,2.0,single_family,,3199.0,,...,True,,for_sale,,34.209664,-86.823094,Cullman,"{'type': 'Builder', 'photo': 'https://nh.rdcpi...",,
1,7261969644,550.0,1.0,1.0,,,condos,,,,...,,,for_rent,,32.577455,-85.487691,Lee,"{'type': 'Office', 'photo': None, 'name': None}",,
2,9020331701,139900.0,2.0,1.0,,1.0,farm,387684.0,1101.0,1940.0,...,,,for_sale,30000.0,33.983448,-85.851280,Etowah,"{'type': 'Office', 'photo': None, 'name': 'Rea...",,
3,9551516999,1000.0,1.0,2.0,,,single_family,,,,...,,,for_rent,,34.115322,-86.185070,Etowah,"{'type': 'Office', 'photo': None, 'name': None}",True,True
4,7545717852,430000.0,3.0,2.0,1.0,,single_family,23522.0,2731.0,1950.0,...,,285000.0,for_sale,,34.346089,-86.320586,Marshall,"{'type': 'Office', 'photo': None, 'name': 'Can...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12595,9192458129,595000.0,3.0,3.0,2.0,1.0,single_family,8491.0,2933.0,2001.0,...,,,for_sale,17000.0,44.553722,-104.104973,Crook,"{'type': 'Office', 'photo': None, 'name': 'Kel...",,
12596,7507597358,224900.0,5.0,2.0,1.0,2.0,single_family,6970.0,1892.0,1940.0,...,,,for_sale,,43.027517,-108.389472,Fremont,"{'type': 'Office', 'photo': 'https://ap.rdcpix...",,
12597,9902160042,289000.0,1.0,1.0,,1.0,condos,3049.0,509.0,1972.0,...,,,for_sale,,42.701454,-110.932838,Lincoln,"{'type': 'Office', 'photo': None, 'name': 'Hal...",,
12598,9176793477,,,,,,apartment,,,,...,,,for_rent,,41.787608,-107.244210,Carbon,,True,True
