# Scrape redfin.com

In [1]:
import requests
import json
import concurrent.futures
import requests
import os

def download_image(image_url, save_path, timeout=30):
    """Download a single image and write it to ``save_path``.

    HTTP-level failures (connection errors, timeouts, non-2xx status codes)
    are reported on stdout and swallowed so that one bad image does not
    abort a batch. Errors raised while writing the file are not caught.

    Args:
        image_url: URL of the image to fetch.
        save_path: Destination path; an existing file is overwritten.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang a worker thread.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {image_url}: {e}")
        return

    # Only touch the filesystem once the full body has been received, so a
    # failed request never leaves an empty file behind.
    with open(save_path, 'wb') as file:
        file.write(response.content)



def get_image_urls(property_url, timeout=30):
    """Return the full-screen photo URLs for one property.

    The property ID is taken from the last path segment of ``property_url``
    and passed to the ``aboveTheFold`` endpoint under ``base_url``.

    Args:
        property_url: Property page URL whose last path segment is the
            property ID. A trailing slash is tolerated.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A list with one ``fullScreenPhotoUrl`` string per photo.

    Raises:
        Exception: If the response status is not 200, or if the response
            contains no photo list.
        requests.exceptions.RequestException: If the request itself fails
            (including a timeout).
    """
    base_url = "https://www.redfin.com"
    # Strip trailing slashes first; otherwise ".../home/12345/" would yield
    # an empty property ID.
    property_id = property_url.rstrip("/").split("/")[-1]

    headers = {
        'User-Agent': 'google',
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }

    response = requests.get(f"{base_url}/stingray/api/home/details/aboveTheFold",
                            headers=headers,
                            params={'propertyId': property_id, 'accessLevel': 1},
                            timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Status code is not 200. Got {response.status_code} instead.\n"
                        f"Property ID: {property_id}\n"
                        f"Data: {response.text}\n"
                        f"Status text: {response.reason}")

    # The first four characters of the body are skipped before JSON parsing.
    # NOTE(review): presumably a non-JSON guard prefix sent by the API --
    # confirm against a live response.
    data = response.text[4:]
    json_data = json.loads(data)

    # Walk the nested structure defensively so that any missing level ends in
    # the same "No data found" error instead of a bare KeyError.
    payload = json_data.get('payload') if isinstance(json_data, dict) else None
    media_info = (payload or {}).get('mediaBrowserInfo') or {}
    above_the_fold_details = media_info.get('photos')

    if not above_the_fold_details:
        raise Exception("No data found")

    image_urls = [photo['photoUrls']['fullScreenPhotoUrl'] for photo in above_the_fold_details]

    return image_urls



def download_images_for_properties(property_urls, save_dir):
    """Download every photo of every listed property into ``save_dir``.

    Photo URLs are looked up one property at a time in the calling thread;
    the individual image downloads are then run on a thread pool. Files are
    named ``<property_id>_<index>.jpg``. Any error for one property or one
    image is printed and does not stop the rest of the batch.

    Args:
        property_urls: Iterable of property page URLs whose last path
            segment is the property ID (a trailing slash is tolerated).
        save_dir: Directory to write images into; created if missing.
    """
    # Make sure the target exists so workers do not all fail on open().
    # An empty string means "current directory" and needs no creation.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []

        for property_url in property_urls:
            # Strip trailing slashes so the ID is never empty; an empty ID
            # would make files from different properties collide.
            property_id = property_url.rstrip("/").split("/")[-1]
            try:
                image_urls = get_image_urls(property_url)
            except Exception as e:
                # Best effort: report and move on to the next property.
                print(e)
                continue

            for i, url in enumerate(image_urls):
                filename = f"{property_id}_{i}.jpg"
                save_path = os.path.join(save_dir, filename)
                futures.append(executor.submit(download_image, url, save_path))

        # Surface anything a worker raised (e.g. a file-write error).
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(e)


In [5]:

# Property page URLs to process; fill this list in before running the cell.
redfin_urls = []

# Folder that receives the downloaded photos.
save_dir = "redfin_images"
os.makedirs(save_dir, exist_ok=True)
download_images_for_properties(redfin_urls, save_dir)

In [4]:
import os
import hashlib

def file_hash(filepath, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in fixed-size chunks so memory use stays bounded
    regardless of file size. MD5 is used here only as a content
    fingerprint for duplicate detection, not for security.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration.

    Returns:
        The 32-character lowercase hex digest.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops when read() returns b"" at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def remove_duplicates(directory):
    """Delete ``.jpg`` files in ``directory`` whose contents duplicate another.

    Files are compared by the digest returned from ``file_hash``. Names are
    visited in sorted order, so for each group of identical files the
    lexicographically first name is kept and the rest are deleted.
    The number of deleted files and a completion message are printed.

    Args:
        directory: Folder to scan (not recursive).
    """
    seen_hashes = set()
    duplicates = []

    # os.listdir() returns names in arbitrary order; sorting makes the choice
    # of which copy survives reproducible between runs and platforms.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.jpg'):
            continue

        filepath = os.path.join(directory, filename)
        filehash = file_hash(filepath)

        if filehash in seen_hashes:
            duplicates.append(filepath)
        else:
            seen_hashes.add(filehash)

    # Delete only after the scan so hashing never races with removal.
    for filepath in duplicates:
        os.remove(filepath)
    print(len(duplicates))

    print("Duplicate removal complete.")

# Deduplicate the downloaded photos in place.
directory = 'redfin_images'
remove_duplicates(directory)




293
Duplicate removal complete.
