# Scrape redfin.com

In [10]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def fetch_listing_links_from_page(url, timeout=30):
    """Collect the listing URLs linked from one search-results page.

    Every ``<a href=...>`` element whose target contains ``/home/`` is
    treated as a listing link. Relative targets are resolved against
    ``https://www.redfin.com``; absolute targets are returned unchanged.

    Args:
        url: Address of the search-results page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute listing URLs in document order (repeats are
        kept). The list is empty when the page could not be fetched; the
        failure is reported on stdout instead of being raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Treat transport errors like a non-200 answer: report and move on.
        print(f"Failed to fetch the page: {url} ({e})")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # urljoin copes with root-relative, protocol-relative and already
    # absolute hrefs, which plain string concatenation does not.
    return [
        urljoin("https://www.redfin.com", anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if "/home/" in anchor['href']
    ]

# Root of the search results; individual result pages are requested as
# "<base_url>/page-<n>".
base_url = "https://www.redfin.com/city/12839/DC/Washington-DC"
links = []

# range(1, 25) requests pages 1 through 24 (the upper bound is exclusive).
for page in range(1, 25):
    print(f"Fetching page {page}")
    page_url = f"{base_url}/page-{page}"
    links.extend(fetch_listing_links_from_page(page_url))

# dict.fromkeys drops repeated URLs while keeping first-seen order, so a
# re-run lists the listings in the same order; list(set(...)) does not
# guarantee any particular order between runs.
links = list(dict.fromkeys(links))
print(f"Found {len(links)} unique listings:")
for link in links:
    print(link)


Fetching page 1
Fetching page 2
Fetching page 3
Fetching page 4
Fetching page 5
Fetching page 6
Fetching page 7
Fetching page 8
Fetching page 9
Fetching page 10
Fetching page 11
Fetching page 12
Fetching page 13
Fetching page 14
Fetching page 15
Fetching page 16
Fetching page 17
Fetching page 18
Fetching page 19
Fetching page 20
Fetching page 21
Fetching page 22
Fetching page 23
Fetching page 24
Found 379 unique listings:
https://www.redfin.com/DC/Washington/1099-22nd-St-NW-20037/unit-409/home/9053934
https://www.redfin.com/DC/Washington/70-N-St-SE-20024/unit-308/home/174050148
https://www.redfin.com/DC/Washington/1732-Montello-Ave-NE-20002/unit-7/home/180540591
https://www.redfin.com/DC/Washington/212-Oakwood-St-SE-20032/unit-122/home/12529507
https://www.redfin.com/DC/Washington/3214-Reservoir-Rd-NW-20007/home/9930652
https://www.redfin.com/DC/Washington/3462-23rd-St-SE-20020/home/10161995
https://www.redfin.com/DC/Washington/5440-Nebraska-Ave-NW-20015/home/9980717
https://www.redfin

In [11]:
import requests
import json
import concurrent.futures
import requests
import os

def download_image(image_url, save_path, timeout=30):
    """Download one image and write its bytes to ``save_path``.

    Request failures (connection errors, timeouts, non-success HTTP
    statuses) are reported on stdout and swallowed; because the status is
    checked before the file is opened, no file is created in that case.
    Errors raised while writing the file are not caught here.

    Args:
        image_url: Address of the image to fetch.
        save_path: Destination file path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        # Without a timeout a stalled connection would block the caller
        # indefinitely.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            file.write(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {image_url}: {e}")



def get_image_urls(property_url, timeout=30):
    """Return the full-screen photo URLs reported for one property.

    The property id is the last path segment of ``property_url``; it is
    sent as ``propertyId`` to the
    ``/stingray/api/home/details/aboveTheFold`` endpoint, and the
    ``fullScreenPhotoUrl`` of every photo in the response is collected.

    Args:
        property_url: Listing URL whose last path segment is the property
            id (a trailing slash is tolerated).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A non-empty list of photo URL strings.

    Raises:
        Exception: If the response status is not 200, or if the response
            carries no usable photo entries ("No data found").
    """
    base_url = "https://www.redfin.com"
    # rstrip so ".../home/123/" still yields "123" rather than "".
    property_id = property_url.rstrip("/").split("/")[-1]

    headers = {
        'User-Agent': 'google',
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }

    response = requests.get(f"{base_url}/stingray/api/home/details/aboveTheFold",
                            headers=headers,
                            params={'propertyId': property_id, 'accessLevel': 1},
                            timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Status code is not 200. Got {response.status_code} instead.\n"
                        f"Property ID: {property_id}\n"
                        f"Data: {response.text}\n"
                        f"Status text: {response.reason}")

    # The first four characters are skipped before JSON parsing --
    # presumably a non-JSON prefix the endpoint prepends; verify against a
    # live response.
    json_data = json.loads(response.text[4:])

    # Walk payload -> mediaBrowserInfo -> photos defensively so that a
    # missing or null level is reported as "No data found" instead of a
    # bare KeyError/TypeError.
    payload = json_data.get('payload') or {}
    media_info = payload.get('mediaBrowserInfo') or {}
    photos = media_info.get('photos') or []

    image_urls = []
    for photo in photos:
        full_screen_url = (photo.get('photoUrls') or {}).get('fullScreenPhotoUrl')
        if full_screen_url:
            image_urls.append(full_screen_url)

    if not image_urls:
        raise Exception("No data found")

    return image_urls



def download_images_for_properties(property_urls, save_dir):
    """Download every photo of every listed property into ``save_dir``.

    For each property URL the photo URLs are looked up with
    ``get_image_urls`` (one property at a time, in the calling thread) and
    each photo is handed to a thread pool that runs ``download_image``.
    Files are named ``<property_id>_<index>.jpg``, where the property id is
    the last path segment of the property URL.

    A failure for one property or one download is printed together with
    the property URL or target path it belongs to, and processing
    continues with the rest.

    Args:
        property_urls: Iterable of listing URLs.
        save_dir: Directory that receives the image files; it is created
            if it does not exist yet.
    """
    if save_dir:
        # Create the target up front so every download can open its file.
        os.makedirs(save_dir, exist_ok=True)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Maps each pending download to its target path for error reports.
        pending = {}

        for property_url in property_urls:
            # rstrip so a trailing slash does not produce an empty id.
            property_id = property_url.rstrip("/").split("/")[-1]
            try:
                image_urls = get_image_urls(property_url)
                for i, url in enumerate(image_urls):
                    filename = f"{property_id}_{i}.jpg"
                    save_path = os.path.join(save_dir, filename)
                    pending[executor.submit(download_image, url, save_path)] = save_path
            except Exception as e:
                # Name the property: a bare message such as "No data found"
                # gives no hint which listing was skipped.
                print(f"{property_url}: {e}")

        for future in concurrent.futures.as_completed(pending):
            try:
                future.result()
            except Exception as e:
                print(f"{pending[future]}: {e}")


In [12]:


    
# Folder that receives the downloaded photos; created on first use.
save_dir = "redfin_images"
os.makedirs(name=save_dir, exist_ok=True)

# Fetch the photos of every listing collected in `links`.
download_images_for_properties(property_urls=links, save_dir=save_dir)

No data found
No data found
No data found
No data found
No data found


In [13]:
import os
import hashlib

def file_hash(filepath, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in blocks of ``chunk_size`` bytes, so memory use does
    not grow with the size of the file. The digest is the same as hashing
    the whole content at once.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes read per step (default 1 MiB).

    Returns:
        The digest as a 32-character lowercase hex string.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b''.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def remove_duplicates(directory):
    """Delete ``.jpg`` files in ``directory`` whose content repeats another's.

    Files are compared by the digest returned from ``file_hash``. They are
    visited in sorted name order, so of several identical files the one
    whose name sorts first is kept and the others are deleted. Entries that
    do not end in ``.jpg`` or are not regular files are ignored, and
    sub-directories are not searched.

    The number of deleted files and a completion message are printed.

    Args:
        directory: Path of the folder to clean up.
    """
    seen_hashes = set()
    duplicates = []

    # os.listdir returns names in arbitrary order; sorting makes the choice
    # of which copy survives repeatable.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.jpg'):
            continue
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # e.g. a folder that happens to be named "*.jpg"
            continue

        filehash = file_hash(filepath)
        if filehash in seen_hashes:
            duplicates.append(filepath)
        else:
            seen_hashes.add(filehash)

    # Delete only after the scan, so hashing never touches a removed file.
    cnt = 0
    for filepath in duplicates:
        os.remove(filepath)
        cnt += 1
    print(cnt)

    print("Duplicate removal complete.")

# Folder holding the downloaded photos; repeated images are deleted in place.
directory = 'redfin_images'
remove_duplicates(directory=directory)




212
Duplicate removal complete.
