<h2>Project</h2>



In [7]:
import datetime
import getpass
import json
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [8]:
# Read the Google API key interactively so it is never stored in the notebook.
# An explicit prompt replaces getpass's generic "Password: " so it is clear which secret is expected.
API_KEY = getpass.getpass("Google Routes API key: ")

 ········


In [110]:
# Listing page whose ad cards are scraped
# (the sp277 query parameter is presumably a location filter — TODO confirm).
url = 'https://www.ad.co.il/nadlanrent?sp277=22772,17836'
# Template for a single ad page; {0} is filled with the ad's data-id.
ad_url = 'https://www.ad.co.il/ad/{0}'
# Browser-like User-Agent sent with every request to the site.
headers = {"User-Agent": "Mozilla/5.0"}

In [111]:
def get_value_by_label(ad_soup, label):
    """Return the text of the table cell that follows the first cell containing `label`.

    Args:
        ad_soup: Parsed ad page; only its ``find_all('td')`` result is used.
        label: Substring to look for in a cell's stripped text.

    Returns:
        The stripped text of the cell immediately after the first matching
        cell, or ``''`` when no cell matches or the only match is the very
        last cell (which has no successor).
    """
    cells = ad_soup.find_all('td')
    # Walk adjacent (label cell, value cell) pairs; the final cell is never
    # a candidate label because nothing follows it.
    for label_cell, value_cell in zip(cells, cells[1:]):
        if label in label_cell.get_text(strip=True):
            return value_cell.get_text(strip=True)
    return ''

In [112]:
def extract_feature(ad_soup, label_text):
    """Report whether the ad page shows the given feature as enabled.

    The feature is located by the first ``<span>`` whose string contains
    `label_text`; it counts as enabled when that span has an ancestor with
    the ``card-icon`` class and that ancestor's class list does not include
    ``disabled``.

    Args:
        ad_soup: Parsed ad page (BeautifulSoup object).
        label_text: Substring identifying the feature's label.

    Returns:
        bool: ``True`` when the feature is present and not disabled,
        ``False`` otherwise. Always a real ``bool`` (the span-without-icon
        case previously leaked ``None``).
    """
    feature = ad_soup.find('span', string=lambda text: text and label_text in text)
    if feature is None:
        return False

    parent_div = feature.find_parent(class_='card-icon')
    if parent_div is None:
        # Label found but not inside a feature icon: treat as "not available".
        return False

    return 'disabled' not in parent_div.get('class', [])

In [113]:
def _int_if_digits(text, default=None):
    """Return ``int(text)`` when `text` consists only of digits, otherwise `default`."""
    return int(text) if text.isdigit() else default


def _floor_component(text):
    """Convert one side of an 'X מתוך Y' floor string to int, else keep the raw text (None if blank)."""
    text = text.strip()
    if text.isdigit():
        return int(text)
    return text or None


def _parse_floor(floor_text):
    """Split the floor cell text into a ``(floor, total_floors)`` pair."""
    is_ground = 'קרקע' in floor_text
    has_total = 'מתוך' in floor_text
    parts = floor_text.split('מתוך')

    if is_ground:
        floor = 0
    elif has_total:
        floor = _floor_component(parts[0])
    else:
        floor = None

    if has_total:
        # Use the stated total even for ground-floor ads ("קרקע מתוך 4" -> 4).
        total_floors = _floor_component(parts[-1])
    elif is_ground:
        total_floors = 0
    else:
        total_floors = None

    return floor, total_floors


def _days_until_entry(entry_text):
    """Return whole calendar days from today until the entry date, 0 for 'immediate', None if unknown."""
    if entry_text in ('מיידי', 'מיידית'):
        return 0
    if not entry_text:
        return None
    try:
        entry_date = datetime.datetime.strptime(entry_text, "%d/%m/%Y").date()
    except ValueError:
        # Free-text entry dates cannot be converted to a day count.
        return None
    return (entry_date - datetime.date.today()).days


def extract_ad_data(ad_soup):
    """Build one flat record (dict) from a parsed ad page.

    Text fields come from the label/value table via ``get_value_by_label``,
    boolean features from ``extract_feature`` (stored as 0/1), and the
    description, price and image count from CSS selectors.

    Args:
        ad_soup: Parsed ad page (BeautifulSoup object).

    Returns:
        dict with the keys listed below, in this order. ``floor`` and
        ``total_floors`` are ints when the text is numeric; ``days_to_enter``
        is a calendar-day difference from today (negative for past dates);
        ``description`` and ``price`` are ``None`` when their element is
        missing; ``distance_from_center`` is left as ``''`` for the caller
        to fill in.

    Raises:
        ValueError: if the rooms, built-area or payments-per-year value is
            missing or not numeric, or the price element holds non-numeric
            text. ``scrape_ad_data`` catches this and skips the ad.
    """
    def field(label):
        return get_value_by_label(ad_soup, label)

    def has(label):
        return int(bool(extract_feature(ad_soup, label)))

    # Each label is looked up once instead of once per use.
    floor, total_floors = _parse_floor(field('קומה'))
    description_tag = ad_soup.select_one('.single-product-tab p')
    price_tag = ad_soup.select_one('.card-title + h2')

    if price_tag is not None:
        price_text = price_tag.get_text(strip=True)
        price = float(price_text.replace('₪', '').replace(',', '').strip())
    else:
        price = None

    data = {
        'property_type': field('פרטי הנכס'),
        'neighborhood': field('שכונה'),
        'address': field('כתובת'),
        'room_num': float(field('חדרים')),
        'floor': floor,
        'area': int(field('שטח בנוי')),
        'garden_area': _int_if_digits(field('שטח גינה'), default=0),
        'days_to_enter': _days_until_entry(field('תאריך כניסה')),
        'num_of_payments': int(field('תשלומים בשנה')),
        'monthly_arnona': _int_if_digits(field('ארנונה בחודש')),
        'building_tax': _int_if_digits(field('ועד בית בחודש')),
        'total_floors': total_floors,
        'description': description_tag.get_text(strip=True) if description_tag is not None else None,
        'has_parking': has('חניה'),
        # Key spellings below ('has_stotsge', 'has_balcon') are kept as-is so
        # existing column names do not change.
        'has_stotsge': has('מחסן'),
        'elevator': has('מעלית'),
        'ac': has('מזגן'),
        'handicap': has('נגישות'),
        'has_bars': has('סורגים'),
        'has_safe_room': has('ממ'),
        'has_balcon': has('מרפסת'),
        'is_furnished': has('מרוהטת'),
        'is_renovated': has('משופצת'),
        'price': price,
        'num_of_images': len(ad_soup.select('#product_slider .swiper-slide img')),
        'distance_from_center': '',  # Filled in by the caller via the routes API
    }
    return data

In [114]:
def get_distance_from_center(origin_address, api_key=API_KEY, timeout=10):
    """Return the driving distance in kilometers from an address to Dizengoff Square, Tel Aviv.

    Sends one ``computeRoutes`` request to the Google Routes API and reads
    ``distanceMeters`` of the first returned route.

    Args:
        origin_address: Free-text address of the starting point.
        api_key: Google API key sent in the ``X-Goog-Api-Key`` header. The
            default is captured when this function is defined, so re-run
            the definition after changing ``API_KEY``.
        timeout: Seconds to wait for the HTTP request before giving up, so
            an unresponsive server cannot hang the scrape indefinitely.

    Returns:
        float distance in kilometers, or ``None`` when the address is blank,
        the API returns a non-200 status or no routes, or any error occurs
        (errors are printed, never raised).
    """
    try:
        if not origin_address or not origin_address.strip():
            # Nothing to geocode; skip the API call.
            return None

        endpoint = "https://routes.googleapis.com/directions/v2:computeRoutes"

        request_headers = {
            "Content-Type": "application/json",
            "X-Goog-Api-Key": api_key,
            "X-Goog-FieldMask": "routes.distanceMeters,routes.duration"
        }

        payload = {
            "origin": {
                "address": origin_address
            },
            "destination": {
                "address": "כיכר דיזינגוף תל אביב"
            },
            "travelMode": "DRIVE",
            "routingPreference": "TRAFFIC_AWARE"
        }

        response = requests.post(endpoint, json=payload, headers=request_headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Error response from API: {response.status_code} - {response.text}")
            return None

        routes = response.json().get("routes")
        if not routes:
            return None
        return routes[0]["distanceMeters"] / 1000  # meters -> kilometers
    except Exception as e:
        # Best-effort lookup: a failed distance must not abort the whole scrape.
        print(f"Error in get_distance_from_center: {e}")
        return None

In [115]:
def scrape_ad_data(limit = None, delay = 1, timeout = 30):
    """Scrape the listing page and every ad it links to into a list of records.

    Collects the ``data-id`` of each card under ``#cards`` on the listing
    page, downloads each ad page, extracts its fields with
    ``extract_ad_data`` and adds the driving distance from the center via
    ``get_distance_from_center``.

    Args:
        limit: Maximum number of ads to process; ``None`` processes all of
            them and ``0`` processes none.
        delay: Seconds to pause between consecutive ad requests.
        timeout: Seconds to wait for each HTTP request to the site.

    Returns:
        list of dicts, one per ad that was processed successfully. Ads that
        fail (HTTP error, parsing error) are reported with ``print`` and
        skipped.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched, since there is nothing to scrape in that case.
    """
    response = requests.get(url, headers = headers, timeout = timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    data_ids = [div['data-id'] for div in soup.select('#cards div[data-id]')]

    if limit is not None:
        data_ids = data_ids[:limit]

    records = []
    for index, data_id in enumerate(data_ids):
        # Pause before every request except the first, so failed ads are
        # throttled too and the site is never hit back-to-back.
        if index and delay:
            time.sleep(delay)

        url_id = ad_url.format(data_id)
        try:
            ad_res = requests.get(url_id, headers = headers, timeout = timeout)
            # Do not parse an HTTP error page as if it were an ad.
            ad_res.raise_for_status()
            ad_soup = BeautifulSoup(ad_res.text, 'lxml')
            record = extract_ad_data(ad_soup)

            full_address = f"{record['address']} {record['neighborhood']} תל אביב"
            record['distance_from_center'] = get_distance_from_center(full_address, API_KEY)

            records.append(record)
        except Exception as e:
            # Best-effort: one bad ad must not abort the whole scrape.
            print(f"Failed to process {url_id}: {e}")

    return records

In [117]:
# Run the full scrape and load the resulting records into a DataFrame.
df = pd.DataFrame(scrape_ad_data())  # no limit is passed, so every listing found is scraped; use scrape_ad_data(limit=10) for a quick trial

In [118]:
# Save the scraped records without the index column. 'utf-8-sig' writes a
# UTF-8 byte-order mark (presumably so spreadsheet tools detect the encoding
# and display the Hebrew text correctly — confirm with the consumer of the file).
df.to_csv('apartments_data.csv', index=False, encoding='utf-8-sig')