In [2]:
import json
import time

import pandas as pd
import requests
from lxml import etree

# Fixed part of every URL requested by this script
base_url = 'http://qd.lianjia.com'

# HTTP request headers: a desktop Chrome User-Agent sent with every request
# (presumably so the site serves its regular HTML pages - confirm)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

# Fetch the second-hand listing landing page and collect the relative link of
# every area shown in its position filter.
district_url = base_url + '/ershoufang'
# The timeout keeps a stalled connection from hanging the script indefinitely.
district_response = requests.get(url=district_url, headers=headers, timeout=10)
district_html = district_response.content
district_encoding = district_response.encoding
# Parse the raw bytes using the encoding reported by the response.
district_parsed = etree.HTML(district_html, parser=etree.HTMLParser(encoding=district_encoding))
areas_links = district_parsed.xpath('//div[@class="position"]/dl[2]/dd/div[1]/div/a/@href')

print(areas_links)

# Seconds to wait for any single HTTP response; without a timeout one stalled
# connection would hang the whole crawl.
REQUEST_TIMEOUT = 10

# Upper bound on list pages fetched per area. Kept at 1 to match the previous
# hard-coded single-page crawl; raise it to walk further into each area.
MAX_PAGES_PER_AREA = 1

# Crawl each area: read its page count, scrape the list pages and every linked
# detail page, then append the collected rows to Raw_data.csv.
for area_link in areas_links:
    print('Scraping area:', area_link)
    start_time = time.time()

    # Column buffers for this area; every list holds one entry per listing.
    areas = []
    prices = []
    house_details = []
    follow_details = []
    price_per_unit = []
    features = []
    transaction_details = []
    basic_details = []

    current_area_count = 0
    full_url = base_url + area_link
    try:
        area_response = requests.get(url=full_url, headers=headers, timeout=REQUEST_TIMEOUT)
        area_html = area_response.content
        area_encoding = area_response.encoding
        area_parsed = etree.HTML(area_html, parser=etree.HTMLParser(encoding=area_encoding))
        page_data_results = area_parsed.xpath("//div[@class='contentBottom clear']/div[@class='page-box fr']//@page-data")
    except Exception as e:
        # A failure on one area must not abort the remaining areas.
        print(f"Error loading area {area_link}: {str(e)}")
        continue

    if not page_data_results:
        print("No second-hand houses in this area.")
        continue

    try:
        # The attribute comes from a remote page, so it is parsed as data
        # (assumed to be JSON - confirm) and never executed with eval().
        total_pages = int(json.loads(page_data_results[0])['totalPage'])
    except (ValueError, KeyError, TypeError) as e:
        print(f"Could not read page count for area {area_link}: {str(e)}")
        continue

    last_page = min(total_pages, MAX_PAGES_PER_AREA)
    for i in range(1, last_page + 1):
        # Page-local buffers: they are merged into the area buffers only when
        # the whole page parsed cleanly, so a failure half-way through a page
        # can never leave the columns with different lengths.
        page_areas = []
        page_prices = []
        page_house_details = []
        page_follow_details = []
        page_price_per_unit = []
        page_features = []
        page_transaction_details = []
        page_basic_details = []

        try:
            page_url = base_url + area_link + 'pg' + str(i) + '/'
            response = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
            html_content = response.content
            time.sleep(1)  # pause between requests
            encoding = response.encoding
            parsed_content = etree.HTML(html_content, parser=etree.HTMLParser(encoding=encoding))

            # Extract total price of houses
            for item in parsed_content.xpath('//div[@class="priceInfo"]'):
                page_prices.append(item.xpath('.//span/text()')[0])

            # Extract price per unit
            for item in parsed_content.xpath('//div[@class="unitPrice"]'):
                page_price_per_unit.append(item.xpath('.//span/text()')[0])

            # Extract house details and feature tags
            for item in parsed_content.xpath('//div[@class="info clear"]'):
                # Exactly one features entry per listing (an empty list when
                # the listing has no tags), so this column stays aligned with
                # the others.
                page_features.append(item.xpath('.//div[@class="tag"]//span/text()'))
                page_house_details.append('|'.join([
                    item.xpath('.//div[@class="positionInfo"]//a/text()')[0],
                    item.xpath('.//div[@class="positionInfo"]//a[2]/text()')[0],
                    item.xpath('.//div[@class="houseInfo"]//text()')[0],
                ]))

            # Extract basic and transaction details from each listing's own page
            house_urls = parsed_content.xpath('//div[@class="info clear"]//div[@class="title"]/a/@href')
            for house_url in house_urls:
                house_response = requests.get(url=house_url, headers=headers, timeout=REQUEST_TIMEOUT)
                house_html = house_response.content
                time.sleep(1)  # pause between requests
                house_encoding = house_response.encoding
                house_parsed = etree.HTML(house_html, parser=etree.HTMLParser(encoding=house_encoding))
                transaction_data = house_parsed.xpath('//div[@class="transaction"]//div[@class="content"]//ul//li//span[2]//text()')
                basic_data = house_parsed.xpath('//div[@class="base"]//div[@class="content"]//ul//li/text()')
                page_transaction_details.append(transaction_data)
                page_basic_details.append(basic_data)
                page_areas.append(area_link)

                # Progress report every 10 detail pages fetched in this area
                scraped_so_far = current_area_count + len(page_areas)
                if scraped_so_far % 10 == 0:
                    elapsed_time = time.time() - start_time
                    print(f"Current area: {area_link}. {scraped_so_far} entries scraped. Elapsed time: {elapsed_time:.2f} seconds.")

            # Extract follow details
            for item in parsed_content.xpath('//div[@class="followInfo"]'):
                page_follow_details.append(item.xpath('./text()')[0])

        except Exception as e:
            # Best effort: report the page, drop its partial data, keep going.
            print(f"Error on page {i} for area {area_link}: {str(e)}")
        else:
            page_columns = [
                page_areas,
                page_house_details,
                page_follow_details,
                page_prices,
                page_price_per_unit,
                page_features,
                page_basic_details,
                page_transaction_details,
            ]
            # pd.DataFrame raises on columns of unequal length, and silently
            # shifted rows would be worse; skip a page whose columns disagree.
            if len({len(column) for column in page_columns}) > 1:
                print(f"Skipping page {i} for area {area_link}: extracted columns have different lengths.")
                continue

            areas.extend(page_areas)
            house_details.extend(page_house_details)
            follow_details.extend(page_follow_details)
            prices.extend(page_prices)
            price_per_unit.extend(page_price_per_unit)
            features.extend(page_features)
            basic_details.extend(page_basic_details)
            transaction_details.extend(page_transaction_details)
            current_area_count += len(page_areas)

    # Save to a DataFrame and then append to the shared CSV for every area.
    # NOTE: rows are appended without a header row, as before.
    house_df = pd.DataFrame({
        'area': areas,
        'house_details': house_details,
        'follow_details': follow_details,
        'prices': prices,
        'price_per_unit': price_per_unit,
        'features': features,
        'basic_details': basic_details,
        'transaction_details': transaction_details,
    })
    house_df.to_csv("Raw_data.csv", mode='a', header=False, encoding='utf-8', index=False)
    elapsed_time = time.time() - start_time
    print(f"Scraping for area {area_link} finished. {current_area_count} entries in total. Total time taken: {elapsed_time:.2f} seconds.")


['/ershoufang/shinan/', '/ershoufang/shibei/', '/ershoufang/licang/', '/ershoufang/laoshan/', '/ershoufang/huangdao/', '/ershoufang/chengyang/', '/ershoufang/jiaozhou/', '/ershoufang/jimo/', '/ershoufang/pingdu/', '/ershoufang/laixi/']
Scraping area: /ershoufang/shinan/
Current area: /ershoufang/shinan/. 10 entries scraped.
Current area: /ershoufang/shinan/. 20 entries scraped.
Current area: /ershoufang/shinan/. 30 entries scraped.
Scraping for area /ershoufang/shinan/ finished. 30 entries in total.
Scraping area: /ershoufang/shibei/
Current area: /ershoufang/shibei/. 10 entries scraped.
Current area: /ershoufang/shibei/. 20 entries scraped.
Current area: /ershoufang/shibei/. 30 entries scraped.
Scraping for area /ershoufang/shibei/ finished. 30 entries in total.
Scraping area: /ershoufang/licang/
Current area: /ershoufang/licang/. 10 entries scraped.
Current area: /ershoufang/licang/. 20 entries scraped.
Current area: /ershoufang/licang/. 30 entries scraped.
Scraping for area /ershouf