In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
from random import randint, random
from time import sleep

def try_get_page_from_server(page_num, timeout=30):
    """
    Download one page of search results from cian.ru and parse it.

    Parameters
    ----------
    page_num : int
        Page number substituted into the `p=` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        `requests.get`). Without a timeout a stalled connection would block
        the notebook forever.

    Returns
    -------
    BeautifulSoup or None
        Parsed page on HTTP 200; None on any other status code or when the
        request itself fails (connection error, timeout, ...).
    """
    # Sleep a random amount of time (0 to ~4 s) so that our IP does not get banned.
    sleep(randint(0, 3) + random())

    link = f'https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&object_type%5B0%5D=100&offer_type=flat&p={page_num}&region=1&room1=1'
    try:
        # NOTE(review): 'Browser' is not a standard HTTP header; 'User-Agent' was
        # probably intended. Left unchanged so the request sent stays the same - confirm.
        page = requests.get(link, headers={'Browser': 'Firefox'}, timeout=timeout)
    except requests.RequestException as err:
        # Report failure the same way as a bad status code, so the caller's
        # loop stops cleanly and keeps everything scraped so far.
        print(f'Request for page {page_num} failed: {err}')
        return None

    if page.status_code != 200:
        return None
    return BeautifulSoup(page.content, 'html.parser')
    

In [None]:
def try_find_tag(soup_object, tag, params, v=False):
    """
    Return the text of the first `tag` element matching `params`.

    Intended for simple tags that can be located by their attributes
    (class, data-mark, ...).

    Parameters
    ----------
    soup_object : object with a `find(tag, params)` method
        Node to search in (a BeautifulSoup / Tag object in this notebook).
    tag : str
        Tag name to look for.
    params : dict
        Attribute filter forwarded to `find`.
    v : bool, optional
        If True, print the exception that prevented extraction.

    Returns
    -------
    The `.text` of the found element. If the lookup itself fails or nothing
    is found, None. If an element was found but reading `.text` failed, the
    element itself is returned (unchanged legacy behaviour).
    """
    # Pre-bind the result so the except branch can always return it; before,
    # a failure inside `find` (e.g. soup_object is None) left `res` unbound
    # and the handler itself raised UnboundLocalError.
    res = None
    try:
        res = soup_object.find(tag, params)
        return res.text
    except Exception as E:
        if v:
            print(E)
        return res
    
def try_find_places(soup_object, tag, params):
    """
    Extract the administrative district, the area and the metro station.

    Every element returned by `soup_object.find_all(tag, params)` is
    classified by the first marker its text contains, checked in this order:
    "АО" -> district, "р-н" -> area, "м." -> metro. When several elements
    fall into the same category the last one wins; categories that never
    match stay None.

    Returns
    -------
    tuple
        (district, area, metro)
    """
    # Order matters: it reproduces the precedence of an if/elif chain.
    markers = (("АО", "district"), ("р-н", "area"), ("м.", "metro"))
    found = {"district": None, "area": None, "metro": None}

    for node in soup_object.find_all(tag, params):
        label = node.text
        for marker, slot in markers:
            if marker in label:
                found[slot] = label
                break  # only the first matching marker counts

    return found["district"], found["area"], found["metro"]


In [None]:
def get_page_data(soup, global_data_list):
    """
    Parse one search-results page and append one row per listing.

    Listing cards and agent/developer side blocks are collected separately
    and paired positionally with `zip`, so iteration stops at the shorter of
    the two sequences. For every pair a 15-element list is appended to
    `global_data_list` in this order:
    nrooms, meters, floor, floors_total, flat_or_appart, zh_comp, deadline,
    to_metro, district, area, metro, mainprice, price_per_sq_m, otdelka,
    developer. A field that cannot be extracted is stored as None.

    The function returns nothing; `global_data_list` is mutated in place.
    """

    def _tag_text(node, tag, attrs):
        # Text of the first matching tag, or None if the lookup raises.
        try:
            return try_find_tag(node, tag, attrs)
        except Exception:
            return None

    def _amount(node, tag, attrs):
        # Tag text with its last whitespace-separated token dropped and the
        # remaining tokens glued together; None if anything fails.
        try:
            tokens = try_find_tag(node, tag, attrs).split()
            return "".join(tokens[:-1])
        except Exception:
            return None

    cards = soup.find_all("div", attrs={'class': "_93444fe79c--container--kZeLu _93444fe79c--link--DqDOy"})
    asides = soup.find_all("div", attrs={'class': "_93444fe79c--container--kZeLu _93444fe79c--agent-cont--iStzo"})

    for flat_info, aside_block in zip(cards, asides):

        # Rooms, floor area and floors come from one "a, b, c" string.
        try:
            summary = try_find_tag(flat_info, "span", {'class': ""})
            rooms_part, area_part, floor_part = summary.split(', ')

            nrooms = rooms_part.split('-')[0]  # number of rooms
            meters = area_part.split()[0]  # floor area

            # 'F' = flat, 'A' = apartment, None = neither marker present
            flat_or_appart = 'F' if "кв." in rooms_part else ('A' if "апарт." in rooms_part else None)

            # "<floor>/<floors in building> ..." -> the two parts around '/'
            floor, floors_total = floor_part.split()[0].split('/')
        except Exception:
            # Any failure invalidates the whole group of fields.
            nrooms = meters = floor = floors_total = flat_or_appart = None

        # Residential complex
        zh_comp = _tag_text(flat_info, "a", {'class': "_93444fe79c--jk--dIktL"})

        # Whether the building is finished / completion deadline
        deadline = _tag_text(flat_info, "span", {'data-mark': "Deadline"})

        # Time to the metro
        to_metro = _tag_text(flat_info, "div", {'class': "_93444fe79c--remoteness--q8IXp"})

        # District, area, metro
        try:
            district, area, metro = try_find_places(flat_info, "a", {'class': "_93444fe79c--link--NQlVc"})
        except Exception:
            district = area = metro = None

        # Price
        mainprice = _amount(flat_info, "span", {'data-mark': "MainPrice"})

        # Price per square metre
        price_per_sq_m = _amount(flat_info, "p", {'data-mark': "PriceInfo"})

        # With or without finishing (mostly without)
        otdelka = _tag_text(flat_info, "span", {'class': "_93444fe79c--label--fCs_v _93444fe79c--color_black_100--kPHhJ _93444fe79c--background_gray6_100--VAXKA"})

        # Developer (taken from the side block, not from the listing card)
        developer = _tag_text(aside_block, "span", {'class': "_93444fe79c--color_current_color--gpi6p _93444fe79c--lineHeight_6u--A1GMI _93444fe79c--fontWeight_bold--ePDnv _93444fe79c--fontSize_16px--RB9YW _93444fe79c--display_block--pDAEx _93444fe79c--text--g9xAG"})

        row = [
            nrooms, meters, floor, floors_total, flat_or_appart,
            zh_comp, deadline, to_metro,
            district, area, metro,
            mainprice, price_per_sq_m, otdelka, developer,
        ]
        global_data_list.append(row)

In [None]:
# Accumulates one row (list of fields) per listing across all pages.
data_all = []

# Walk the result pages starting from 1 until the server stops handing out
# a page (try_get_page_from_server returns None).
page = 1
while True:
    soup = try_get_page_from_server(page)
    if soup is None:
        break
    print(f'Parsing page num {page}')
    get_page_data(soup, data_all)
    page += 1

In [None]:
# Number of listing rows collected by the scraping loop.
len(data_all)

In [None]:
# Column labels, in the same order as the fields of each row in data_all.
cols = [
    'nrooms', 'meters', 'floor', 'floors_total', 'flat_or_appart',
    "zh_comp", 'deadline', 'to_metro',
    'district', 'area', 'metro',
    'mainprice', 'price_per_meter', 'otdelka', 'developer',
]

df = pd.DataFrame(data_all, columns=cols)
print(df.shape)  # (rows, columns) sanity check
df

In [None]:
# Persist the scraped table as tab-separated UTF-8 text. No `index` argument is
# passed, so pandas' default applies and the row index is written as the first column.
df.to_csv('data_all.tsv', sep='\t', encoding='utf8')