In [1]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import time
import json
from joblib import Parallel, delayed
import re
import numpy as np

In [2]:
# Load the reference dataset from `test.csv` (path is relative to the working directory).
test = pd.read_csv('test.csv')

In [3]:
# Show the column labels of the reference dataset.
test.columns

Index(['bodyType', 'brand', 'car_url', 'color', 'complectation_dict',
       'description', 'engineDisplacement', 'enginePower', 'equipment_dict',
       'fuelType', 'image', 'mileage', 'modelDate', 'model_info', 'model_name',
       'name', 'numberOfDoors', 'parsing_unixtime', 'priceCurrency',
       'productionDate', 'sell_id', 'super_gen', 'vehicleConfiguration',
       'vehicleTransmission', 'vendor', 'Владельцы', 'Владение', 'ПТС',
       'Привод', 'Руль', 'Состояние', 'Таможня'],
      dtype='object')

In [4]:
# Show the distinct values of the `brand` column.
test['brand'].unique()

array(['SKODA', 'AUDI', 'HONDA', 'VOLVO', 'BMW', 'NISSAN', 'INFINITI',
       'MERCEDES', 'TOYOTA', 'LEXUS', 'VOLKSWAGEN', 'MITSUBISHI'],
      dtype=object)

In [5]:
# Lower-cased distinct brand names, kept in the order `unique()` returns them.
brands = [brand_name.lower() for brand_name in test['brand'].unique()]
print(brands)

['skoda', 'audi', 'honda', 'volvo', 'bmw', 'nissan', 'infiniti', 'mercedes', 'toyota', 'lexus', 'volkswagen', 'mitsubishi']


In [None]:
# Walk the listing pages of every brand and collect the advert links into a
# dict that maps brand -> list of advert URLs.
def _listing_links(url):
    '''Fetch one listing page and return the hrefs of its advert title links.'''
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    page = BeautifulSoup(res.text, 'html.parser')
    anchors = page.find_all('a', class_='Link ListingItemTitle__link')
    return [anchor.get('href') for anchor in anchors]


links = {}
for brand in brands:
    base_url = f'https://auto.ru/rossiya/cars/{brand}/all/'
    # The first page has no `page` query parameter.
    car_pages = _listing_links(base_url)
    for i in range(2, 100):
        # `{i}` has to be interpolated: the original URL ended in the literal
        # text "?page=i", so every iteration requested the same address.
        car_pages.extend(_listing_links(f'{base_url}?page={i}'))
        # Pause between requests to avoid hammering the site.
        time.sleep(0.5)
    links[brand] = car_pages

In [None]:
def parse(link, timeout=30):
    '''Download one advert page and return it as a single-entry dict.

    Parameters
    ----------
    link : str
        URL of the advert page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block forever on a stalled connection.

    Returns
    -------
    dict
        `{link: html}` where `html` is the page re-serialised by
        BeautifulSoup, or the string '0' when the page could not be fetched
        (non-200 status, timeout or any other request error).
    '''
    # '0' is the failure sentinel the original code produced via `str(0)`.
    html = '0'
    try:
        res = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    except requests.RequestException:
        # A single unreachable advert must not abort the whole parallel batch.
        res = None
    if res is not None and res.status_code == 200:
        # Force UTF-8 decoding before reading `res.text`.
        res.encoding = 'utf-8'
        html = str(BeautifulSoup(res.text, 'html.parser'))
    # Pause between requests to avoid hammering the site.
    time.sleep(0.3)
    return {link: html}
    
# Apply parse(link) to every collected link (5 parallel workers per brand) and
# merge the downloaded HTML into a dict of the form {brand: {link: html}}.
brands_dict = {}
for key in links:
    downloaded = Parallel(n_jobs=5)(delayed(parse)(link) for link in links[key])
    # Create the per-brand dict before filling it; indexing a missing key
    # raised KeyError in the original, which also used an undefined `value`.
    brands_dict[key] = {}
    for page_dict in downloaded:
        brands_dict[key].update(page_dict)

# Keep the downloaded pages on disk as JSON.
brands_json = json.dumps(brands_dict)
with open("brands_json.json", "w") as file:
    file.write(brands_json)

In [6]:
def _json_field(page, find_pattern, strip_pattern, default=np.nan, cast=None):
    '''Return the first `find_pattern` match in `page` with `strip_pattern` removed.

    `default` is returned when nothing matches, or when `cast` is given and
    raises ValueError on the cleaned text.
    '''
    found = re.search(find_pattern, page)
    if found is None:
        return default
    value = re.sub(strip_pattern, '', found.group(0))
    if cast is None:
        return value
    try:
        return cast(value)
    except ValueError:
        # Unexpected text (e.g. a quoted or empty value) counts as missing
        # instead of aborting the whole dataset build.
        return default


def _quoted_field(page, key):
    '''Read `"key":"value",` and strip the key, the opening quote and the trailing `",`.'''
    return _json_field(page, rf'"{key}":[^,]*,', rf'"{key}":"|",$')


def _bare_field(page, key, default=np.nan, cast=None):
    '''Read `"key":value,` and strip the key and the trailing comma.'''
    return _json_field(page, rf'"{key}":[^,]*,', rf'"{key}":|,$', default, cast)


def _object_field(page, key):
    '''Read `"key":{...}` up to the first closing brace and strip the key.'''
    return _json_field(page, rf'"{key}":[^}}]*}}', rf'"{key}":')


def _card_row_text(page_card, soup, css_class, label_pattern):
    '''Return the text of the `<li>` with `css_class`, minus its label, or NaN if absent.'''
    if css_class not in page_card:
        return np.nan
    node = soup.find('li', class_=css_class)
    if node is None:
        return np.nan
    return re.sub(label_pattern, '', node.text)


def row_create(page):
    '''Parse the HTML of one advert page into a Series that becomes a DataFrame row.

    Most fields are cut out of the page text with regular expressions; the
    remaining ones are read from elements inside the `PageCard` block.

    Parameters
    ----------
    page : str
        HTML of an advert page.

    Returns
    -------
    pandas.Series
        One entry per car attribute. Missing attributes are NaN, except
        `owners` and `price`, which default to 0.
    '''
    # Same explicit parser as the download cells, so parsing does not depend on
    # which optional parser libraries are installed.
    page_bs4 = BeautifulSoup(page, 'html.parser')
    # Serialised card block, used for cheap "is this row present" checks.
    page_card = str(page_bs4.find('div', class_='PageCard'))

    car = {}
    car['body_type'] = _quoted_field(page, 'body_type')
    car['brand'] = _quoted_field(page, 'mark')
    car['color'] = _quoted_field(page, 'color')
    car['complectation_dict'] = _object_field(page, 'complectation')
    car['description'] = _json_field(
        page, r'"description":"[^"]*"', r'"description":"|"$')
    car['engine_displacement'] = _quoted_field(page, 'engineDisplacement')
    car['engine_power'] = _quoted_field(page, 'enginePower')
    car['equipment_dict'] = _object_field(page, 'equipment')
    car['fuel_type'] = _quoted_field(page, 'engine_type')
    car['mileage'] = _bare_field(page, 'mileage')
    car['model_info'] = _object_field(page, 'model_info')
    car['model_name'] = _quoted_field(page, 'model')
    car['number_of_doors'] = _bare_field(page, 'doors_count')
    car['production_date'] = _bare_field(page, 'productionDate', cast=int)

    # `super_gen` loses its closing brace and `tech_param` its opening one, so
    # joining them with a comma merges the two objects into a single one.
    super_gen = _json_field(page, r'"super_gen":[^}]*}', r'"super_gen":|}$')
    tech_param = _json_field(page, r'"tech_param":[^}]*}', r'"tech_param":{')
    if isinstance(super_gen, str) and isinstance(tech_param, str):
        super_gen += ',' + tech_param
    car['super_gen'] = super_gen

    car['vehicle_transmission'] = _quoted_field(page, 'transmission')
    car['vendor'] = _quoted_field(page, 'vendor')
    car['owners'] = _bare_field(page, 'owners_number', default=0, cast=int)
    car['tenure'] = _card_row_text(
        page_card, page_bs4, 'CardInfoRow CardInfoRow_owningTime', r'[Вв]ладение')
    # The value may be followed by closing braces before the comma.
    car['vehicle_passport'] = _json_field(page, r'"pts":[^,]*,', r'"pts":"|"}*,$')
    car['wheel_drive'] = _card_row_text(
        page_card, page_bs4, 'CardInfoRow CardInfoRow_drive', r'[Пп]ривод')
    car['wheel'] = _quoted_field(page, 'steering_wheel')
    car['condition'] = _card_row_text(
        page_card, page_bs4, 'CardInfoRow CardInfoRow_state', r'[Сс]остояние')
    car['customs'] = _card_row_text(
        page_card, page_bs4, 'CardInfoRow CardInfoRow_customs', r'[Тт]аможня')

    # Price: keep only the digits of the caption; 0 when it is absent or has none.
    price = 0
    if 'OfferPriceCaption__price' in page_card:
        price_node = page_bs4.find('span', class_='OfferPriceCaption__price')
        digits = re.sub(r'\D', '', price_node.text) if price_node is not None else ''
        if digits:
            price = int(digits)
    car['price'] = price

    return pd.Series(car)

In [None]:
# Build the DataFrame of car data: one row per collected advert link.
# The rows are gathered first and the frame is created once, because
# `DataFrame.append` copied the whole frame on every call and no longer exists
# in pandas 2.0+.
rows = [
    row_create(brands_dict[key][elem])
    for key in links
    for elem in links[key]
]
data = pd.DataFrame(rows)

data.to_csv('cars.csv', index=False)