In [10]:
import datetime
import re
import urllib
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

# Functions

In [11]:
def find_between_r(s, first, last):
    """Return the text of ``s`` between the last ``first`` and the last ``last``.

    The search for ``first`` runs from the right; ``last`` is then searched
    (also from the right) only in the part of ``s`` that follows ``first``.

    Returns an empty string when either marker cannot be found.
    """
    left = s.rfind(first)
    if left == -1:
        return ""
    left += len(first)

    right = s.rfind(last, left)
    if right == -1:
        return ""

    return s[left:right]

In [12]:
def _extract_ad_urls(soup):
    """Return the advertisement hrefs found in one parsed listing page.

    Only the first link of every ``<tbody>`` box inside the listing table is
    considered, and it is kept only when it starts with ``//auto.ru/``.
    An empty list is returned when the listing table is absent.
    """
    cars_table = soup.find('table', class_='listing-list listing listing-wrap__listing i-bem')
    if cars_table is None:
        return []

    ad_urls = []
    for box in cars_table.find_all('tbody'):
        link = box.find('a')
        if link is None:
            continue
        href = link.get('href')
        if href and href.startswith('//auto.ru/'):
            ad_urls.append(href)
    return ad_urls


def car_url_parser(url, ads_per_page=37):
    """Collect the links to all advertisements of one car model.

    Parameters
    ----------
    url : str
        First listing page of the model, e.g.
        'https://auto.ru/cars/kia/optima/all/?listing=listing'.
    ads_per_page : int, optional
        Number of advertisements per listing page, used to derive the number
        of pages from the total advertisement count shown on the first page.

    Returns
    -------
    list of str
        Unique protocol-relative advertisement URLs ('//auto.ru/...') in the
        order they were first seen.

    Raises
    ------
    requests.exceptions.HTTPError
        If the first listing page answers with an error status. Error
        statuses on the following pages only cause that page to be skipped.
    """
    # car name is taken from the URL itself
    carName = find_between_r(url, 'cars/', '/all').replace('/', ' ')

    print('Parsing advertisements pages: Start ' + carName.upper())

    first_page = requests.get(url)
    first_page.raise_for_status()
    soup = BeautifulSoup(first_page.content, "lxml")

    # total amount of cars; whitespace is dropped so that a count written
    # with a space as thousands separator still parses
    count_text = soup.find('span', class_="tabs__count").text
    carsN = int(float(re.sub(r'\s', '', count_text)))
    # number of listing pages: ceiling division, but always at least one
    pageN = max(1, -(-carsN // ads_per_page))

    cars_urls = _extract_ad_urls(soup)

    print('--Parsing advertising pages ' + \
    ' %.1f%% (%d/%d)' % (float(1) / float(pageN) * 100, 1, pageN))

    # the page number is appended as an extra query parameter
    separator = '&' if '?' in url else '?'
    for page in range(2, pageN + 1):
        url_temp = url + separator + "page_num_offers=%d" % page
        try:
            r = requests.get(url_temp)
            r.raise_for_status()
        except HTTPError:
            # skip pages the server refuses to deliver
            continue

        # the body that was just downloaded is parsed directly
        soup = BeautifulSoup(r.content, "lxml")
        cars_urls.extend(_extract_ad_urls(soup))
        print('--Parsing advertising pages '  + \
        ' %.1f%% (%d/%d)' % (float(page) / float(pageN) * 100, page , pageN))

    # remove duplicates while keeping the order of first appearance
    cars_urls = list(dict.fromkeys(cars_urls))
    print('Parsing advertisements pages: End '+ carName.upper())
    return cars_urls

In [13]:
def _parse_car_ad(soup, url, parsing_date):
    """Extract the fields of one parsed advertisement page into a dict.

    Any missing page element makes the corresponding lookup raise
    (AttributeError, IndexError, ValueError, ...); the caller treats that as
    a failed advertisement.
    """
    car = {}

    car['Parsing date'] = parsing_date
    car['URL'] = url
    # the first 8 characters of the title attribute are dropped
    car['Brand'] = soup.find('h1', class_='card__title').get('title')[8:]

    # upper head of the advertisement: id, date added, view counter
    head = [item.get_text() for item in soup.find_all('li', class_='card__stat-item')]
    car['ID'] = re.sub('[^0-9]', '', head[0])
    car['Add date'] = head[1][10:]
    car['Views'] = re.sub('[^0-9]', '', head[2])

    price = soup.find('h4', class_='card__price-rur').get_text()
    car['Price_rub'] = re.sub('[^0-9]', '', price)

    seller = soup.find('dd', class_='card__contact-value').get_text()
    # keep only the text in front of the 'Написать продавцу' caption
    car['Seller'] = seller[:seller.index('Написать продавцу')]
    car['Seller_location'] = soup.find('div', class_='sale-location__title').get_text()

    # left table: direct child tags alternate name, value, name, value, ...
    # Only tags are taken so that whitespace text nodes between them cannot
    # shift the alternation.
    cells = soup.find('dl', class_='card__info').find_all(True, recursive=False)
    names = [cell.get_text() for cell in cells[0::2]]
    values = [cell.get_text() for cell in cells[1::2]]
    car.update(zip(names, values))

    # bottom table: seller comment and equipment list are optional
    comment = soup.find('div', class_='seller-details__text')
    car['comment'] = '' if comment is None else comment.get_text()

    if soup.find('div', class_='card__package') is None:
        car['complectation'] = ''
    else:
        car['complectation'] = [item.get_text()
                                for item in soup.find_all('li', class_='card__package-item')]

    return car


def car_parser(url_list, base_url, log_save_path):
    """Download and parse every advertisement of one car model.

    Parameters
    ----------
    url_list : list of str
        Protocol-relative links ('//auto.ru/...') to the advertisement pages.
    base_url : str
        First listing page of the model; only used to derive the car name
        for progress messages and for the log file name.
    log_save_path : str
        Directory prefix (including the trailing separator) where the error
        log 'errors log <date> <car name>.txt' is written.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed advertisement. Advertisements that
        could not be downloaded or parsed are listed in the error log.
    """
    carName = find_between_r(base_url, 'cars/', '/all').replace('/', ' ')
    print('---Parsing advertisements Start: ' + carName.upper())
    # list that accumulates the data of all cars
    cars = []
    now = str(datetime.datetime.now())[:10]

    problem_pages = []
    total = len(url_list)
    # loop over the advertisement pages
    for i, ad_url in enumerate(url_list, start=1):
        url = 'https:' + ad_url
        if i % 10 == 0:
            print('-----Parsing advertisements: ' + carName.upper() +
                  ' %.1f%% (%d/%d)' % (float(i) / float(total) * 100, i, total))
        try:
            # the response is closed as soon as the page has been parsed
            with urllib.request.urlopen(url) as content:
                soup = BeautifulSoup(content, "lxml")
            cars.append(_parse_car_ad(soup, url, now))
        except Exception:
            # best effort: remember the page and go on; KeyboardInterrupt and
            # SystemExit are deliberately not caught so a run can be stopped
            print('Error with advertisement: ' + url)
            problem_pages.append(url)

    # write errors log file
    with open(log_save_path + 'errors log '+ now + ' ' + carName + '.txt', 'w') as f:
        for page in problem_pages:
            f.write(page + '\n')

    print('---Parsing advertisements End: ' + carName.upper())
    return pd.DataFrame(cars)

In [14]:
def main_parser(start_url_list, save_directory):
    """Scrape every car model in ``start_url_list`` and pickle the results.

    For each start URL the advertisement links are collected with
    ``car_url_parser``, the advertisements are parsed with ``car_parser`` and
    the resulting DataFrame is pickled to
    ``save_directory + '<brand>_<model>_<YYYY_MM_DD>.pickle'``.
    Timing statistics are printed after every model.

    Parameters
    ----------
    start_url_list : list of str
        First listing page of every model, e.g.
        ['https://auto.ru/cars/kia/optima/all/?listing=listing'].
    save_directory : str
        Directory prefix (including the trailing separator) for the pickle
        files; it is also passed to ``car_parser`` as the error log location.
    """
    for url in start_url_list:
        startTime = datetime.datetime.now()
        # collect the links to the advertisement pages
        url_list = car_url_parser(url)
        # parse the advertisements behind the links
        startTime1 = datetime.datetime.now()
        df_cars = car_parser(url_list, url, save_directory)

        # save to file; spaces, colons and dashes all end up as underscores
        now = datetime.datetime.now()
        carName = find_between_r(url,'cars/','/all').replace('/','_')
        carName = carName + '_' + str(now)[:10] + '.pickle'
        carName = carName.replace(' ','-').replace(':','_').replace('-','_')
        path = save_directory + carName

        print('--Saving File to ' + path)
        df_cars.to_pickle(path)
        print('Done')
        print('===')
        print('===')

        # statistics
        TimeElapsed1 = datetime.datetime.now() - startTime1
        TimeElapsed = datetime.datetime.now() - startTime
        parsed_count = len(df_cars)
        print('Full parsing time: ' + str(TimeElapsed))
        print('Total number of advertisements parsed: ' + str(parsed_count))
        if parsed_count > 0:
            print('Average parsing time per advertisement: ' + str(TimeElapsed1 / parsed_count))
        else:
            # no advertisement was parsed, so there is no average to report
            print('Average parsing time per advertisement: n/a')
        print('===')
        print('===')

# Parsing D-class automobiles

In [15]:
# First listing page of every D-class model to scrape; each entry below is
# the '<brand>/<model>' part of the auto.ru listing URL.
start_urls = [
    'https://auto.ru/cars/%s/all/?listing=listing' % brand_model
    for brand_model in (
        'kia/optima',
        'honda/accord',
        'hyundai/i40',
        'toyota/camry',
        'hyundai/sonata',
        'mazda/6',
        'bmw/5er',
        'audi/a6',
        'ford/mondeo',
        'infiniti/g35',
        'nissan/teana',
        'opel/insignia',
        'volvo/s60',
    )
]

# Output directory for the pickles and error logs (trailing slash required,
# file names are appended by plain string concatenation).
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
# Alternative location used previously: 'C:/Users/Vasily/Google Диск/analytics/auto_ru/data/'
save_dir = 'C:/Users/Vasiliy Poteriaev/Google Диск/analytics/auto_ru/data/D_class/'

In [16]:
# Run the scrape for every start URL; the pickled results and the error logs
# are written under save_dir.
main_parser(
    start_url_list=start_urls,
    save_directory=save_dir,
)

Parsing advertisements pages: Start KIA OPTIMA
--Parsing advertising pages  16.7% (1/6)
--Parsing advertising pages  33.3% (2/6)
--Parsing advertising pages  50.0% (3/6)
--Parsing advertising pages  66.7% (4/6)
--Parsing advertising pages  83.3% (5/6)
--Parsing advertising pages  100.0% (6/6)
Parsing advertisements pages: End KIA OPTIMA
---Parsing advertisements Start: KIA OPTIMA
-----Parsing advertisements: KIA OPTIMA 4.7% (10/213)
Error with advertisement: https://auto.ru/cars/used/sale/1034979175-4c44/
-----Parsing advertisements: KIA OPTIMA 9.4% (20/213)
-----Parsing advertisements: KIA OPTIMA 14.1% (30/213)
-----Parsing advertisements: KIA OPTIMA 18.8% (40/213)
-----Parsing advertisements: KIA OPTIMA 23.5% (50/213)
-----Parsing advertisements: KIA OPTIMA 28.2% (60/213)
-----Parsing advertisements: KIA OPTIMA 32.9% (70/213)
Error with advertisement: https://auto.ru/cars/used/sale/1032916839-d733/
-----Parsing advertisements: KIA OPTIMA 37.6% (80/213)
-----Parsing advertisements: KI