# ***This notebook scrapes used-car listings from auto.ru to build the training data set***



In [None]:
# import dependencies
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json
import time
import math
import csv

In [None]:
def list_to_dict(data_list):
    """
    Converts a list of "key: value" strings to a dictionary.

    Every item is split on its first colon and surrounding whitespace is
    removed from both the key and the value. Items that contain no colon are
    skipped. Collection stops at the first key that repeats, so the result
    holds exactly the entries that precede the first duplicate key.

    :param data_list: list of str
    :return: dict mapping str keys to str values
    """
    result = {}
    for item in data_list:
        key, separator, val = item.partition(":")
        if not separator:
            # Not a "key: value" entry; skip it rather than raising and
            # losing every other entry of the same list.
            continue
        key = key.strip()
        if key in result:
            # A repeated key marks the start of another record; only the
            # first record is returned.
            break
        result[key] = val.strip()
    return result


def get_pages_for_brand(marka):
    """
    Calculates the amount of listing pages per brand.

    Downloads the first page of the auto.ru Moscow used-car listing for the
    brand, reads the total number of adverts from the
    'ButtonWithLoader__content' element and divides it by the number of
    'ListingItem' elements found on that page.

    :param marka: str, brand name as it is used in the auto.ru URL
    :return: int, number of pages; 0 when the first page shows no listings,
             1 when listings are shown but no total counter can be read
    """
    url = f'https://auto.ru/moskva/cars/{marka}/used/'
    # requests waits indefinitely unless a timeout is given
    response = requests.get(url, timeout=30)
    page = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    ad_per_page = len(page.find_all('div', class_='ListingItem'))
    if ad_per_page == 0:
        # nothing listed for this brand; also avoids dividing by zero below
        return 0

    counter = page.find('span', class_='ButtonWithLoader__content')
    no_ad = re.sub("[^0-9.]", "", counter.text) if counter is not None else ""
    if not no_ad:
        # Listings are present but the total is not shown, so only the page
        # just fetched is known to exist.
        # NOTE(review): assumes the counter is absent for single-page
        # results -- confirm against the live site.
        return 1
    return math.ceil(int(no_ad) / ad_per_page)

In [None]:
# read the test set and collect the distinct brands it contains
test_df = pd.read_csv("test.csv")
brands = test_df["brand"].unique()

In [None]:
# Column order of the per-brand CSV files; every scraped record uses exactly
# these keys.
CSV_FIELDS = [
    'bodyType', 'brand', 'car_url', 'color', 'complectation_dict',
    'description', 'engineDisplacement', 'enginePower', 'equipment_dict',
    'fuelType', 'image', 'mileage', 'modelDate', 'model_info', 'model_name',
    'name', 'numberOfDoors', 'parsing_unixtime', 'priceCurrency',
    'productionDate', 'sell_id', 'super_gen', 'vehicleConfiguration',
    'vehicleTransmission', 'vendor', 'Владельцы', 'Владение', 'ПТС',
    'Привод', 'Руль', 'Состояние', 'Таможня', 'Region', 'Taxes', 'Views',
    'Price',
]

# Seconds to wait for auto.ru before giving up; requests has no default limit.
REQUEST_TIMEOUT = 30


def _dig(obj, *keys):
    """
    Follows a chain of keys/indexes through nested containers.
    :param obj: dict, list or None
    :param keys: keys (or indexes) to follow, outermost first
    :return: the nested value, or None as soon as one step cannot be taken
    """
    for key in keys:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return None
    return obj


def _build_car_record(brand, car_url, car_page, car_info_dict):
    """
    Collects all CSV fields of one advert; a field that cannot be found is None.
    :param brand: str, brand currently being scraped
    :param car_url: str, URL of the advert
    :param car_page: BeautifulSoup, parsed advert page
    :param car_info_dict: dict, entries of the 'CardInfo' element of the page
    :return: dict with the keys listed in CSV_FIELDS
    """
    description_tag = car_page.find('div', class_='CardDescription__text')
    description = description_tag.text if description_tag is not None else None

    # JSON document embedded in the page
    try:
        state_tag = car_page.find('script', type="application/json", id='initial-state')
        json_car = json.loads(state_tag.string)
    except (AttributeError, TypeError, ValueError):
        # script tag missing, empty, or not valid JSON
        json_car = None

    card = _dig(json_car, 'card')
    vehicle_info = _dig(card, 'vehicle_info')
    model_info = _dig(vehicle_info, 'model_info')
    super_gen = _dig(vehicle_info, 'tech_param')

    # price and currency are only kept together: if either is missing both are None
    price_info = _dig(card, 'price_info')
    try:
        price_text = price_info['price']
        price_currency = price_info['currency']
    except (KeyError, TypeError):
        price_text = price_currency = None

    # 'Двигатель' holds up to three '/'-separated parts:
    # displacement / power / fuel type
    engine = car_info_dict.get('Двигатель')
    engine_parts = engine.split('/') if engine is not None else []

    def engine_part(index, strip_pattern):
        """Return engine part `index` with `strip_pattern` removed, or None if absent."""
        if index >= len(engine_parts):
            return None
        return re.sub(strip_pattern, "", engine_parts[index])

    mileage_raw = car_info_dict.get('Пробег')
    mileage = re.sub("[^0-9]", "", mileage_raw) if mileage_raw is not None else None

    return {'bodyType': car_info_dict.get('Кузов'),
            'brand': brand,
            'car_url': car_url,
            'color': car_info_dict.get('Цвет'),
            'complectation_dict': _dig(vehicle_info, 'complectation'),
            'description': description,
            'engineDisplacement': engine_part(0, "[^0-9.]"),
            'enginePower': engine_part(1, "[^0-9]"),
            'equipment_dict': _dig(vehicle_info, 'equipment'),
            'fuelType': engine_part(2, "[^а-яА-Я]"),
            'image': None,
            'mileage': mileage,
            'modelDate': _dig(vehicle_info, 'super_gen', 'year_from'),
            'model_info': model_info,
            # model_info may be None, so the code is looked up safely
            'model_name': _dig(model_info, 'code'),
            'name': _dig(super_gen, 'human_name'),
            'numberOfDoors': _dig(vehicle_info, 'configuration', 'doors_count'),
            'parsing_unixtime': int(time.time()),
            'priceCurrency': price_currency,
            'productionDate': car_info_dict.get('год выпуска'),
            'sell_id': _dig(card, 'id'),
            'super_gen': super_gen,
            'vehicleConfiguration': _dig(vehicle_info, 'configuration', 'body_type'),
            'vehicleTransmission': _dig(super_gen, 'transmission'),
            'vendor': _dig(vehicle_info, 'vendor'),
            'Владельцы': car_info_dict.get('Владельцы'),
            'Владение': car_info_dict.get('Владение'),
            'ПТС': car_info_dict.get('ПТС'),
            'Привод': car_info_dict.get('Привод'),
            'Руль': car_info_dict.get('Руль'),
            'Состояние': car_info_dict.get('Состояние'),
            'Таможня': car_info_dict.get('Таможня'),
            'Region': _dig(card, 'seller', 'location', 'address'),
            'Taxes': _dig(card, 'owner_expenses', 'transport_tax', 'tax_by_year'),
            'Views': _dig(card, 'counters', 'all'),
            'Price': price_text}


# loop over each brand
for brand in brands:
    list_all_info = []
    print(brand)
    amnt_pages = get_pages_for_brand(brand)

    for i in range(1, amnt_pages + 1):
        print(f"This is page number {i}")
        url = f'https://auto.ru/moskva/cars/{brand}/used/?page={i}'

        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        page = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
        cars_url_list = page.find_all('a', class_='Link ListingItemTitle__link')

        for ind_adv in cars_url_list:
            ind_url = ind_adv['href']
            car_page = None
            try:
                car_response = requests.get(ind_url, timeout=REQUEST_TIMEOUT)
                # pause between advert requests
                time.sleep(0.5)
                car_page = BeautifulSoup(car_response.content.decode('utf-8'), 'html.parser')
                car_info_list = [child.get_text(': ').replace('\xa0', ' ') for child in
                                 car_page.find(class_='CardInfo').children]
                car_info_dict = list_to_dict(car_info_list)
            except Exception as err:
                # Best effort: one broken advert must not stop the scrape.
                # `Exception` (unlike a bare except) still lets Ctrl-C through.
                print(f"error in car with url {ind_url}: {err!r}")
                print("car page " + str(car_page))
                continue

            list_all_info.append(_build_car_record(brand, ind_url, car_page, car_info_dict))

    # save data to csv per brand; a brand without any record still gets a
    # header-only file so that every brand has a readable csv
    with open(f'{brand}.csv', 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, CSV_FIELDS)
        dict_writer.writeheader()
        dict_writer.writerows(list_all_info)

In [None]:
# combine the per-brand csv-files into a single table
df_list = [pd.read_csv(f'{brand_name}.csv') for brand_name in brands]

# keep only one row per advert url
df = pd.concat(df_list)
df_dropped = df.drop_duplicates(subset=['car_url'])

# write the de-duplicated table to a new csv => our train set
df_dropped.to_csv('merged.csv', sep=';')