In [3]:
import requests
import json
import csv
from bs4 import BeautifulSoup
import traceback
import cloudscraper
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
def get_cars(
  make="BMW", 
  model="5 SERIES", 
  postcode="SW1A 0AA", 
  radius=1500, 
  min_year=1995, 
  max_year=1995, 
  include_writeoff="include", 
  max_attempts_per_page=1, 
  verbose=False,
  min_price= 0,
  max_price = 99999):

    # To bypass Cloudflare protection
    scraper = cloudscraper.create_scraper()

    # Basic variables
    results = []
    n_this_year_results = 0

    url = "https://www.autotrader.co.uk/results-car-search"

    keywords = {}
    keywords["mileage"] = ["miles"]
    keywords["BHP"] = ["BHP"]
    keywords["transmission"] = ["Automatic", "Manual"]
    keywords["fuel"] = [
      "Petrol", 
      "Diesel", 
      "Electric", 
      "Hybrid – Diesel/Electric Plug-in", 
      "Hybrid – Petrol/Electric", 
      "Hybrid – Petrol/Electric Plug-in"
    ]
    keywords["owners"] = ["owners"]
    keywords["body"] = [
      "Coupe", 
      "Convertible", 
      "Estate", 
      "Hatchback", 
      "MPV", 
      "Pickup", 
      "SUV", 
      "Saloon"
    ]
    keywords["ULEZ"] = ["ULEZ"]
    keywords["year"] = [" reg)"]
    keywords["engine"] = ["engine"]

    # Set up parameters for query to autotrader.co.uk
    params = {
        "sort": "relevance",
        "postcode": postcode,
        "price-from" : min_price,
        "price-to" : max_price,
        "radius": radius,
        "make": make,
        "model": model,
        "search-results-price-type": "total-price",
        "search-results-year": "select-year",
    }

    if (include_writeoff == "include"):
        params["writeoff-categories"] = "on"
    elif (include_writeoff == "exclude"):
        params["exclude-writeoff-categories"] = "on"
    elif (include_writeoff == "writeoff-only"):
        params["only-writeoff-categories"] = "on"
        
    year = min_year
    page = 1
    attempt = 1

    try:
        while year <= max_year:
            params["year-from"] = year
            params["year-to"] = year
            params["page"] = page

            r = scraper.get(url, params=params)
            if verbose:
                print("Year:     ", year)
                print("Page:     ", page)
                print("Response: ", r)

            try:
                if r.status_code != 200:   # If not successful (e.g. due to bot protection)
                    attempt = attempt + 1  # Log as an attempt
                    if attempt <= max_attempts_per_page:
                        if verbose:
                            print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                    else:
                        page = page + 1
                        attempt = 1
                        if verbose:
                            print("Exception. All attempts exhausted for this page. Skipping to next page #", page)

                else:

                    j = r.json()
                    s = BeautifulSoup(j["html"], features="html.parser")

                    articles = s.find_all("article", attrs={"data-standout-type":""})

                    # If no results or reached end of results...
                    if len(articles) == 0 or r.url[r.url.find("page=")+5:] != str(page):
                        if verbose:
                            print("Found total", n_this_year_results, "results for year", year, "across", page-1, "pages")
                            if year+1 <= max_year:
                                print("Moving on to year", year + 1)
                                print("---------------------------------")

                        # Increment year and reset relevant variables
                        type(year)
                        year = year + 1
                        page = 1
                        attempt = 1
                        n_this_year_results = 0
                    else:
                        for article in articles:
                            car = {}
                            car["name"] = article.find("h3", {"class": "product-card-details__title"}).text.strip()             
                            car["link"] = "https://www.autotrader.co.uk" + \
                                  article.find("a", {"class": "listing-fpa-link"})["href"][: article.find("a", {"class": "listing-fpa-link"})["href"] \
                                  .find("?")]
                            car["price"] = article.find("div", {"class": "product-card-pricing__price"}).text.strip()

                            seller_info = article.find("ul", {"class": "product-card-seller-info__specs"}).text.strip()
                            car["seller"] = " ".join(seller_info.split())

                            key_specs_bs_list = article.find("ul", {"class": "listing-key-specs"}).find_all("li")
                            
                            for key_spec_bs_li in key_specs_bs_list:

                                key_spec_bs = key_spec_bs_li.text

                                if any(keyword in key_spec_bs for keyword in keywords["mileage"]):
                                    car["mileage"] = int(key_spec_bs[:key_spec_bs.find(" miles")].replace(",",""))
                                elif any(keyword in key_spec_bs for keyword in keywords["BHP"]):
                                    car["BHP"] = int(key_spec_bs[:key_spec_bs.find("BHP")])
                                elif any(keyword in key_spec_bs for keyword in keywords["transmission"]):
                                    car["transmission"] = key_spec_bs
                                elif any(keyword in key_spec_bs for keyword in keywords["fuel"]):
                                    car["fuel"] = key_spec_bs
                                elif any(keyword in key_spec_bs for keyword in keywords["owners"]):
                                    car["owners"] = int(key_spec_bs[:key_spec_bs.find(" owners")])
                                elif any(keyword in key_spec_bs for keyword in keywords["body"]):
                                    car["body"] = key_spec_bs
                                elif any(keyword in key_spec_bs for keyword in keywords["ULEZ"]):
                                    car["ULEZ"] = key_spec_bs
                                elif any(keyword in key_spec_bs for keyword in keywords["year"]):
                                    car["year"] = key_spec_bs
                                elif key_spec_bs[1] == "." and key_spec_bs[3] == "L":
                                    car["engine"] = key_spec_bs

                            results.append(car)
                            n_this_year_results = n_this_year_results + 1

                        page = page + 1
                        attempt = 1

                        if verbose:
                            print("Car count: ", len(results))
                            print("---------------------------------")

            except KeyboardInterrupt:
                break

            except:
                traceback.print_exc()
                attempt = attempt + 1
                if attempt <= max_attempts_per_page:
                    if verbose:
                        print("Exception. Starting attempt #", attempt, "and keeping at page #", page)
                else:
                    page = page + 1
                    attempt = 1
                    if verbose:
                        print("Exception. All attempts exhausted for this page. Skipping to next page #", page)

    except KeyboardInterrupt:
        pass

    return pd.DataFrame(results)

In [3]:
#### parameters needed #####
#location 1 and location 2 to compare
#radius for each 
#car price and comparison
#minimum and maximum purchase price 

In [5]:
#leave as blank to include all within that range 
#postcode as 10 
#loation 1 is the area to buy the car and 2 is to sell 

make_1="" 
model_1="" 
postcode_1="MK3 6JS" 
radius_1=30 
min_year_1=2005 
max_year_1=2010 
min_price_1= 0
max_price_1 = 10000

make_2="" 
model_2="" 
postcode_2="DT1 3GJ" 
radius_2=30 
min_year_2=2005 
max_year_2=2010 
min_price_2= 0
max_price_2 = 10000


location_1 = get_cars(make = make_1, 
                      model = model_1,
                      postcode = postcode_1,
                      radius = radius_1 ,
                      min_year = min_year_1 ,
                      max_year = max_year_1 ,
                      min_price = min_price_1,
                      max_price = max_price_1)

location_2 = get_cars(make = make_2, 
                      model = model_2,
                      postcode = postcode_2,
                      radius = radius_2 ,
                      min_year = min_year_2 ,
                      max_year = max_year_2 ,
                      min_price = min_price_2,
                      max_price = max_price_2)

location_1['year'] = location_1['year'].apply(lambda x: int(str(x).split('(')[0]))
location_2['year'] = location_2['year'].apply(lambda x: int(str(x).split('(')[0]))

location_2['year + 1'] = location_2['year'] + 1
location_1['year + 1'] = location_1['year'] + 1

ValueError: invalid literal for int() with base 10: 'nan'

In [50]:
location_1['year'] = location_1['year'].dropna().apply(lambda x: int(str(x).split('(')[0]))
location_2['year'] = location_2['year'].dropna().apply(lambda x: int(str(x).split('(')[0]))

location_2['year + 1'] = location_2['year'] + 1
location_1['year + 1'] = location_1['year'] + 1
 

In [51]:
location_1['year']

0       2005.0
1       2005.0
2       2005.0
3       2005.0
4       2005.0
         ...  
3817    2010.0
3818    2010.0
3819    2010.0
3820    2010.0
3821    2010.0
Name: year, Length: 3822, dtype: float64

In [44]:
#give a take a year for model to model.
#minimal mileage differen between both.
#find the price difference 

In [95]:
df1 = location_1.merge(location_2, how = 'inner', left_on = ['name','year'] ,right_on = ['name','year'],suffixes = ('_1','_2'))
df1['year_1'] = df1['year']
df1['year_2'] = df1['year']
df2 = location_1.merge(location_2 ,how = 'inner', left_on = ['name','year'] ,right_on = ['name','year + 1'],suffixes = ('_1','_2'))
df3 = location_1.merge(location_2 ,how = 'inner', right_on = ['name','year'] ,left_on = ['name','year + 1'],suffixes = ('_1','_2'))

finalcars_pre = pd.concat([df1,df2,df3],axis = 0)[['name','link_1','price_1','price_2','year_1','year_2','mileage_1','mileage_2','body_1','body_2','engine_1','engine_2','body_1','body_2']]
finalcars = finalcars_pre[]


In [96]:
finalcars[finalcars['name'] == 'Renault Kangoo']

Unnamed: 0,name,link_1,price_1,price_2,year_1,year_2,mileage_1,mileage_2,body_1,body_2,engine_1,engine_2,body_1.1,body_2.1
0,Renault Kangoo,https://www.autotrader.co.uk/car-details/20221...,£990,£990,2005.0,2005.0,94000,94000.0,MPV,MPV,1.6L,1.6L,MPV,MPV
1,Renault Kangoo,https://www.autotrader.co.uk/car-details/20221...,£990,"£2,490",2005.0,2005.0,94000,73000.0,MPV,MPV,1.6L,1.5L,MPV,MPV
2,Renault Kangoo,https://www.autotrader.co.uk/car-details/20230...,"£2,490",£990,2005.0,2005.0,73000,94000.0,MPV,MPV,1.5L,1.6L,MPV,MPV
3,Renault Kangoo,https://www.autotrader.co.uk/car-details/20230...,"£2,490","£2,490",2005.0,2005.0,73000,73000.0,MPV,MPV,1.5L,1.5L,MPV,MPV
4,Renault Kangoo,https://www.autotrader.co.uk/car-details/20230...,"£2,795",£990,2005.0,2005.0,112350,94000.0,MPV,MPV,1.9L,1.6L,MPV,MPV
5,Renault Kangoo,https://www.autotrader.co.uk/car-details/20230...,"£2,795","£2,490",2005.0,2005.0,112350,73000.0,MPV,MPV,1.9L,1.5L,MPV,MPV


In [94]:
location_2

Unnamed: 0,name,link,price,seller,year,body,mileage,engine,BHP,transmission,fuel,owners,cyear,year + 1
0,Saab 9-3,https://www.autotrader.co.uk/car-details/20230...,"£1,495",4.5 (34 reviews) bournemouth (29 miles),2005,Convertible,93000.0,2.0L,150.0,Manual,Petrol,4.0,2005,2006
1,Renault Kangoo,https://www.autotrader.co.uk/car-details/20221...,£990,4.6 (317 reviews),2005,MPV,94000.0,1.6L,95.0,Automatic,Petrol,,2005,2006
2,Hyundai Coupe,https://www.autotrader.co.uk/car-details/20230...,"£2,490",4.6 (317 reviews),2005,Coupe,75000.0,2.0L,141.0,Manual,Petrol,,2005,2006
3,Honda Jazz,https://www.autotrader.co.uk/car-details/20230...,"£2,500",4.9 (259 reviews),2005,Hatchback,60704.0,1.2L,75.0,Manual,Petrol,2.0,2005,2006
4,Mazda3,https://www.autotrader.co.uk/car-details/20230...,"£3,495",4.6 (36 reviews) ringwood (28 miles),2005,Hatchback,40000.0,1.6L,103.0,Manual,Petrol,,2005,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Toyota RAV4,https://www.autotrader.co.uk/car-details/20230...,"£6,695",4.8 (179 reviews) sherborne (18 miles),2010,SUV,86487.0,2.2L,,Manual,Diesel,6.0,2010,2011
610,Volkswagen Polo,https://www.autotrader.co.uk/car-details/20230...,"£6,400",beaminster (16 miles),2010,Hatchback,47435.0,1.2L,,Manual,Diesel,2.0,2010,2011
611,Mercedes-Benz A Class,https://www.autotrader.co.uk/car-details/20230...,"£8,750",4.9 (75 reviews) ringwood (29 miles),2010,Hatchback,20000.0,2.0L,82.0,Automatic,Diesel,,2010,2011
612,KIA Sportage,https://www.autotrader.co.uk/car-details/20230...,"£7,495",3.8 (95 reviews) christchurch (29 miles),2010,SUV,57000.0,2.0L,,Manual,Diesel,,2010,2011
