In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import *
import json
import re
import os

In [2]:
base = "https://www.cars.com"

# Collect the detail-page URL of every listing on the first 100 search-result
# pages. The crawl is slow, so the result is cached in links.json and reused
# on later runs (delete the file to force a fresh crawl).
if os.path.exists("links.json"):
    with open("links.json", "r") as f:
        links = json.load(f)
else:
    links = []
    # NOTE(review): the page parameter starts at 0 here; if the site numbers
    # result pages from 1, page 0 may duplicate page 1 -- confirm.
    for i in trange(100):
        url = f"{base}/shopping/results/?page={i}&page_size=100&list_price_max=&makes[]=&maximum_distance=all&models[]=&stock_type=all&zip=48113"
        # Without a timeout a single stalled connection blocks the crawl
        # indefinitely; with one, the request raises and the cell stops.
        response = requests.get(url, timeout=30)
        # Name the parser explicitly so parsing does not depend on which
        # optional parser happens to be installed on this machine.
        # NOTE(review): if this notebook was developed with lxml installed,
        # pass "lxml" here instead to keep the exact same parse tree.
        cars_raw = BeautifulSoup(response.text, "html.parser")
        cards = cars_raw.find_all("div", {"class": "vehicle-card"})
        # The element right after each card's opening tag is read as the
        # link to the listing; its href is site-relative, so prefix the host.
        links += [base+c.find_next()["href"] for c in cards]
    # Persist the crawl so the cache branch above can actually be taken next
    # time. Written only after the whole crawl succeeded, so a partial crawl
    # is never mistaken for a complete one.
    with open("links.json", "w") as f:
        json.dump(links, f, indent = 2)

In [4]:
infos = []      # successfully parsed listings
fails = []      # {'link', 'reason'} records for listings that raised
success = 0     # number of entries appended to infos
total = 0       # number of links attempted so far (including failures)
features_enum = ['Convenience', 'Entertainment', 'Exterior', 'Safety', 'Seating']


def _remove_space(text):
    """Return `text` with leading and trailing whitespace removed."""
    return re.sub(r"^\s+|\s+$",  "", text)


def _digits_to_int(text):
    """Join every run of digits in `text` and parse it as one int.

    Raises ValueError when `text` contains no digits at all.
    """
    return int("".join(re.findall(r"[0-9]+", text)))


def _save_progress():
    """Write the current `infos` and `fails` lists to their JSON files."""
    with open("info.json", "w") as out:
        json.dump(infos, out, indent = 2)
    with open("fails.json", "w") as out:
        json.dump(fails, out, indent = 2)


def _parse_listing(page):
    """Build the info record for one parsed listing page.

    Returns None when the page contains an `sds-notification__desc`
    paragraph, which this notebook treats as "listing not available".
    Raises IndexError / ValueError when an expected element is missing or a
    numeric field has no digits; the caller records those as failures.
    """
    if len(page.find_all("p", {"class": "sds-notification__desc"})) > 0:
        return None

    description_lists = page.find_all("dl", {"class": "fancy-description-list"})

    # NOTE(review): fields are picked by position among the <dd> elements that
    # follow the first description list. If a listing omits one row the later
    # indices shift and wrong values are stored silently -- consider matching
    # on the <dt> labels instead once the page layout is confirmed.
    info_list = description_lists[0].find_all_next("dd")
    basics = {
        'exterior_color': _remove_space(info_list[0].text),
        'interior_color': _remove_space(info_list[1].text),
        'drive_train': _remove_space(info_list[2].text),
        # The value is read from the third element after this <dd>.
        'mpg': _remove_space(info_list[3].find_next().find_next().find_next().text),
        'fuel_type': _remove_space(info_list[4].text),
        'transmission': _remove_space(info_list[5].text),
        'engine': _remove_space(info_list[6].text),
        'mileage': _digits_to_int(info_list[9].text)
    }

    # The second description list holds the feature categories: each <dt> is
    # a category name and the <dd> at the same position holds its entries.
    features_name = [x.text for x in description_lists[1].find_all_next("dt")]
    features_list = description_lists[1].find_all_next("dd")
    features_by_category = {k: [] for k in features_enum}
    for category in features_enum:
        if category in features_name:
            entries = features_list[features_name.index(category)].find_next().find_all()
            features_by_category[category] = [x.text for x in entries]

    convenience = features_by_category['Convenience']
    entertainment = features_by_category['Entertainment']
    safety = features_by_category['Safety']
    features = {
        'heated_seats': 'Heated Seats' in convenience,
        'heated_steering_wheel': 'Heated Steering Wheel' in convenience,
        'nav_sys': 'Navigation System' in convenience,
        'remote_start': 'Remote Start' in convenience,
        'carplay': 'Apple CarPlay/Android Auto' in entertainment,
        'bluetooth': 'Bluetooth' in entertainment,
        'brake_assist': 'Brake Assist' in safety,
        'blind_spot_monitor': 'Blind Spot Monitor' in safety
    }

    return {
        'name': page.find_all("h1", {'class': 'listing-title'})[0].text,
        'price': _digits_to_int(page.find_all("span", {'class': 'primary-price'})[0].text),
        'basics': basics,
        'features': features
    }


for link in tqdm(links):
    # Checkpoint every 1500 attempted links so a crash does not lose hours of
    # scraping.
    if total % 1500 == 0:
        _save_progress()
        print(f"Current {success}/{total}")
    # Count the attempt before anything can raise, so failed requests are
    # included in the total and the checkpoint cadence stays aligned.
    total += 1
    try:
        # The timeout turns a stalled connection into a recorded failure
        # instead of blocking the loop indefinitely.
        response = requests.get(link, timeout=30)
        # NOTE(review): parser pinned for reproducibility; switch to "lxml"
        # if that is what the notebook was originally run with.
        car_info_raw = BeautifulSoup(response.text, "html.parser")
        info = _parse_listing(car_info_raw)
        if info is None:
            # Listing flagged as unavailable: skipped, not counted as a failure.
            continue
        infos.append(info)
        success += 1
    except Exception as e:
        # Deliberate best-effort: one broken page must not stop the crawl.
        # The link and reason are kept so it can be inspected or retried.
        print(e)
        fails.append(
            {
                'link': link,
                'reason': f"{e}"
            }
        )
        continue

# Final save. Both files are written so failures recorded after the last
# checkpoint are not lost.
_save_progress()

  0%|          | 0/9905 [00:00<?, ?it/s]

Current 0/0


 14%|█▍        | 1411/9905 [31:16<3:11:43,  1.35s/it]

list index out of range


 15%|█▌        | 1500/9905 [33:29<3:44:37,  1.60s/it]

Current 1384/1500


 15%|█▌        | 1529/9905 [34:07<3:02:05,  1.30s/it]

list index out of range


 23%|██▎       | 2233/9905 [49:15<2:16:36,  1.07s/it]

list index out of range


 23%|██▎       | 2293/9905 [50:28<3:13:25,  1.52s/it]

list index out of range


 27%|██▋       | 2649/9905 [58:08<3:26:46,  1.71s/it]

list index out of range


 29%|██▉       | 2912/9905 [1:03:47<1:21:45,  1.43it/s]

list index out of range


 29%|██▉       | 2921/9905 [1:03:57<2:17:46,  1.18s/it]

list index out of range


 30%|███       | 2974/9905 [1:05:08<2:20:17,  1.21s/it]

list index out of range


 30%|███       | 3000/9905 [1:05:36<2:13:51,  1.16s/it]

Current 2769/3000


 31%|███▏      | 3111/9905 [1:07:59<2:32:20,  1.35s/it]

list index out of range


 45%|████▌     | 4500/9905 [1:36:39<2:03:43,  1.37s/it]

Current 4177/4500


 52%|█████▏    | 5135/9905 [1:50:29<11:48:41,  8.91s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 52%|█████▏    | 5195/9905 [1:51:49<1:43:29,  1.32s/it]

list index out of range


 54%|█████▎    | 5313/9905 [1:54:23<1:11:42,  1.07it/s]

list index out of range


 61%|██████    | 6001/9905 [2:09:09<1:15:40,  1.16s/it]

Current 5584/6000


 62%|██████▏   | 6111/9905 [2:11:32<1:15:19,  1.19s/it]

list index out of range


 76%|███████▌  | 7501/9905 [2:40:32<1:07:10,  1.68s/it]

Current 6977/7500


 86%|████████▌ | 8529/9905 [3:01:06<36:23,  1.59s/it]

list index out of range


 87%|████████▋ | 8584/9905 [3:02:32<25:18,  1.15s/it]

invalid literal for int() with base 10: ''


 88%|████████▊ | 8668/9905 [3:04:40<33:29,  1.62s/it]

list index out of range


 90%|█████████ | 8917/9905 [3:09:55<21:34,  1.31s/it]

list index out of range


 91%|█████████ | 9001/9905 [3:12:26<21:12,  1.41s/it]

Current 8369/9000


 93%|█████████▎| 9213/9905 [3:16:51<15:10,  1.32s/it]

list index out of range


 94%|█████████▍| 9311/9905 [3:18:55<07:16,  1.36it/s]

list index out of range


100%|██████████| 9905/9905 [3:31:40<00:00,  1.28s/it]
