In [1]:
import requests
from bs4 import BeautifulSoup
import time, os
import pandas as pd
import numpy as np

In [6]:
# header
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

In [3]:
def get_urls(page_url):
    '''
    Get all the cars urls on one page
    '''
    
    response = requests.get(page_url, headers=headers)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    number=len(soup.find_all('a', class_='vehicle-card-link js-gallery-click-link'))
    
    urls = []
    for i in range(number):
        url = soup.find_all('a', class_='vehicle-card-link js-gallery-click-link')[i]['href']
        urls.append(f'http://www.cars.com/{url}')
    return urls

In [4]:
def get_city_urls(page, zipcode):
    '''
    Get all cars urls for a city
    '''
    
    urls = []
    for i in range(page):
        base = 'https://www.cars.com/shopping/results/?list_price_max=&makes[]=&maximum_distance=50&models[]=&page={}&page_size=100&stock_type=used&zip={}'.format(i+1, zipcode)
        urls.append(get_urls(base))
        
    city_urls = []
    for sub in urls:
        for j in sub:
            city_urls.append(j)
    return city_urls

Los Angeles = 90012
San Diego = 92101
San Francisco = 94117
Portland = 97205
Seattle = 98101
Las Vegas = 89101
Phoenix = 85001
Dallas = 75201
Houston = 77002
San Antonio = 78205
New York = 10001
Washington, DC = 20500
Chicago = 60603
Miami = 33131
Tampa = 33602
orlando = 32805
Denver = 80202
Moreno Valley = 92555
Salt Lake City = 84111
Albuquerque = 87104
New Orleans = 70116
Memphis = 38104
Nashville = 37203
St Louis = 63102
Atlanta = 30313
Birmingham = 35204
Charlotte = 28207
Charleston = 29424
Pittsburgh = 15212
Detroit = 48226
Dayton = 45402
Richmond = 23220
Minneapolis = 55402
Norfolk = 23504
Colorado Springs = 80909
Philadelphia = 19106
Jacksonville = 32207
Oklahoma City = 73102
Milwaukee = 53201
Indianapoli = 46225
Cincinnati = 45202
Washington, DC = 20500
Green Bay = 54303
Jackson = 39217
Sacramento = 95816

In [37]:
i = get_city_urls(50, 95816)

In [38]:
i

['http://www.cars.com//vehicledetail/3aa57942-03f4-4702-ade8-6ffbb5a188ab/?results_page_number=1&search_instance_id=44191edd-f77d-4db3-a90b-531dff6a98d2&search_zipcode=95816&sponsored%3F=true&vertical_position=1',
 'http://www.cars.com//vehicledetail/34411d4f-45d0-40ac-91b1-3cc44125ebc7/?results_page_number=1&search_instance_id=44191edd-f77d-4db3-a90b-531dff6a98d2&search_zipcode=95816&sponsored%3F=false&vertical_position=2',
 'http://www.cars.com//vehicledetail/59322584-d674-42e0-b596-59102ecfec01/?results_page_number=1&search_instance_id=44191edd-f77d-4db3-a90b-531dff6a98d2&search_zipcode=95816&sponsored%3F=false&vertical_position=3',
 'http://www.cars.com//vehicledetail/833208a3-02a6-4587-b504-26bf6b038684/?results_page_number=1&search_instance_id=44191edd-f77d-4db3-a90b-531dff6a98d2&search_zipcode=95816&sponsored%3F=false&vertical_position=4',
 'http://www.cars.com//vehicledetail/27eeab52-4637-4407-ac16-1a5a68fdd77d/?results_page_number=1&search_instance_id=44191edd-f77d-4db3-a90b-5

In [39]:
len(i)

4947

In [40]:
def get_features(urls):
    '''
    Get car information, clean it, and compose in a dataframe
    '''
    
    headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}
    
    cars = {}
    
    i = 0
    for url in urls:
        response = requests.get(url, headers=headers)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        
        try:
            name = soup.find(class_='listing-title').text
        except (AttributeError, IndexError, ValueError) as e:
            name = np.nan
            
        try:
            price = int(''.join(soup.find(class_='primary-price').text.strip('$').split(',')))
        except (AttributeError, IndexError, ValueError) as e:
            price = np.nan
            
        try:
            mpg = (int(soup.find(class_='sds-tooltip').find('span').text.split('–')[0]) + int(soup.find(class_='sds-tooltip').find('span').text.split('–')[1]))/2
        except (AttributeError, IndexError, ValueError) as e:
            mpg = np.nan
        
        try:
            mi = int(''.join(soup.find(class_='listing-mileage').text.strip(' .mi').split(',')))
        except (AttributeError, IndexError, ValueError) as e:
            mi = np.nan
        
        try:
            dt = soup.find(class_='fancy-description-list').find_all('dd')[2].text.strip(' ')
        except (AttributeError, IndexError, ValueError) as e:
            dt = np.nan
            
        try:
            fuel = soup.find(class_='fancy-description-list').find_all('dd')[4].text.strip(' ')
        except (AttributeError, IndexError, ValueError) as e:
            fuel = np.nan
        
        try:
            trans = soup.find(class_='fancy-description-list').find_all('dd')[5].text
        except (AttributeError, IndexError, ValueError) as e:
            trans = np.nan
        
        try:
            engine = soup.find(class_='fancy-description-list').find_all('dd')[6].text
        except (AttributeError, IndexError, ValueError) as e:
            engine = np.nan
            
        try:
            ent = len(soup.find_all(class_='vehicle-features-list')[1].find_all('li'))
        except (AttributeError, IndexError, ValueError) as e:
            ent = np.nan
            
        try:
            safe = len(soup.find_all(class_='vehicle-features-list')[3].find_all('li'))
        except (AttributeError, IndexError, ValueError) as e:
            try:
                safe = len(soup.find_all(class_='vehicle-features-list')[2].find_all('li'))
            except (AttributeError, IndexError, ValueError) as e:
                safe = np.nan
        
        cars[name] = [price, mpg, mi, dt, fuel, trans, engine, ent, safe]
    
        cars_df = pd.DataFrame(cars).T
        cars_df.columns = ['price','mpg', 'mileage', 'drivetrain', 'fuel_type', 
                           'transmission', 'engine', 'num_of_entertainment',
                           'num_of_safety']
        i += 1
        if i % 100 == 0:
            print(i)
        
    return cars_df

In [29]:
df = get_features(i)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000


In [30]:
# Check data frame
df

Unnamed: 0,price,mpg,mileage,drivetrain,fuel_type,transmission,engine,num_of_entertainment,num_of_safety
2019 BMW Z4 sDrive30i,54590,28.5,10499,Rear-wheel Drive,Gasoline,8-Speed Automatic,2.0L I4 16V GDI DOHC Turbo,3,4
2019 Subaru Crosstrek 2.0i Premium,22300,30,5680,All-wheel Drive,Gasoline,Automatic CVT,2.0L H4 16V GDI DOHC,4,5
2019 Subaru Ascent Touring 7-Passenger,40469,23,28974,All-wheel Drive,Gasoline,Automatic CVT,2.4L H4 16V GDI DOHC Turbo,4,4
2020 Subaru Outback Limited XT,36600,26.5,27359,All-wheel Drive,Gasoline,Automatic CVT,2.4L H4 16V GDI DOHC Turbo,4,3
2021 Kia Stinger GT1,41999,12.5,12655,All-wheel Drive,Gasoline,Automatic,Twin Turbo Premium Unleaded V-6 3.3 L/204,3,7
...,...,...,...,...,...,...,...,...,...
2018 GMC Sierra 3500 SLE,49995,,47222,Four-wheel Drive,E85 Flex Fuel,6-Speed Automatic,6.0L V8 16V MPFI OHV Flexible Fuel,1,3
2011 Ford F-150 Lariat,17995,16.5,154272,Four-wheel Drive,Gasoline,6-Speed Automatic,3.5L V6 24V GDI DOHC Twin Turbo,2,2
2016 RAM 2500 Tradesman,30995,,192063,Four-wheel Drive,Diesel,6-Speed Automatic,6.7L I6 24V DDI OHV Turbo Diesel,1,2
2005 Dodge Ram 2500 SLT Quad Cab,29995,,154892,Four-wheel Drive,Diesel,Automatic,5.9L I6 24V DDI OHV Turbo Diesel,,


In [31]:
# Check duplicates 
df[df.duplicated()]

Unnamed: 0,price,mpg,mileage,drivetrain,fuel_type,transmission,engine,num_of_entertainment,num_of_safety


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2874 entries, 2019 BMW Z4 sDrive30i to 2018 MINI Countryman John Cooper Works ALL4
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   price                 2874 non-null   object
 1   mpg                   2593 non-null   object
 2   mileage               2874 non-null   object
 3   drivetrain            2874 non-null   object
 4   fuel_type             2874 non-null   object
 5   transmission          2874 non-null   object
 6   engine                2874 non-null   object
 7   num_of_entertainment  2766 non-null   object
 8   num_of_safety         2586 non-null   object
dtypes: object(9)
memory usage: 224.5+ KB


In [33]:
df.to_csv('city_29.csv')