In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import  numpy as np

In [2]:
# Collect the detail-page URL of every listing on the 50 search-result pages
# (the query asks for 100 listings per page).

# Seconds to wait for one search page. Without a timeout requests.get() can
# block forever on a stalled connection.
SEARCH_TIMEOUT = 30

car_urls = []
for page in range(1, 51):
    url = f'https://www.cars.com/for-sale/searchresults.action/?bsId=20211&dealerType=all&mkId=20017&page={page}&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=distance-nearest&stkTypId=28881&yrId=27381,34923,39723,47272,51683,56007,58487,30031936,35797618,36362520,36620293&zc=92835'

    response = requests.get(url, timeout=SEARCH_TIMEOUT)
    soup = bs(response.text, 'html.parser')

    # href=True keeps only anchors that actually carry a link, so a listing
    # card without an href is skipped instead of raising KeyError.
    car_links = soup.find_all('a', class_="shop-srp-listings__listing", href=True)

    # hrefs are site-relative, so prefix the host to get absolute URLs.
    car_urls.extend(f'https://www.cars.com{link["href"]}' for link in car_links)

In [3]:
# Sanity check: the number of distinct URLs should equal the number collected,
# otherwise the same listing was picked up on more than one search page.
s = pd.Series(car_urls)
(s.nunique(), s.size)

(5000, 5000)

In [4]:
# Scrape every listing's detail page into car_data, a dict of equally long
# lists (one list per column, one entry per URL in car_urls).

# Columns of the scraped table, in the order they are stored in car_data.
CAR_COLUMNS = ['title', 'price', 'mileage', 'exterior_color', 'interior_color',
               'fuel_type', 'engine', 'transmission', 'city_mpg', 'hwy_mpg']

# Label shown in a listing's "basics" list -> column that stores its value.
DETAIL_COLUMNS = {
    'Fuel Type': 'fuel_type',
    'City MPG': 'city_mpg',
    'Highway MPG': 'hwy_mpg',
    'Engine': 'engine',
    'Mileage': 'mileage',
    'Transmission': 'transmission',
    'Exterior Color': 'exterior_color',
    'Interior Color': 'interior_color',
}

# Seconds to wait for one detail page. Without a timeout requests.get() can
# block forever on a stalled connection.
DETAIL_TIMEOUT = 30

car_data = {column: [] for column in CAR_COLUMNS}

for car_url in car_urls:
    # Every field starts out missing and is overwritten only when the page
    # provides it. np.nan is used because the np.NAN alias was removed in
    # NumPy 2.0.
    record = dict.fromkeys(CAR_COLUMNS, np.nan)

    try:
        car_response = requests.get(car_url, timeout=DETAIL_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable listing must not throw away the rows scraped so far:
        # report it and keep the all-missing record for this URL.
        print(f'Could not fetch {car_url}: {exc}')
    else:
        car_soup = bs(car_response.text, 'html.parser')

        title_tag = car_soup.find('h1', class_="vehicle-info__title")
        if title_tag is not None:
            record['title'] = title_tag.text.strip()

        price_tag = car_soup.find('div', class_="vehicle-info__price")
        if price_tag is not None:
            # Kept as text with the currency formatting removed.
            record['price'] = price_tag.text.strip().replace('$', '').replace(',', '')

        car_details = car_soup.find_all("li", class_="vdp-details-basics__item")

        for detail in car_details:
            # Each item's text is split on ': ' into a label and its value.
            detail_text = detail.text.strip().split(': ')
            column = DETAIL_COLUMNS.get(detail_text[0])

            # Skip labels we do not collect and items that carry no value.
            if column is None or len(detail_text) < 2:
                continue

            value = detail_text[1]
            if column == 'mileage':
                # Drop thousands separators so the figure can be parsed later.
                value = value.replace(',', '')
            record[column] = value

    # Append exactly one value per column so all lists stay the same length.
    for column, value in record.items():
        car_data[column].append(value)
    

In [5]:
# Turn the dict of per-column lists into a table: one row per scraped listing.
df = pd.DataFrame(data=car_data)

In [6]:
# Non-missing values per column; shows how many listings lack each field.
df.count(axis="index")

title             4994
price             4994
mileage           4994
exterior_color    4898
interior_color    4164
fuel_type         4994
engine            4459
transmission      4974
city_mpg          4676
hwy_mpg           4676
dtype: int64

In [7]:
# Discard rows whose title is missing, then report how many rows remain.
df = df.dropna(axis="index", subset=['title'])
df.shape[0]

4994

In [11]:
# Derive brand / model / year from the listing title, drop the title, and put
# the columns in their final order.
# Titles presumably look like "2014 Honda Accord Sport", optionally prefixed
# with "Certified " (inferred from the parsing below; confirm against the site).
df = (
    df
    # assign() builds a new frame, so this does not write into the result of
    # the earlier dropna() (which can raise SettingWithCopyWarning).
    .assign(
        brand="Honda",
        # Text after "Honda". NOTE: the leading space left by the split is
        # kept (e.g. " Civic LX"); strip it only once nothing downstream
        # relies on the current values. A title without "Honda" yields a
        # missing value instead of raising IndexError.
        model=lambda frame: frame["title"].str.split('Honda').str[1],
        # First four characters once the optional "Certified " prefix is gone.
        year=lambda frame: frame["title"].str.replace('Certified ', '', regex=False).str[:4],
    )
    # axis must be given by keyword: drop('title', 1) is rejected by pandas 2.
    .drop(columns='title')
    [['brand', 'model', 'year', 'price', 'mileage', 'exterior_color', 'interior_color', 'fuel_type',
      'engine', 'transmission', 'city_mpg', 'hwy_mpg']]
)

In [12]:
# Preview the first five rows of the reshaped table.
df.head(5)

Unnamed: 0,brand,model,year,price,mileage,exterior_color,interior_color,fuel_type,engine,transmission,city_mpg,hwy_mpg
0,Honda,Civic CNG,2012,5995,139167,Silver,Gray,Compressed Natural Gas,4 Cylinder Engine,Automatic,,
1,Honda,Accord Sport,2014,15000,79160,White,Black,Gasoline,4-cylinder,Automatic,24.0,34.0
2,Honda,Civic EX-L,2012,10995,48508,Silver,Gray,Gasoline,1.8L I4 16V MPFI SOHC,5-Speed Automatic,28.0,39.0
3,Honda,Accord LX-P,2012,11495,73068,White,Beige,Gasoline,2.4L I4 16V MPFI DOHC,5-Speed Automatic,23.0,34.0
4,Honda,Accord LX,2011,10495,83704,,,Gasoline,2.4L I4 16V MPFI DOHC,5-Speed Automatic,23.0,33.0


In [13]:
# Persist the table; the row index is not written to the file.
df.to_csv(path_or_buf="../data/honda.csv", index=False)

In [14]:
# Read the file back to confirm it round-trips, and preview the first rows.
df_test = pd.read_csv(filepath_or_buffer="../data/honda.csv")
df_test.head(5)

Unnamed: 0,brand,model,year,price,mileage,exterior_color,interior_color,fuel_type,engine,transmission,city_mpg,hwy_mpg
0,Honda,Civic CNG,2012,5995,139167,Silver,Gray,Compressed Natural Gas,4 Cylinder Engine,Automatic,,
1,Honda,Accord Sport,2014,15000,79160,White,Black,Gasoline,4-cylinder,Automatic,24.0,34.0
2,Honda,Civic EX-L,2012,10995,48508,Silver,Gray,Gasoline,1.8L I4 16V MPFI SOHC,5-Speed Automatic,28.0,39.0
3,Honda,Accord LX-P,2012,11495,73068,White,Beige,Gasoline,2.4L I4 16V MPFI DOHC,5-Speed Automatic,23.0,34.0
4,Honda,Accord LX,2011,10495,83704,,,Gasoline,2.4L I4 16V MPFI DOHC,5-Speed Automatic,23.0,33.0


In [16]:
# Number of listings per model string, most frequent first.
df.loc[:, 'model'].value_counts()

 Civic LX                                  1194
 Accord LX                                  717
 Accord Sport                               618
 Civic EX                                   544
 Accord EX-L                                350
 Civic EX-T                                 288
 Accord EX                                  186
 Civic EX-L                                 176
 Accord Touring                             135
 Civic Si                                   113
 Accord Sport SE                             86
 Civic Touring                               78
 Clarity Plug-In Hybrid Base                 41
 Accord Hybrid Touring                       36
 Accord Touring 2.0T                         35
 Accord SE                                   32
 Clarity Plug-In Hybrid Touring              30
 Accord EX-L 2.0T                            29
 Accord Hybrid EX-L                          28
 Civic Sport                                 26
 Accord LX-P                            