In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import  numpy as np

In [2]:
# Collect the detail-page URL of every listing on search-result pages 1-50
# (100 listings requested per page via the perPage query parameter).
car_urls = []
for page in range(1,51):
    url = f'https://www.cars.com/for-sale/searchresults.action/?bsId=20211&dealerType=all&mkId=20068&page={page}&perPage=100&rd=99999&searchSource=GN_REFINEMENT&sort=distance-nearest&stkTypId=28881&yrId=27381,34923,39723,47272,51683,56007,58487,30031936,35797618,36362520,36620293&zc=92835'

    try:
        # Without a timeout a single stalled connection would hang this cell forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and move on so one bad page does not discard the URLs already collected.
        print(f'page {page}: request failed ({exc}); skipping')
        continue

    if not response.ok:
        # An error page has no listings; say so instead of silently collecting nothing.
        print(f'page {page}: HTTP {response.status_code}; skipping')
        continue

    soup = bs(response.text, 'html.parser')

    # href=True restricts the match to anchors that actually carry an href,
    # so the lookup below cannot raise KeyError.
    car_links = soup.find_all('a', class_="shop-srp-listings__listing", href=True)

    # The hrefs are site-relative, so prefix the host to get absolute URLs.
    car_urls.extend(f'https://www.cars.com{link["href"]}' for link in car_links)

In [4]:
# Sanity check: if no listing URL was collected twice, the number of distinct
# URLs equals the total number of URLs.
s = pd.Series(car_urls)
(s.nunique(), s.size)

(5000, 5000)

In [5]:
# Output columns, in the order they appear in the resulting table.
CAR_COLUMNS = ['title', 'price', 'mileage', 'exterior_color', 'interior_color', 'fuel_type',
               'engine', 'transmission', 'city_mpg', 'hwy_mpg']

# Label shown in a listing's "basics" list -> output column that label fills.
DETAIL_COLUMNS = {
    "Fuel Type": 'fuel_type',
    "City MPG": 'city_mpg',
    "Highway MPG": 'hwy_mpg',
    "Engine": 'engine',
    "Mileage": 'mileage',
    "Transmission": 'transmission',
    "Exterior Color": 'exterior_color',
    "Interior Color": 'interior_color',
}


def _parse_car_page(html):
    """Extract one listing's fields from its detail-page HTML.

    Returns a dict keyed by CAR_COLUMNS. Any field not found on the page is
    np.nan. Found values stay strings: '$' and ',' are removed from the price
    and ',' from the mileage; nothing is converted to a number here.
    """
    # np.nan is the supported spelling; the np.NAN alias was removed in NumPy 2.0.
    row = dict.fromkeys(CAR_COLUMNS, np.nan)
    car_soup = bs(html, 'html.parser')

    # Look each element up once and reuse the result.
    title_tag = car_soup.find('h1', class_="vehicle-info__title")
    if title_tag is not None:
        row['title'] = title_tag.text.strip()

    price_tag = car_soup.find('div', class_="vehicle-info__price")
    if price_tag is not None:
        row['price'] = price_tag.text.strip().replace('$','').replace(',','')

    for detail in car_soup.find_all("li", class_="vdp-details-basics__item"):
        # partition keeps the whole value even when it contains ': ' itself and
        # returns an empty separator (rather than causing an IndexError) when
        # the item has a label but no value.
        label, sep, value = detail.text.strip().partition(': ')
        column = DETAIL_COLUMNS.get(label)
        if not sep or column is None:
            continue
        row[column] = value.replace(',','') if column == 'mileage' else value

    return row


car_data = {column: [] for column in CAR_COLUMNS}

for car_url in car_urls:
    try:
        # Without a timeout one stalled connection would hang the whole scrape.
        car_response = requests.get(car_url, timeout=30)
        row = _parse_car_page(car_response.text)
    except requests.RequestException as exc:
        # Keep going: record an all-missing row, the same shape a page with no
        # recognisable fields produces, so the lists stay aligned with car_urls.
        print(f'{car_url}: request failed ({exc}); recording an empty row')
        row = dict.fromkeys(CAR_COLUMNS, np.nan)

    for column, value in row.items():
        car_data[column].append(value)
    

In [6]:
# Each key of car_data becomes a column; each list holds one value per listing.
df = pd.DataFrame.from_dict(car_data)

In [7]:
# Number of non-missing values per column.
df.notna().sum()

title             4995
price             4995
mileage           4995
exterior_color    4943
interior_color    4448
fuel_type         4995
engine            4965
transmission      4971
city_mpg          4797
hwy_mpg           4797
dtype: int64

In [8]:
# Keep only rows that have a title, then show how many rows remain.
required_columns = ['title']
df = df.dropna(axis=0, how='any', subset=required_columns)
df.shape[0]

4995

In [9]:
# Derive brand / model / year from the listing title, then drop the raw title.
titles = df["title"]

df = df.assign(
    brand="Kia",
    # Model is the text after the first 'Kia'. Splitting once leaves the rest of
    # the title intact, and strip() removes the space that followed the brand
    # name. A title without 'Kia' yields NaN instead of raising IndexError.
    model=titles.str.split('Kia', n=1).str[1].str.strip(),
    # Year is the first four characters once any 'Certified ' text is removed.
    year=titles.str.replace('Certified ', '', regex=False).str[:4],
)

# Name the axis with `columns=`: the positional form `df.drop('title', 1)` is
# rejected by pandas 2.0 and later.
df = df.drop(columns='title')

# Put the derived columns first, followed by the scraped attributes.
df = df[['brand', 'model','year', 'price', 'mileage', 'exterior_color', 'interior_color', 'fuel_type',
       'engine', 'transmission', 'city_mpg', 'hwy_mpg']]

In [10]:
# Distinct year strings extracted from the titles, in order of first appearance.
pd.unique(df["year"])

array(['2013', '2016', '2018', '2019', '2015', '2014', '2017', '2012',
       '2010', '2011'], dtype=object)

In [11]:
# Preview the first five rows of the reshaped table.
df.head(n=5)

Unnamed: 0,brand,model,year,price,mileage,exterior_color,interior_color,fuel_type,engine,transmission,city_mpg,hwy_mpg
0,Kia,Rio LX,2013,6995,68085,Black,Black,Gasoline,1.6L I4 16V GDI DOHC,6-Speed Automatic,29,37
1,Kia,Forte LX,2016,9995,71669,Dark Blue,Black,Gasoline,1.8L I4 16V MPFI DOHC,6-Speed Automatic,25,37
2,Kia,Optima EX,2013,9995,89047,,,Gasoline,2.4L I4 16V GDI DOHC,6-Speed Automatic,24,35
3,Kia,Forte EX,2013,6995,91000,Silver,Stone,Gasoline,2.0L I4 16V MPFI DOHC,6-Speed Automatic,26,36
4,Kia,Forte LX,2016,7995,124277,Black,Black,Gasoline,1.8L I4 16V MPFI DOHC,6-Speed Automatic,25,37


In [12]:
# Write the table to disk without the row index.
output_path = "../data/kia.csv"
df.to_csv(output_path, index=False)

In [13]:
# Read the file back to confirm it loads, and preview the first five rows.
saved_path = "../data/kia.csv"
df_test = pd.read_csv(saved_path)
df_test.head(n=5)

Unnamed: 0,brand,model,year,price,mileage,exterior_color,interior_color,fuel_type,engine,transmission,city_mpg,hwy_mpg
0,Kia,Rio LX,2013,6995,68085,Black,Black,Gasoline,1.6L I4 16V GDI DOHC,6-Speed Automatic,29.0,37.0
1,Kia,Forte LX,2016,9995,71669,Dark Blue,Black,Gasoline,1.8L I4 16V MPFI DOHC,6-Speed Automatic,25.0,37.0
2,Kia,Optima EX,2013,9995,89047,,,Gasoline,2.4L I4 16V GDI DOHC,6-Speed Automatic,24.0,35.0
3,Kia,Forte EX,2013,6995,91000,Silver,Stone,Gasoline,2.0L I4 16V MPFI DOHC,6-Speed Automatic,26.0,36.0
4,Kia,Forte LX,2016,7995,124277,Black,Black,Gasoline,1.8L I4 16V MPFI DOHC,6-Speed Automatic,25.0,37.0


In [14]:
# Number of listings per model string, most frequent first.
model_counts = df['model'].value_counts()
model_counts

 Optima LX                            1483
 Forte LX                              773
 Optima EX                             762
 Rio LX                                187
 Forte EX                              179
 Optima SX Turbo                       167
 Optima SX                             152
 Forte LXS                             144
 Cadenza Premium                       123
 Optima SXL Turbo                      102
 Optima Hybrid EX                       94
 Rio S                                  89
 K900 Luxury 3.8L                       66
 Stinger GT2                            56
 Optima Plug-In Hybrid EX               53
 K900 Luxury                            51
 Optima S                               47
 Forte S                                46
 Optima Hybrid Base                     36
 Stinger Premium                        30
 Optima LX Turbo                        28
 Stinger Base                           27
 Cadenza SX Limited                     26
 Optima Hyb