In [2]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import  numpy as np

In [3]:
# Collect the detail-page URL of every listing shown on the search result pages.
result_pages = range(1, 51)   # result pages 1..50; the query asks for 100 listings per page
page_timeout = 30             # seconds; without a timeout requests can wait forever

car_urls = []
for page in result_pages:
    url = f'https://www.cars.com/for-sale/searchresults.action/?bsId=20211&dealerType=all&mkId=20088&page={page}&perPage=100&rd=500&searchSource=GN_REFINEMENT&sort=distance-nearest&stkTypId=28881&yrId=27381,34923,39723,47272,51683,56007,58487,30031936,35797618,36362520,36620293&zc=92835'
    response = requests.get(url, timeout=page_timeout)
    # Fail loudly on an HTTP error instead of silently collecting zero links
    # from an error page and ending up with an incomplete URL list.
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    # href=True skips anchors that carry no href attribute, which would
    # otherwise raise KeyError when the link is read below.
    car_links = soup.find_all('a', class_="shop-srp-listings__listing", href=True)

    # Listing hrefs are site-relative, so prefix them with the host.
    car_urls.extend(f'https://www.cars.com{link["href"]}' for link in car_links)

In [4]:
# Sanity check: compare the number of distinct listing URLs with the total
# number collected; equal values mean no listing was picked up twice.
s = pd.Series(car_urls)
(s.nunique(), s.size)

(5000, 5000)

In [5]:
# Scrape each listing's detail page into `car_data`: one value per column for
# every URL in `car_urls`, in the same order. A field that a page does not
# provide is recorded as NaN so all column lists keep the same length.
car_data = {
    'title': [], 'price': [], 'mileage': [], 'exterior_color': [], 'interior_color': [],
    'fuel_type': [], 'engine': [], 'transmission': [], 'city_mpg': [], 'hwy_mpg': [],
}

# Label shown in the page's "basics" list -> column the value is stored under.
detail_columns = {
    "Fuel Type": 'fuel_type',
    "City MPG": 'city_mpg',
    "Highway MPG": 'hwy_mpg',
    "Engine": 'engine',
    "Mileage": 'mileage',
    "Transmission": 'transmission',
    "Exterior Color": 'exterior_color',
    "Interior Color": 'interior_color',
}

detail_timeout = 30  # seconds; without a timeout requests can wait forever


def parse_car_page(html):
    """Extract one listing's fields from the HTML of its detail page.

    Parameters
    ----------
    html : str
        Markup of a listing detail page.

    Returns
    -------
    dict
        Exactly the keys of `car_data`. A field that is not found on the page
        is NaN. Price is returned as text with '$' and ',' removed, mileage as
        text with ',' removed; all other fields are the text as shown.
    """
    # np.nan is the canonical spelling (upper-case aliases such as np.NaN
    # were removed from the main namespace in NumPy 2.0).
    row = dict.fromkeys(car_data, np.nan)
    car_soup = bs(html, 'html.parser')

    # Look every element up once and reuse the tag instead of searching twice.
    title_tag = car_soup.find('h1', class_="vehicle-info__title")
    if title_tag is not None:
        row['title'] = title_tag.text.strip()

    price_tag = car_soup.find('div', class_="vehicle-info__price")
    if price_tag is not None:
        row['price'] = price_tag.text.strip().replace('$', '').replace(',', '')

    for detail in car_soup.find_all("li", class_="vdp-details-basics__item"):
        detail_text = detail.text.strip().split(': ')
        # An item that is only a bare label has no value to read.
        if len(detail_text) < 2:
            continue
        column = detail_columns.get(detail_text[0])
        if column is None:
            continue
        value = detail_text[1]
        # Mileage carries thousands separators ("79,970"); strip them.
        row[column] = value.replace(',', '') if column == 'mileage' else value

    return row


for car_url in car_urls:
    try:
        car_response = requests.get(car_url, timeout=detail_timeout)
    except requests.RequestException as err:
        # One unreachable page must not discard the pages already scraped.
        # Record an all-NaN row so the columns stay aligned with car_urls.
        print(f'Request failed for {car_url}: {err}')
        row = dict.fromkeys(car_data, np.nan)
    else:
        row = parse_car_page(car_response.text)

    for column, value in row.items():
        car_data[column].append(value)
    

In [6]:
# Turn the scraped column lists into a table: one row per listing, one column
# per key of car_data.
df = pd.DataFrame(data=car_data)

In [7]:
# Number of non-missing values per column, to see which fields the pages
# most often lacked.
df.notna().sum()

title             4998
price             4998
mileage           4998
exterior_color    4794
interior_color    4030
fuel_type         4998
engine            4988
transmission      4962
city_mpg          4479
hwy_mpg           4479
dtype: int64

In [8]:
# Keep only listings whose page yielded a title; the title is needed to
# derive further columns. Then show how many rows remain.
has_title = df['title'].notna()
df = df[has_title]
df.shape[0]

4998

In [9]:
# Split the scraped listing title into brand / model / year columns, then drop
# the title and put the columns in their final order.
title = df["title"]

# assign() builds a new frame instead of writing columns into a frame that may
# be a filtered view of another one (avoids SettingWithCopyWarning).
df = df.assign(
    brand="Toyota",
    # Model is the text following "Toyota" in the title. strip() removes the
    # space left behind by the split; a title without "Toyota" becomes NaN
    # instead of raising IndexError.
    model=title.str.split('Toyota').str[1].str.strip(),
    # Year is the first four characters once a "Certified " prefix is removed.
    year=title.str.replace('Certified ', '', regex=False).str[:4],
)

# `columns=` keyword: passing the axis positionally (df.drop('title', 1))
# raises TypeError on pandas >= 2.0.
df = df.drop(columns='title')

df = df[['brand', 'model', 'year', 'price', 'mileage', 'exterior_color', 'interior_color', 'fuel_type',
         'engine', 'transmission', 'city_mpg', 'hwy_mpg']]

In [10]:
# Preview the first five rows of the reshaped table.
df.head(n=5)

Unnamed: 0,brand,model,year,price,mileage,exterior_color,interior_color,fuel_type,engine,transmission,city_mpg,hwy_mpg
0,Toyota,Corolla S,2013,10495,79970,,,Gasoline,1.8L I4 16V MPFI DOHC,4-Speed Automatic,27.0,34.0
1,Toyota,Corolla LE,2010,6995,91972,Black,Gray,Gasoline,1.8L I4 16V MPFI DOHC,4-Speed Automatic,26.0,34.0
2,Toyota,Camry SE,2010,8995,103402,,,Gasoline,3.5L V6 24V MPFI DOHC,6-Speed Automatic,22.0,33.0
3,Toyota,Camry PACKAGE-MANUAL TRANSMISSION-RARE!!,2011,7495,117396,Tan,Tan,Gasoline,2.5L I4 16V MPFI DOHC,6-Speed Manual,,
4,Toyota,Corolla LE,2011,7995,136685,Black,Gray,Gasoline,1.8L I4 16V MPFI DOHC,4-Speed Automatic,26.0,34.0


In [11]:
# Save the table as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="../data/toyota.csv", index=False)

In [12]:
# Read the exported file back and preview it to confirm the CSV loads.
df_test = pd.read_csv(filepath_or_buffer="../data/toyota.csv")
df_test.head(n=5)

Unnamed: 0,brand,model,year,price,mileage,exterior_color,interior_color,fuel_type,engine,transmission,city_mpg,hwy_mpg
0,Toyota,Corolla S,2013,10495,79970,,,Gasoline,1.8L I4 16V MPFI DOHC,4-Speed Automatic,27.0,34.0
1,Toyota,Corolla LE,2010,6995,91972,Black,Gray,Gasoline,1.8L I4 16V MPFI DOHC,4-Speed Automatic,26.0,34.0
2,Toyota,Camry SE,2010,8995,103402,,,Gasoline,3.5L V6 24V MPFI DOHC,6-Speed Automatic,22.0,33.0
3,Toyota,Camry PACKAGE-MANUAL TRANSMISSION-RARE!!,2011,7495,117396,Tan,Tan,Gasoline,2.5L I4 16V MPFI DOHC,6-Speed Manual,,
4,Toyota,Corolla LE,2011,7995,136685,Black,Gray,Gasoline,1.8L I4 16V MPFI DOHC,4-Speed Automatic,26.0,34.0


In [13]:
# Frequency of each model/trim string, most common first.
df.loc[:, 'model'].value_counts()

 Corolla LE                                  1014
 Camry SE                                     929
 Camry LE                                     785
 Corolla SE                                   216
 Corolla S Plus                               199
 Camry XSE                                    182
 Avalon XLE                                   178
 Corolla L                                    162
 Camry 4-DOOR SE SEDAN                        127
 Camry XLE                                    122
 Camry 4-DOOR LE SEDAN                        102
 Yaris iA Base                                 83
 Corolla S                                     80
 Corolla LE Plus                               55
 Corolla S PLUS                                49
 Camry Hybrid LE                               48
 Camry Hybrid XLE                              39
 Mirai Base                                    38
 Avalon Touring                                31
 Avalon Limited                                29
