In [1]:
import itertools
import pandas as pd
import numpy as np
import scipy.stats
import re
import requests
import math
from bs4 import BeautifulSoup
from selenium import webdriver
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
import seaborn as sb
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

### HTML soups from the listing result pages (the code below fetches pages 1–7, not all 500)

In [2]:
def get_soups(website_number):
    """Download one page of used-car listings and parse it.

    Parameters
    ----------
    website_number
        Page number; it is converted with ``str`` and appended to the
        listings URL as the value of the ``page`` query parameter.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml`` parser.
    """
    page_url = 'https://www.truecar.com/used-cars-for-sale/listings/?page=' + str(website_number)
    response = requests.get(page_url)
    return BeautifulSoup(response.content, 'lxml')

In [None]:
# Fetch and parse listing pages 1 through 7 (one HTTP request per page).
soups = [get_soups(page_number) for page_number in range(1, 8)]

## URLs scraping

In [None]:
# Look the listing anchors up once instead of once per loop iteration.
# Slicing (rather than indexing 0..32) means a first page that carries fewer
# than 33 listings no longer raises IndexError.
first_page_listings = soups[0].find_all('a', {'data-test': 'usedListing'})[:33]

first_page_matches = []
for finding in first_page_listings:
    # Only the first 280 characters of the tag's markup are searched. Matches
    # are kept in their raw 'href="..." style' form; anchors without a match
    # simply contribute nothing.
    first_page_matches.extend(re.findall('href=".+" style', str(finding)[:280]))

# One append onto an empty array replaces the repeated per-iteration appends.
fst_page_urls = np.append(np.array([]), first_page_matches)

def urls_scraper(soup):
    """Collect the raw listing-link matches from one parsed results page.

    Parameters
    ----------
    soup
        Parsed results page. Only its ``find_all`` method is used, to select
        ``<a>`` tags whose ``data-test`` attribute is ``usedListing``.

    Returns
    -------
    list of str
        For each of the first 30 listing anchors, the first match of
        ``href="/..." style`` found in the first 280 characters of the
        anchor's markup. Anchors without such a match are skipped, and a page
        with fewer than 30 anchors yields a shorter list instead of raising
        ``IndexError``.
    """
    # The anchor lookup does not depend on the listing position, so do it once.
    listings = soup.find_all('a', {'data-test': 'usedListing'})[:30]

    nth_urls = []
    for finding in listings:
        matches = re.findall('href="/.+" style', str(finding)[:280])
        if matches:
            nth_urls.append(matches[0])
    return nth_urls

# Raw link matches for every results page after the first, one list per page.
rest_urls_list = list(map(urls_scraper, soups[1:]))
# Concatenate the per-page lists explicitly. Building a 2-D array and calling
# .flatten() only works when every page contributed the same number of links.
rest_pages_urls = np.array(list(itertools.chain.from_iterable(rest_urls_list)))
all_urls = np.append(fst_page_urls, rest_pages_urls)
# Each raw match looks like 'href="<path>" style': drop the 6-character prefix
# and the 7-character suffix, then prepend the site root.
url_formatter = np.vectorize(lambda url: 'https://truecar.com' + url[6: -7])
urls = url_formatter(all_urls)

In [None]:
def scraper(feature_as_argument):
    """Scrape one specification field from every listing page in ``urls``.

    Each call downloads every URL in the module-level ``urls`` again, parses it
    and looks for ``<feature></h4><ul><li>VALUE</li`` in the page markup.

    Parameters
    ----------
    feature_as_argument : str
        Heading text that precedes the value (for example ``'Drive Type'``).
        It is matched literally.

    Returns
    -------
    list
        One entry per URL, in the order of ``urls``: the text of the first
        list item after the heading, or ``np.nan`` when the heading is not
        found on that page.
    """
    # The value is taken from a capture group rather than from str(match):
    # the Match repr shortens the matched text, which lost or garbled values.
    # The lazy quantifier stops at the first closing </li after the heading.
    feature_pattern = re.compile(re.escape(feature_as_argument) + '</h4><ul><li>(.+?)</li')

    def feature_from_url(url):
        nth_request = requests.get(url)
        nth_soup = BeautifulSoup(nth_request.content, 'lxml')
        nth_search = feature_pattern.search(str(nth_soup))
        if nth_search is None:
            # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
            return np.nan
        return nth_search.group(1)

    return list(map(feature_from_url, urls))

## First part of vehicles features scraping

In [None]:
# Each call below walks the full `urls` array and issues one HTTP request per
# listing, so this cell performs 11 complete passes over the listings.
# The string passed in is the heading text searched for in the page markup.
# The statements are independent: if one fails, the lists assigned before it
# remain available in the kernel.
drive_types = scraper('Drive Type')
fuel_types = scraper('Fuel Type')
mileages = scraper('Mileage')
transmissions = scraper('Transmission')
MPGs = scraper('MPG')
styles = scraper('Style')
options_levels = scraper('Options Level')
bed_lengths = scraper('Bed Length')
engines = scraper('Engine')
exterior_colors = scraper('Exterior Color')
interior_colors = scraper('Interior Color')

## Vehicles Years, Makes and Models scraping

In [None]:
def scraper2(index):
    """Scrape one part of the listing heading from every page in ``urls``.

    For each URL the page is downloaded and parsed, the heading ``div`` is
    located by its class attribute, and its text is split on whitespace.

    Parameters
    ----------
    index : int
        Which part of the split heading to return. ``2`` is special-cased and
        returns every word from the third one onward as a list; any other
        value returns the single word at that position. This notebook calls
        it with 0, 1 and 2 for year, make and model.

    Returns
    -------
    list
        One entry per URL, in the order of ``urls``: a word (``str``), a list
        of words when ``index == 2``, or ``np.nan`` when the heading is not
        found or has too few words.
    """

    def feature_from_url(url):
        nth_request = requests.get(url)
        nth_soup = BeautifulSoup(nth_request.content, 'lxml')
        nth_finding = nth_soup.find_all('div', {'class': 'text-truncate heading-3 margin-right-2 margin-right-sm-3'})
        try:
            # The original read an undefined name (`nth_search`) here; the
            # resulting NameError was swallowed, so every call returned NaN.
            heading_words = re.findall('>.+<', str(nth_finding))[0][1: -1].split()
            if index == 2:
                return heading_words[2:]
            return heading_words[index]
        except IndexError:
            # No heading matched, or it has fewer words than `index` needs.
            # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
            return np.nan

    return list(map(feature_from_url, urls))

In [None]:
# Heading word positions: 0 -> year, 1 -> make, 2 -> everything after the make.
# Each call is a separate full pass over `urls`.
years = scraper2(index=0)
makes = scraper2(index=1)
models = scraper2(index=2)

## Vehicles Prices scraping

In [None]:
def prices_scraper(url):
    """Return the first comma-grouped number found in a listing's label blocks.

    Parameters
    ----------
    url : str
        Address of a single listing page.

    Returns
    -------
    str or float
        The first substring matching ``digits,digits`` (for example
        ``'12,345'``) inside the ``div`` elements whose ``data-qa`` attribute
        is ``LabelBlock-text``, or ``np.nan`` when there is no such match.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('div', {'data-qa': 'LabelBlock-text'})
    price_matches = re.findall('[0-9]+,[0-9]+', str(nth_soup))
    # An explicit emptiness check replaces the bare `except`, so unrelated
    # errors are no longer hidden. np.NaN was removed in NumPy 2.0; use np.nan.
    return price_matches[0] if price_matches else np.nan

In [None]:
# One request per listing; entries are price strings or NaN.
prices = [prices_scraper(listing_url) for listing_url in urls]

## Vehicles Locations (Cities and States) scraping

In [None]:
def cities_scraper(url):
    """Extract the city text from a listing's header-location element.

    Parameters
    ----------
    url : str
        Address of a single listing page.

    Returns
    -------
    str or float
        The text between the first ``">`` and the last ``<!`` in the markup
        of the ``span`` elements whose ``data-qa`` attribute is
        ``used-vdp-header-location``, with the trailing 12 characters of the
        match removed; ``np.nan`` when nothing matches.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('span', {'data-qa': 'used-vdp-header-location'})
    location_matches = re.findall('">.+<!', str(nth_soup))
    # An explicit emptiness check replaces the bare `except`, so unrelated
    # errors are no longer hidden. np.NaN was removed in NumPy 2.0; use np.nan.
    return location_matches[0][2: -12] if location_matches else np.nan

In [None]:
def states_scraper(url):
    """Extract a two-letter uppercase code from a listing's header-location element.

    Parameters
    ----------
    url : str
        Address of a single listing page.

    Returns
    -------
    str or float
        The first pair of consecutive uppercase letters (the first in
        ``A``-``W``) found in the markup of the ``span`` elements whose
        ``data-qa`` attribute is ``used-vdp-header-location``; ``np.nan``
        when there is no such pair.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('span', {'data-qa': 'used-vdp-header-location'})
    state_matches = re.findall('[A-W][A-Z]', str(nth_soup))
    # An explicit emptiness check replaces the bare `except`, so unrelated
    # errors are no longer hidden. np.NaN was removed in NumPy 2.0; use np.nan.
    return state_matches[0] if state_matches else np.nan

In [None]:
# Two separate passes over the listings: one for the city, one for the state.
cities = [cities_scraper(listing_url) for listing_url in urls]
states = [states_scraper(listing_url) for listing_url in urls]

## Vehicles Conditions scraping

In [None]:
def conditions_scraper(url):
    """Extract the condition summary from the first matching list item of a listing.

    Parameters
    ----------
    url : str
        Address of a single listing page.

    Returns
    -------
    str or float
        A single digit followed by the text that comes after it, both taken
        from the markup of the first ``li`` element with class ``_h9wfdq``;
        ``np.nan`` when that element or either pattern is missing.
    """
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('li', {'class': '_h9wfdq'})
    try:
        first_item = str(nth_soup[0])
        # First pattern: one digit between '">' and '<!'.
        # Second pattern: the text between '->' and '</l'.
        count = re.findall('">[0-9]<!', first_item)[0][2: -2]
        label = re.findall('->.+</l', first_item)[0][2: -3]
        return count + label
    except IndexError:
        # Missing element or pattern. Only IndexError is expected here, so
        # other errors are no longer swallowed.
        # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
        return np.nan

In [None]:
# One request per listing; entries are condition strings or NaN.
conditions = [conditions_scraper(listing_url) for listing_url in urls]

## Dataset

In [None]:
# Column name -> scraped values, one entry per listing URL.
# Key order here determines the column order of the resulting table.
features = {
    'Make': makes,
    'Model': models,
    'Year': years,
    'Mileage': mileages,
    'Transmission': transmissions,
    'Engine': engines,
    'Exterior Color': exterior_colors,
    'Interior Color': interior_colors,
    'MPG': MPGs,
    'Fuel Type': fuel_types,
    'Drive Type': drive_types,
    'Location (City)': cities,
    'Location (State)': states,
    'Style': styles,
    'Condition (Accidents)': conditions,
    'Options Level': options_levels,
    'Bed Length': bed_lengths,
    'Price': prices,
}

In [None]:
# Assemble the scraped columns into a single table, one row per listing.
vehicles_data = pd.DataFrame(data=features)