In [1]:
import itertools
import pandas as pd
import numpy as np
import scipy.stats
import re
import requests
import math
from bs4 import BeautifulSoup
from selenium import webdriver
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
% matplotlib inline
import seaborn as sb
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

UsageError: Line magic function `%` not found.


### HTML soups from all 500 web pages

In [2]:
def get_soups(website_number):
    get_url = requests.get('https://www.truecar.com/used-cars-for-sale/listings/?page=' + str(website_number))
    return BeautifulSoup(get_url.content, 'lxml')

In [3]:
soups = list(map(get_soups, list(range(2501, 2506))))

## URLs scraping

In [4]:
def urls_scraper(soup):
    nth_urls = []
    def urlsppage(nth):
        finding = soup.find_all('a', {'data-test': 'usedListing'})[nth]
        return re.findall('href="/.+" style', str(finding)[:280])[0]
    nth_urls = list(map(urlsppage, list(range(30))))
    return nth_urls

urls_list = list(map(urls_scraper, soups))
pages_urls = np.array(urls_list).flatten()
url_formatter = np.vectorize(lambda url: 'https://truecar.com' + url[6: -7])
urls = url_formatter(pages_urls)

In [5]:
def fscraper(url, feature):
    nth_request = requests.get(url)
    nth_soup = BeautifulSoup(nth_request.content, 'lxml')
    nth_search = re.search(feature + '</h4><ul><li>.+</li', str(nth_soup))
    try:
        return re.findall('li>.+</l', str(nth_search))[0][3: -3]
    except:
        return np.NaN

## First part of vehicles features scraping

In [6]:
drive_types = list(map(fscraper, urls, itertools.repeat('Drive Type')))
fuel_types = list(map(fscraper, urls, itertools.repeat('Fuel Type')))

In [7]:
mileages = list(map(fscraper, urls, itertools.repeat('Mileage')))
transmissions = list(map(fscraper, urls, itertools.repeat('Transmission')))

In [8]:
MPGs = list(map(fscraper, urls, itertools.repeat('MPG')))
options_levels = list(map(fscraper, urls, itertools.repeat('Options Level')))
bed_lengths = list(map(fscraper, urls, itertools.repeat('Bed Length')))

In [9]:
engines = list(map(fscraper, urls, itertools.repeat('Engine')))
exterior_colors = list(map(fscraper, urls, itertools.repeat('Exterior Color')))
interior_colors = list(map(fscraper, urls, itertools.repeat('Interior Color')))
styles = list(map(fscraper, urls, itertools.repeat('Style')))

## Vehicles Years, Makes and Models scraping

In [10]:
'''def ymm_scraper(url, index):
    nth_request = requests.get(url)
    nth_soup = BeautifulSoup(nth_request.content, 'lxml')
    nth_finding = nth_soup.find_all('div', {'class': 'text-truncate heading-3 margin-right-2 margin-right-sm-3'})
    try:
        if index == 2:
            return re.findall('>.+<', str(nth_search))[0][1: -1].split()[2:]
        else:
            return re.findall('>.+<', str(nth_search))[0][1: -1].split()[index]
    except:
        return np.NaN'''

In [43]:
def ymm_scraper(url, feat_name):
    nth_request = requests.get(url)
    nth_soup = BeautifulSoup(nth_request.content, 'lxml')
    nth_finding = nth_soup.find_all('div', {'class': 'text-truncate heading-3 margin-right-2 margin-right-sm-3'})
    try:
        if feat_name == 'year':
            return re.findall('>.+</', str(nth_finding))[0].split()[0][1:]
        elif feat_name == 'make':
            return ' '.join(re.findall('>.+</', str(nth_finding))[0].split()[1: -1])[1: -2]
        else:
            return re.findall('>.+</', str(nth_finding))[0].split()[-1][0][: -2]
    except:
        return np.NaN

In [None]:
years = list(map(ymm_scraper, urls, itertools.repeat('year')))
makes = list(map(ymm_scraper, urls, itertools.repeat('make')))
models = list(map(ymm_scraper, urls, itertools.repeat('models')))

## Vehicles Prices scraping

In [12]:
def prices_scraper(url):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('div', {'data-qa': 'LabelBlock-text'})
    try:
        return re.findall('[0-9]+,[0-9]+', str(nth_soup))[0]
    except:
        return np.NaN

In [13]:
prices = list(map(prices_scraper, urls))

## Vehicles Locations (Cities and States) scraping

In [14]:
def cities_scraper(url):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('span', {'data-qa': 'used-vdp-header-location'})
    try:
        return re.findall('">.+<!', str(nth_soup))[0][2: -12]
    except:
        return np.NaN

In [15]:
def states_scraper(url):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('span', {'data-qa': 'used-vdp-header-location'})
    try:
        return re.findall('[A-W][A-Z]', str(nth_soup))[0]
    except:
        return np.NaN

In [16]:
cities = list(map(cities_scraper, urls))
states = list(map(states_scraper, urls))

## Vehicles Conditions scraping

In [17]:
def conditions_scraper(url):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml').find_all('li', {'class': '_h9wfdq'})
    try:
        return re.findall('">[0-9]<!', str(nth_soup[0]))[0][2: -2] + re.findall('->.+</l', str(nth_soup[0]))[0][2: -3]
    except:
        return np.NaN

In [18]:
conditions = list(map(conditions_scraper, urls))

## Dataset

In [None]:
features = {
    'Make': makes, 'Model': models, 'Year': years, 'Mileage': mileages, 'Transmission': transmissions,
    'Engine': engines, 'Exterior Color': exterior_colors, 'Interior Color': interior_colors,
    'MPG': MPGs, 'Fuel Type': fuel_types, 'Drive Type': drive_types, 'Location (City)': cities,
    'Location (State)': states, 'Style': styles, 'Condition (Accidents)': conditions,
    'Options Level': options_levels, 'Bed Length': bed_lengths, 'Price': prices
}

In [None]:
vehicles_data = pd.DataFrame(features)

In [None]:
vehicles_data

In [42]:
vehicles_data.to_csv('C:Users/aleja/Desktop/v2501_2506')

Unnamed: 0,Make,Model,Year,Mileage,Transmission,Engine,Exterior Color,Interior Color,MPG,Fuel Type,Drive Type,Location (City),Location (State),Style,Condition (Accidents),Options Level,Bed Length,Price
0,ercedes-Be,,2012,35795,Automatic,2.0L Inline-4 Gas,,Charcoal Black,26 cty / 36 hwy,Gas,FWD,Tacoma,WA,SE Hatchback,2 reported accidents,Standard,,7999
1,ercedes-Be,,2012,128146,Automatic,2.7L V-6 Gas Turbocharged,Unknown,Medium Earth Gray,19 cty / 26 hwy,Gas,RWD,Waxahachie,TX,XL SuperCrew 6.5\' Box RWD,0 reported accidents,,Standard Bed,15991
2,ercedes-Be,,2012,135000,Automatic,5.0L V-8 Gas,Black,Beige,16 cty / 22 hwy,Gas,AWD,Covina,CA,E 500 4MATIC Wagon,0 reported accidents,Standard,,5950
3,ercedes-Be,,2012,135670,Automatic,1.8L Inline-4 Gas,,Gray,28 cty / 38 hwy,Gas,FWD,Miami,FL,,0 reported accidents,Standard,,3999
4,ercedes-Be,,2012,152543,Automatic,1.8L Inline-4 Hybrid,White,Biege,51 cty / 48 hwy,Hybrid,FWD,Byhalia,MS,Five,1 reported accidents,Standard,,6500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,ercedes-Be,,2012,46943,Automatic,4.6L V-8 Gas,Starfire Pearl,Alabaster,16 cty / 24 hwy,Gas,RWD,Sarasota,FL,LS 460 RWD,0 reported accidents,Standard,,17990
146,ercedes-Be,,2012,124026,Automatic,1.8L Inline-4 Gas,,Gray,25 cty / 36 hwy,Gas,FWD,Everett,WA,LX Sedan Automatic,1 reported accidents,Standard,,4990
147,ercedes-Be,,2012,42269,Automatic,,Pure White,Unknown,29 cty / 39 hwy,Diesel,FWD,Dallas,TX,SportWagen TDI DSG,0 reported accidents,Standard,,12000
148,ercedes-Be,,2012,7641,Automatic,3.6L V-6 Gas,Billet Clearcoat,Black,17 cty / 25 hwy,Gas,FWD,Gladstone,MO,SXT,0 reported accidents,Fully Loaded,,20469
