In [60]:
import itertools
import pandas as pd
import numpy as np
import scipy.stats
import re
import requests
import math
from bs4 import BeautifulSoup
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
import seaborn as sb
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

### VEHICLES PRICES SCRAPING FROM WEB PAGE LISTINGS

In [3]:
fst_page_url = 'https://www.truecar.com/used-cars-for-sale/listings/'
url_requested = requests.get(fst_page_url)
fst_page_soup = BeautifulSoup(url_requested.content, 'lxml')
fst_page_soup

<!DOCTYPE html>
<html lang="en-US">
<head>
<title data-rh="true">Used Cars for Sale | TrueCar</title>
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" data-rh="true" name="viewport"/><meta content="Search over 879,013 used Cars. TrueCar has over 1,416,276 listings nationwide, updated daily. Come find a great deal on used Cars in your area today!" data-rh="true" name="description"/>
<link data-qa="MasterHelmet" data-rh="true" href="https://listings-prod.tcimg.net/listings/" rel="preconnect"/><link data-qa="MasterHelmet" data-rh="true" href="https://static.tcimg.net" rel="preconnect"/><link data-qa="MasterHelmet" data-rh="true" href="https://consumer.tcimg.net/assets" rel="preconnect"/><link data-rh="true" href="https://static.tcimg.net/pac/7/7ec678517aba12406d2c80bd5064492171ac8f86.ico" rel="icon" type="image/x-icon"/><link data-rh="true" href="https://www.truecar.com/used-cars-for-sale/listings/" rel="canonical"/><link data-rh="true" href="https://

In [4]:
fst_page_prices = fst_page_soup.find_all('h4', {'data-test': 'vehicleCardPricingBlockPrice'})
fst_page_prices

[<h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$20,995</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$25,470</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$23,634</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$7,495</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$8,500</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$9,995</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$8,799</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$28,300</h4>,
 <h4 class="

In [5]:
re.findall('[0-9]+,[0-9]+', str(fst_page_prices))

['20,995',
 '25,470',
 '23,634',
 '7,495',
 '8,500',
 '9,995',
 '8,799',
 '28,300',
 '26,990',
 '26,419',
 '12,924',
 '24,995',
 '29,940',
 '7,900',
 '10,988',
 '7,944',
 '11,000',
 '30,799',
 '9,991',
 '31,945',
 '7,991',
 '7,889',
 '12,785',
 '21,997',
 '8,979',
 '3,990',
 '5,500',
 '25,279',
 '27,550',
 '8,000',
 '13,999',
 '7,099',
 '10,598']

In [6]:
# ANOTHER WAY TO SCRAPE THE PRICES
# listings_soup = BeautifulSoup(requests.get('https://www.truecar.com/used-cars-for-sale/listings/?page=2').content, 'lxml')
# listings_soup.find_all('h4', {'class': 'heading-3 margin-y-1 font-weight-bold'})

In [61]:
# FUNCTION TO GET THE RAW HTML SOUPS FROM ALL 600 WEB PAGES

def get_soups(website_number):
    get_url = requests.get('https://www.truecar.com/used-cars-for-sale/listings/?page=' + str(website_number))
    return BeautifulSoup(get_url.content, 'lxml')

In [62]:
soups = list(map(get_soups, list(range(1, 10))))

In [9]:
# FUNCTION TO SCRAPE THE CARS PRICES FOR EACH WEB PAGE

def pricesscraper(soup):
    nth_page_prices_soup = soup.find_all('h4', {'data-test': 'vehicleCardPricingBlockPrice'})
    nth_page_prices = re.findall('[0-9]+,[0-9]+', str(nth_page_prices_soup))
    return nth_page_prices

In [10]:
prices = list(map(pricesscraper, soups))
prices

[['20,995',
  '25,470',
  '23,634',
  '7,495',
  '8,500',
  '9,995',
  '8,799',
  '28,300',
  '26,990',
  '26,419',
  '12,924',
  '24,995',
  '29,940',
  '7,900',
  '10,988',
  '7,944',
  '11,000',
  '30,799',
  '9,991',
  '31,945',
  '7,991',
  '7,889',
  '12,785',
  '21,997',
  '8,979',
  '3,990',
  '5,500',
  '25,279',
  '27,550',
  '8,000',
  '13,999',
  '7,099',
  '10,598'],
 ['12,995',
  '10,099',
  '26,995',
  '27,950',
  '8,995',
  '10,488',
  '13,000',
  '10,988',
  '3,495',
  '32,000',
  '30,487',
  '26,995',
  '14,459',
  '8,981',
  '6,499',
  '4,900',
  '9,900',
  '9,999',
  '8,980',
  '8,995',
  '13,290',
  '12,995',
  '33,975',
  '4,488',
  '9,600',
  '14,490',
  '10,999',
  '12,988',
  '26,500',
  '5,499'],
 ['6,895',
  '4,495',
  '4,199',
  '15,990',
  '5,995',
  '9,985',
  '28,000',
  '9,500',
  '19,500',
  '6,990',
  '6,985',
  '3,999',
  '16,695',
  '9,991',
  '30,991',
  '12,998',
  '25,995',
  '8,500',
  '8,995',
  '4,994',
  '10,750',
  '13,563',
  '28,900',
  '10

In [11]:
prices = str(prices).replace('[', '').replace(']', '').split(', ')
prices

["'20,995'",
 "'25,470'",
 "'23,634'",
 "'7,495'",
 "'8,500'",
 "'9,995'",
 "'8,799'",
 "'28,300'",
 "'26,990'",
 "'26,419'",
 "'12,924'",
 "'24,995'",
 "'29,940'",
 "'7,900'",
 "'10,988'",
 "'7,944'",
 "'11,000'",
 "'30,799'",
 "'9,991'",
 "'31,945'",
 "'7,991'",
 "'7,889'",
 "'12,785'",
 "'21,997'",
 "'8,979'",
 "'3,990'",
 "'5,500'",
 "'25,279'",
 "'27,550'",
 "'8,000'",
 "'13,999'",
 "'7,099'",
 "'10,598'",
 "'12,995'",
 "'10,099'",
 "'26,995'",
 "'27,950'",
 "'8,995'",
 "'10,488'",
 "'13,000'",
 "'10,988'",
 "'3,495'",
 "'32,000'",
 "'30,487'",
 "'26,995'",
 "'14,459'",
 "'8,981'",
 "'6,499'",
 "'4,900'",
 "'9,900'",
 "'9,999'",
 "'8,980'",
 "'8,995'",
 "'13,290'",
 "'12,995'",
 "'33,975'",
 "'4,488'",
 "'9,600'",
 "'14,490'",
 "'10,999'",
 "'12,988'",
 "'26,500'",
 "'5,499'",
 "'6,895'",
 "'4,495'",
 "'4,199'",
 "'15,990'",
 "'5,995'",
 "'9,985'",
 "'28,000'",
 "'9,500'",
 "'19,500'",
 "'6,990'",
 "'6,985'",
 "'3,999'",
 "'16,695'",
 "'9,991'",
 "'30,991'",
 "'12,998'",
 "'25,995

In [12]:
prices = list(map(lambda x: x[1:-1], prices))
print(len(prices))
prices

273


['20,995',
 '25,470',
 '23,634',
 '7,495',
 '8,500',
 '9,995',
 '8,799',
 '28,300',
 '26,990',
 '26,419',
 '12,924',
 '24,995',
 '29,940',
 '7,900',
 '10,988',
 '7,944',
 '11,000',
 '30,799',
 '9,991',
 '31,945',
 '7,991',
 '7,889',
 '12,785',
 '21,997',
 '8,979',
 '3,990',
 '5,500',
 '25,279',
 '27,550',
 '8,000',
 '13,999',
 '7,099',
 '10,598',
 '12,995',
 '10,099',
 '26,995',
 '27,950',
 '8,995',
 '10,488',
 '13,000',
 '10,988',
 '3,495',
 '32,000',
 '30,487',
 '26,995',
 '14,459',
 '8,981',
 '6,499',
 '4,900',
 '9,900',
 '9,999',
 '8,980',
 '8,995',
 '13,290',
 '12,995',
 '33,975',
 '4,488',
 '9,600',
 '14,490',
 '10,999',
 '12,988',
 '26,500',
 '5,499',
 '6,895',
 '4,495',
 '4,199',
 '15,990',
 '5,995',
 '9,985',
 '28,000',
 '9,500',
 '19,500',
 '6,990',
 '6,985',
 '3,999',
 '16,695',
 '9,991',
 '30,991',
 '12,998',
 '25,995',
 '8,500',
 '8,995',
 '4,994',
 '10,750',
 '13,563',
 '28,900',
 '10,500',
 '7,988',
 '5,995',
 '22,977',
 '7,995',
 '1,995',
 '5,500',
 '11,990',
 '26,795',

### VEHICLES YEARS SCRAPING FROM WEB PAGE LISTINGS

In [14]:
fst_page_years_soup = fst_page_soup.find_all('span', {'class': 'vehicle-card-year'})
fst_page_years_soup

[<span class="vehicle-card-year font-size-1">2016</span>,
 <span class="vehicle-card-year font-size-1">2015</span>,
 <span class="vehicle-card-year font-size-1">2016</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2018</span>,
 <span class="vehicle-card-year font-size-1">2018</span>,
 <span class="vehicle-card-year font-size-1">2016</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2018</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="vehicle-card-year font-size-1">2017</span>,
 <span class="

In [15]:
re.findall('[12][0-9]{3}', str(fst_page_years_soup))

['2016',
 '2015',
 '2016',
 '2017',
 '2017',
 '2017',
 '2017',
 '2018',
 '2018',
 '2016',
 '2017',
 '2017',
 '2018',
 '2017',
 '2017',
 '2017',
 '2017',
 '2018',
 '2018',
 '2018',
 '2019',
 '2018',
 '2016',
 '2016',
 '2017',
 '2008',
 '2016',
 '2018',
 '2018',
 '2017',
 '2017',
 '2017',
 '2019']

In [16]:
# FUNCTION TO SCRAPE A FEATURE FROM EACH SOUP AND RETURN THE FEATURES LIST

def scraper(tag, element, element_description, regex):
    def features_scraper(soup):
        nth_page_features_soup = soup.find_all(tag, {element: element_description})
        nth_page_features = re.findall(regex, str(nth_page_features_soup))
        return nth_page_features
    features = list(map(features_scraper, soups))
    features = str(features).replace('[', '').replace(']', '').split(', ')
    features = list(map(lambda x: x[1: -1], features))
    return features

In [17]:
years = scraper('span', 'class', 'vehicle-card-year', '[12][0-9]{3}')
print(len(years))
years

273


['2016',
 '2015',
 '2016',
 '2017',
 '2017',
 '2017',
 '2017',
 '2018',
 '2018',
 '2016',
 '2017',
 '2017',
 '2018',
 '2017',
 '2017',
 '2017',
 '2017',
 '2018',
 '2018',
 '2018',
 '2019',
 '2018',
 '2016',
 '2016',
 '2017',
 '2008',
 '2016',
 '2018',
 '2018',
 '2017',
 '2017',
 '2017',
 '2019',
 '2016',
 '2017',
 '2018',
 '2018',
 '2017',
 '2018',
 '2017',
 '2017',
 '2005',
 '2016',
 '2019',
 '2018',
 '2017',
 '2017',
 '2007',
 '2014',
 '2017',
 '2009',
 '2017',
 '2013',
 '2017',
 '2017',
 '2019',
 '2012',
 '2017',
 '2006',
 '2006',
 '2016',
 '2016',
 '2015',
 '2016',
 '1999',
 '2012',
 '2019',
 '2010',
 '2017',
 '2018',
 '2017',
 '2018',
 '1999',
 '2017',
 '2008',
 '2017',
 '2018',
 '2019',
 '2017',
 '2018',
 '2017',
 '2011',
 '2008',
 '2017',
 '2016',
 '2016',
 '2017',
 '1999',
 '2014',
 '2018',
 '2014',
 '2002',
 '2002',
 '2017',
 '2017',
 '2017',
 '2017',
 '2008',
 '2016',
 '2017',
 '2019',
 '2016',
 '2017',
 '2013',
 '2012',
 '2015',
 '2014',
 '2010',
 '2018',
 '2017',
 '2017',
 

### VEHICLES LOCATIONS STATES SCRAPING FROM WEB PAGE LISTINGS

In [19]:
fst_page_states_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardLocation'})
fst_page_states_soup

[<div class="vehicle-card-location font-size-1 margin-top-1" data-qa="Location" data-test="vehicleCardLocation"><svg class="icon icon-before vehicle-card-icon icon-color-default" data-qa="IconLocationPin" style="width:16px;height:16px;stroke-width:1.5" viewbox="0 0 24 24"><path d="M18.91 12.33L12.62 23 6.23 12.2"></path><path d="M6.44 12.55a7.5 7.5 0 0 1-1.35-3.93v-.39a7.54 7.54 0 0 1 15.08 0v.69a7.5 7.5 0 0 1-1.37 3.68"></path><circle cx="12.57" cy="8.09" r="2.7"></circle></svg>Lexington<!-- -->, <!-- -->NC</div>,
 <div class="vehicle-card-location font-size-1 margin-top-1" data-qa="Location" data-test="vehicleCardLocation"><svg class="icon icon-before vehicle-card-icon icon-color-default" data-qa="IconLocationPin" style="width:16px;height:16px;stroke-width:1.5" viewbox="0 0 24 24"><path d="M18.91 12.33L12.62 23 6.23 12.2"></path><path d="M6.44 12.55a7.5 7.5 0 0 1-1.35-3.93v-.39a7.54 7.54 0 0 1 15.08 0v.69a7.5 7.5 0 0 1-1.37 3.68"></path><circle cx="12.57" cy="8.09" r="2.7"></circle><

In [20]:
re.findall('[A-W][A-Y]', str(fst_page_states_soup))

['NC',
 'TX',
 'CA',
 'FL',
 'TX',
 'TX',
 'NC',
 'NC',
 'IN',
 'NH',
 'NC',
 'OH',
 'NE',
 'GA',
 'FL',
 'FL',
 'NE',
 'FL',
 'WA',
 'FL',
 'FL',
 'FL',
 'CA',
 'TX',
 'CA',
 'FL',
 'TX',
 'ND',
 'FL',
 'GA',
 'FL',
 'FL']

In [21]:
states = scraper('div', 'data-test', 'vehicleCardLocation', '[A-Z]{2}')
print(len(states))
states

273


['NC',
 'TX',
 'CA',
 'FL',
 'TX',
 'TX',
 'NC',
 'NC',
 'IN',
 'NH',
 'NC',
 'OH',
 'NE',
 'AZ',
 'GA',
 'FL',
 'FL',
 'NE',
 'FL',
 'WA',
 'FL',
 'FL',
 'FL',
 'CA',
 'TX',
 'CA',
 'FL',
 'TX',
 'ND',
 'FL',
 'GA',
 'FL',
 'FL',
 'NC',
 'NC',
 'CA',
 'ND',
 'CA',
 'MO',
 'SC',
 'OR',
 'NY',
 'CA',
 'KY',
 'CA',
 'NC',
 'CA',
 'CA',
 'PA',
 'FL',
 'CA',
 'NC',
 'GA',
 'MT',
 'CO',
 'IN',
 'FL',
 'PA',
 'MO',
 'AZ',
 'FL',
 'MN',
 'TN',
 'PA',
 'CA',
 'FL',
 'AL',
 'OR',
 'GA',
 'KS',
 'NC',
 'ND',
 'TX',
 'GA',
 'AL',
 'VA',
 'FL',
 'TX',
 'WA',
 'CA',
 'TX',
 'CA',
 'CT',
 'VA',
 'AK',
 'NY',
 'TX',
 'OK',
 'TN',
 'GA',
 'AL',
 'NJ',
 'MI',
 'VA',
 'MN',
 'MA',
 'GA',
 'CA',
 'FL',
 'AL',
 'FL',
 'MO',
 'CT',
 'IN',
 'NJ',
 'NC',
 'CA',
 'SC',
 'TX',
 'CA',
 'VA',
 'TX',
 'FL',
 'MA',
 'NC',
 'TN',
 'WA',
 'OR',
 'GA',
 'FL',
 'NJ',
 'VA',
 'KY',
 'MI',
 'PA',
 'SC',
 'TX',
 'CA',
 'TX',
 'FL',
 'WA',
 'CT',
 'FL',
 'AZ',
 'TX',
 'FL',
 'TX',
 'PA',
 'NE',
 'IN',
 'OH',
 'LA',
 'FL',

### VEHICLES LOCATIONS CITIES SCRAPING FROM WEB PAGE LISTINGS

In [23]:
fst_page_cities_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardLocation'})
fst_page_cities_soup[0]

<div class="vehicle-card-location font-size-1 margin-top-1" data-qa="Location" data-test="vehicleCardLocation"><svg class="icon icon-before vehicle-card-icon icon-color-default" data-qa="IconLocationPin" style="width:16px;height:16px;stroke-width:1.5" viewbox="0 0 24 24"><path d="M18.91 12.33L12.62 23 6.23 12.2"></path><path d="M6.44 12.55a7.5 7.5 0 0 1-1.35-3.93v-.39a7.54 7.54 0 0 1 15.08 0v.69a7.5 7.5 0 0 1-1.37 3.68"></path><circle cx="12.57" cy="8.09" r="2.7"></circle></svg>Lexington<!-- -->, <!-- -->NC</div>

In [24]:
re.findall('[A-Z][a-z]+[. ]*[A-Z]*[a-z]*[. ]*[A-Z]*[a-z]*', str(fst_page_cities_soup))

['Location',
 'CardLocation',
 'IconLocationPin',
 'Lexington',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Stafford',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Inglewood',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'St. Petersburg',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Austin',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Euless',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Denver',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Lumberton',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Silver Lake',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Gorham',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Shelby',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Hamilton',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Blair',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Phoenix',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Duluth',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Homosas

In [25]:
cities_unf = scraper('div', 'data-test', 'vehicleCardLocation', '[A-Z][a-z]+[. ]*[A-Z]*[a-z]*[. ]*[A-Z]*[a-z]*')
print(len(cities_unf))
cities_unf

1092


['Location',
 'CardLocation',
 'IconLocationPin',
 'Lexington',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Stafford',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Inglewood',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'St. Petersburg',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Austin',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Euless',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Denver',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Lumberton',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Silver Lake',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Gorham',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Shelby',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Hamilton',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Blair',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Phoenix',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Duluth',
 'Location',
 'CardLocation',
 'IconLocationPin',
 'Homosas

In [26]:
cities = [city for city in cities_unf if cities_unf.index(city) in list(range(3, len(cities_unf), 4))]
print(len(cities))
cities

273


['Lexington',
 'Stafford',
 'Inglewood',
 'St. Petersburg',
 'Austin',
 'Euless',
 'Denver',
 'Lumberton',
 'Silver Lake',
 'Gorham',
 'Shelby',
 'Hamilton',
 'Blair',
 'Phoenix',
 'Duluth',
 'Homosassa',
 'Clearwater',
 'Wahoo',
 'Hollywood',
 'Puyallup',
 'Hollywood',
 'Miami',
 'Plantation',
 'Sacramento',
 'Granbury',
 'El Cajon',
 'Jacksonville',
 'Houston',
 'Minot',
 'Pembroke Pines',
 'Smyrna',
 'Miami',
 'Tampa',
 'Jacksonville',
 'Charlotte',
 'Dublin',
 'Minot',
 'Fontana',
 'Belton',
 'Florence',
 'Milwaukie',
 'Jamaica',
 'Modesto',
 'Columbia',
 'Dublin',
 'Youngsville',
 'Moreno Valley',
 'Roseville',
 'Walnutport',
 'Fort Myers',
 'Costa Mesa',
 'Raleigh',
 'Savannah',
 'Great Falls',
 'Greeley',
 'Greenwood',
 'Pensacola',
 'Greensburg',
 'Manchester',
 'Mesa',
 'Deland',
 'Inver Grove Heights',
 'Murfreesboro',
 'Pottsville',
 'Rocklin',
 'New Port Richey',
 'Decatur',
 'Medford',
 'Duluth',
 'Olathe',
 'Wilson',
 'Minot',
 'Houston',
 'Duluth',
 'Birmingham',
 'Winch

### VEHICLES EXTERIOR COLORS SCRAPING FROM WEP PAGE LISTINGS

In [28]:
fst_page_colors_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardColors'})
fst_page_colors_soup[0]

<div class="vehicle-card-location font-size-1 margin-top-1 text-truncate" data-qa="ExteriorInteriorColor" data-test="vehicleCardColors"><svg class="icon icon-before vehicle-card-icon icon-color-default" data-qa="IconPaintBucket" style="width:16px;height:16px;stroke-width:1.5" viewbox="0 0 24 24"><path d="M18.45 18.94l2.13-3.61 2.16 3.65"></path><path d="M22.67 18.86a2.54 2.54 0 0 1 .46 1.33v.13a2.55 2.55 0 1 1-5.1 0v-.23a2.54 2.54 0 0 1 .46-1.24"></path><circle cx="14.5" cy="9.5" r="1"></circle><path d="M20.42 12.26L10.33 22.35 1.3 13.31 11.39 3.22M14.5 9V1M10.64 2.09l10.84 10.84"></path></svg>Silver<!-- --> exterior, <!-- -->Black<!-- --> interior</div>

In [29]:
re.findall('->[A-Z][a-z]+', str(fst_page_colors_soup))

['->Black',
 '->Unknown',
 '->Black',
 '->Black',
 '->Unknown',
 '->Gray',
 '->Black',
 '->Black',
 '->Gray',
 '->Gray',
 '->Beige',
 '->Gray',
 '->Unknown',
 '->Black',
 '->Black',
 '->Gray',
 '->Unknown',
 '->Gray',
 '->Gray',
 '->Blue',
 '->Black',
 '->Black',
 '->Gray',
 '->Gray',
 '->Black',
 '->Black',
 '->Black',
 '->Gray',
 '->Unknown',
 '->Beige',
 '->Black',
 '->Black',
 '->Black']

In [30]:
exterior_colors_unf = scraper('div', 'data-test', 'vehicleCardColors', 'g>[A-Z][a-z]+')
len(exterior_colors_unf)

273

In [31]:
exterior_colors = list(map(lambda color: color[2:], exterior_colors_unf))
print(len(exterior_colors))
exterior_colors

273


['Silver',
 'Copper',
 'Black',
 'Gray',
 'Silver',
 'White',
 'Black',
 'Silver',
 'Gray',
 'Black',
 'Black',
 'Silver',
 'Gray',
 'Black',
 'White',
 'Blue',
 'Black',
 'Silver',
 'White',
 'Silver',
 'Black',
 'Black',
 'Gray',
 'White',
 'Black',
 'Black',
 'Silver',
 'Black',
 'Silver',
 'Black',
 'Gray',
 'White',
 'White',
 'White',
 'Black',
 'White',
 'Silver',
 'Black',
 'Silver',
 'Red',
 'Blue',
 'Red',
 'White',
 'White',
 'White',
 'Orange',
 'Silver',
 'Black',
 'White',
 'Gray',
 'White',
 'Black',
 'Gray',
 'White',
 'Black',
 'White',
 'Black',
 'Tan',
 'Black',
 'Red',
 'White',
 'Gray',
 'Black',
 'Black',
 'Black',
 'White',
 'White',
 'White',
 'Black',
 'Gray',
 'White',
 'Black',
 'White',
 'Blue',
 'Black',
 'White',
 'Silver',
 'White',
 'Blue',
 'White',
 'Red',
 'Black',
 'Black',
 'Black',
 'Silver',
 'Silver',
 'Silver',
 'White',
 'Gray',
 'Silver',
 'White',
 'Silver',
 'White',
 'White',
 'White',
 'Gray',
 'Gray',
 'White',
 'White',
 'White',
 'Black

### VEHICLES INTERIOR COLORS SCRAPING FROM WEB PAGE LISTINGS

In [33]:
re.findall('->[A-Z][a-z]+', str(fst_page_colors_soup))

['->Black',
 '->Unknown',
 '->Black',
 '->Black',
 '->Unknown',
 '->Gray',
 '->Black',
 '->Black',
 '->Gray',
 '->Gray',
 '->Beige',
 '->Gray',
 '->Unknown',
 '->Black',
 '->Black',
 '->Gray',
 '->Unknown',
 '->Gray',
 '->Gray',
 '->Blue',
 '->Black',
 '->Black',
 '->Gray',
 '->Gray',
 '->Black',
 '->Black',
 '->Black',
 '->Gray',
 '->Unknown',
 '->Beige',
 '->Black',
 '->Black',
 '->Black']

In [34]:
interior_colors_unf = scraper('div', 'data-test', 'vehicleCardColors', '->[A-Z][a-z]+')
interior_colors = list(map(lambda color: color[2:], interior_colors_unf))
print(len(interior_colors))
interior_colors

273


['Black',
 'Unknown',
 'Black',
 'Black',
 'Unknown',
 'Gray',
 'Black',
 'Black',
 'Gray',
 'Gray',
 'Beige',
 'Gray',
 'Unknown',
 'Black',
 'Black',
 'Gray',
 'Unknown',
 'Gray',
 'Gray',
 'Blue',
 'Black',
 'Black',
 'Gray',
 'Gray',
 'Black',
 'Black',
 'Black',
 'Gray',
 'Unknown',
 'Beige',
 'Black',
 'Black',
 'Black',
 'Gray',
 'Black',
 'Gray',
 'Unknown',
 'Gray',
 'Black',
 'Black',
 'Gray',
 'Unknown',
 'Unknown',
 'Gray',
 'Gray',
 'Black',
 'Gray',
 'Black',
 'Black',
 'Gray',
 'Black',
 'Unknown',
 'Unknown',
 'Black',
 'Unknown',
 'Unknown',
 'Black',
 'Unknown',
 'Black',
 'Brown',
 'Gray',
 'Gray',
 'Black',
 'Black',
 'Unknown',
 'Black',
 'Black',
 'Gray',
 'Unknown',
 'Gray',
 'Black',
 'Black',
 'Beige',
 'Black',
 'Unknown',
 'Black',
 'Black',
 'Beige',
 'Black',
 'Gray',
 'Gray',
 'Beige',
 'Beige',
 'Black',
 'Black',
 'Black',
 'Black',
 'Unknown',
 'Black',
 'Black',
 'Unknown',
 'Beige',
 'Black',
 'Black',
 'Unknown',
 'Gray',
 'Black',
 'Black',
 'Black'

In [35]:
# VEHICLES CONDITION (NUMBER OF ACCIDENTS) SCRAPING FROM WEB PAGE LISTINGS

In [36]:
fst_page_accidents_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardCondition'})
fst_page_accidents_soup[0]

<div class="vehicle-card-location font-size-1 margin-top-1" data-qa="ConditionHistory" data-test="vehicleCardCondition"><svg class="icon icon-before vehicle-card-icon icon-color-default" data-qa="IconCarFront" style="width:16px;height:16px;stroke-width:1.5" viewbox="0 0 24 24"><path d="M22 13.13v6.23a.7.7 0 0 1-.68.72h-2.71a.69.69 0 0 1-.65-.72v-1.28H6v1.28a.69.69 0 0 1-.65.72H2.64a.7.7 0 0 1-.64-.72v-5.91a2.84 2.84 0 0 1 .34-1.35l2.09-4A3.76 3.76 0 0 1 7.6 6.08H17a3.79 3.79 0 0 1 3.4 2.47L21.75 12a2.93 2.93 0 0 1 .25 1.13z"></path><path d="M7.96 14.08h-3"></path><path d="M2.83 11.08H.46M23.46 11.08h-2.09M18.46 11.08h-13"></path><path d="M18.96 14.08h-3"></path></svg>No accidents, <!-- -->2 Owners, <!-- -->Personal use</div>

In [37]:
re.findall('[0-9]*[A-z]* accident[s]*', str(fst_page_accidents_soup))

['No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accidents',
 'No accidents',
 '1 accident',
 '1 accident',
 'No accidents',
 '1 accident',
 '1 accident',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents']

In [38]:
accidents = scraper('div', 'data-test', 'vehicleCardCondition', '[0-9]*[A-z]* accident[s]*')
print(len(accidents))
accidents

273


['No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accidents',
 'No accidents',
 '1 accident',
 '1 accident',
 'No accidents',
 '1 accident',
 '1 accident',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '2 accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accidents',
 'No accidents',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accidents',
 'No accidents',
 '1 accident',
 'No accide

#### URLs Scraping

In [63]:
fst_page_urls = np.array([])
for ind in range(33):
    finding = soups[0].find_all('a', {'data-test': 'usedListing'})[ind]
    fst_page_urls = np.append(fst_page_urls, re.findall('href=".+" style', str(finding)[:280]))
print(len(fst_page_urls))
fst_page_urls

33


array(['href="/used-cars-for-sale/listing/KNMAT2MT5KP503559/2019-nissan-rogue/?sponsoredVehiclePosition=0" style',
       'href="/used-cars-for-sale/listing/3MZBPACL9LM118904/2020-mazda-mazda3/?sponsoredVehiclePosition=1" style',
       'href="/used-cars-for-sale/listing/JTEZU5JR3G5115447/2016-toyota-4runner/?sponsoredVehiclePosition=2" style',
       'href="/used-cars-for-sale/listing/3N1AB7AP7KY242301/2019-nissan-sentra/" style',
       'href="/used-cars-for-sale/listing/1FMCU0GD6HUE65294/2017-ford-escape/" style',
       'href="/used-cars-for-sale/listing/3N1AB7AP0KY301527/2019-nissan-sentra/" style',
       'href="/used-cars-for-sale/listing/1FMCU9G94HUA28507/2017-ford-escape/" style',
       'href="/used-cars-for-sale/listing/3N1AB7AP4KY339083/2019-nissan-sentra/" style',
       'href="/used-cars-for-sale/listing/2GNAXJEV5J6299317/2018-chevrolet-equinox/" style',
       'href="/used-cars-for-sale/listing/5N1AT2MT5KC715204/2019-nissan-rogue/" style',
       'href="/used-cars-for-sa

In [64]:
def urls_scraper(soup):
    nth_urls = []
    for nth in range(30):
        finding = soup.find_all('a', {'data-test': 'usedListing'})[nth]
        nth_urls.append(re.findall('href="/.+" style', str(finding)[:280])[0])
    return nth_urls
rest_urls_list = list(map(urls_scraper, soups[1:]))
rest_urls_list

[['href="/used-cars-for-sale/listing/1FTFW1CT4BKD51456/2011-ford-f-150/" style',
  'href="/used-cars-for-sale/listing/1N4BZ0CP2HC309220/2017-nissan-leaf/" style',
  'href="/used-cars-for-sale/listing/1N4BL4CV1KC194821/2019-nissan-altima/" style',
  'href="/used-cars-for-sale/listing/WMWLU5C53K2G80053/2019-mini-clubman/" style',
  'href="/used-cars-for-sale/listing/JN1BJ0RPXHM381014/2017-infiniti-qx50/" style',
  'href="/used-cars-for-sale/listing/2G4GT5GX2G9149205/2016-buick-regal/" style',
  'href="/used-cars-for-sale/listing/JTDZN3EU0D3237159/2013-toyota-prius-v/" style',
  'href="/used-cars-for-sale/listing/JN8AZ2NE7E9063338/2014-infiniti-qx80/" style',
  'href="/used-cars-for-sale/listing/1G1ZJ57B18F212898/2008-chevrolet-malibu/" style',
  'href="/used-cars-for-sale/listing/3N1CP5CU6JL545101/2018-nissan-kicks/" style',
  'href="/used-cars-for-sale/listing/1VWDT7A39HC033015/2017-volkswagen-passat/" style',
  'href="/used-cars-for-sale/listing/2C3CCABTXHH647034/2017-chrysler-300/" st

In [65]:
rest_pages_urls = np.array(rest_urls_list).flatten()
all_urls = np.append(fst_page_urls, rest_pages_urls)
url_formatter = np.vectorize(lambda url: 'https://truecar.com' + url[6: -7])
urls = url_formatter(all_urls)
print(len(urls))
urls

273


array(['https://truecar.com/used-cars-for-sale/listing/KNMAT2MT5KP503559/2019-nissan-rogue/?sponsoredVehiclePosition=0',
       'https://truecar.com/used-cars-for-sale/listing/3MZBPACL9LM118904/2020-mazda-mazda3/?sponsoredVehiclePosition=1',
       'https://truecar.com/used-cars-for-sale/listing/JTEZU5JR3G5115447/2016-toyota-4runner/?sponsoredVehiclePosition=2',
       'https://truecar.com/used-cars-for-sale/listing/3N1AB7AP7KY242301/2019-nissan-sentra/',
       'https://truecar.com/used-cars-for-sale/listing/1FMCU0GD6HUE65294/2017-ford-escape/',
       'https://truecar.com/used-cars-for-sale/listing/3N1AB7AP0KY301527/2019-nissan-sentra/',
       'https://truecar.com/used-cars-for-sale/listing/1FMCU9G94HUA28507/2017-ford-escape/',
       'https://truecar.com/used-cars-for-sale/listing/3N1AB7AP4KY339083/2019-nissan-sentra/',
       'https://truecar.com/used-cars-for-sale/listing/2GNAXJEV5J6299317/2018-chevrolet-equinox/',
       'https://truecar.com/used-cars-for-sale/listing/5N1AT2MT5K

### VEHICLE STYLES SCRAPING FROM WEB PAGES LISTINGS

In [7]:
def feature_scraper_from_url(feature_as_argument):
    def feature_from_url(url):
        nth_request = requests.get(url)
        nth_soup = BeautifulSoup(nth_request.content, 'lxml')
        nth_search = re.search(feature_as_argument + '</h4><ul><li>.+</li', str(nth_soup))
        return re.findall('li>.+</l', str(nth_search))
    features_unf = list(map(feature_from_url, urls))
    features = list(map(lambda f: str(f)[5: -5], features_unf))
    return features

In [44]:
styles = feature_scraper_from_url('Style')
print(len(styles))
styles

10


['FE Manual',
 'Turbo Automatic',
 'Trailhawk 4WD',
 'SE FWD',
 'SE 4WD',
 "XLT SuperCrew 5.5\\\\' Box 4WD",
 'LT with 1LT FWD',
 "XLT SuperCrew 5.5\\\\' Box 2WD",
 'Limited FWD',
 'SE I4 Automatic']

### VEHICLE OPTIONS LEVELS SCRAPING FROM WEB PAGES LISTINGS

In [28]:
options_level = feature_scraper_from_url('Options Level')
print(len(options_level))
options_level

10


['Well Equipped',
 '',
 'Well Equipped',
 'Minimal Options',
 '',
 '',
 'Well Equipped',
 '',
 'Standard',
 '']

### VEHICLE BED LENGTHS SCRAPING FROM WEB PAGES LISTINGS

In [29]:
bed_lengths = feature_scraper_from_url('Bed Length')
print(len(bed_lengths))
bed_lengths

10


['Standard Bed', '', '', '', '', 'Short Bed', '', 'Short Bed', '', 'Short Bed']

### VEHICLES MILEAGE PER GALLONS SCRAPING FROM WEB PAGES LISTINGS

In [30]:
MPGs = feature_scraper_from_url('MPG')
print(len(MPGs))
MPGs

10


['17 cty / 22 hwy',
 '21 cty / 30 hwy',
 '20 cty / 28 hwy',
 '23 cty / 30 hwy',
 '22 cty / 28 hwy',
 '17 cty / 23 hwy',
 '26 cty / 32 hwy',
 '20 cty / 25 hwy',
 '24 cty / 33 hwy',
 '17 cty / 23 hwy']

### VEHICLES DRIVE TYPES SCRAPING FROM WEB PAGES LISTINGS

In [31]:
drive_types = feature_scraper_from_url('Drive Type')
print(len(drive_types))
drive_types

10


['4WD', 'FWD', 'AWD', 'FWD', '4WD', '4WD', 'FWD', 'RWD', 'FWD', '4WD']

### VEHICLES FUEL TYPES SCRAPING FROM WEB PAGES LISTINGS

In [35]:
fuel_types = feature_scraper_from_url('Fuel Type')
print(len(fuel_types))
fuel_types

10


['Gas', 'Gas', 'Gas', 'Gas', 'Gas', 'Gas', 'Gas', 'Gas', 'Gas', 'Gas']

### VEHICLES TRANSMISSIONS SCRAPING FROM WEB PAGES LISTINGS

In [32]:
transmissions = feature_scraper_from_url('Transmission')
print(len(transmissions))
transmissions

10


['Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic',
 'Automatic']

### VEHICLES MILEAGES SCRAPING FROM WEB PAGES LISTINGS

In [34]:
mileages = feature_scraper_from_url('Mileage')
print(len(mileages))
mileages

10


['27,917',
 '27,295',
 '43,100',
 '152,738',
 '143,192',
 '46,020',
 '135,297',
 '15,681',
 '2,816',
 '47,692']

### VEHICLES ENGINES SCRAPING FROM WEB PAGES LISTINGS

In [58]:
re.findall('<li>.+', str(b))

['<li>1.6L Inline-4 Gas Turbocharged>']

In [8]:
def engines_scraper(url):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml')
    nth_finding = re.search('Engine</h4><ul><li>.+</li>', str(nth_soup))
    return re.findall('<li>.+', str(nth_finding))

In [26]:
engines_unf = list(map(engines_scraper, urls))
engines_unf

[['<li>1.8L Inline-4 Gas</li></ul></d>'],
 ['<li>2.4L Inline-4 Gas</li></ul></d>'],
 ['<li>1.8L Inline-4 Gas</li></ul></d>'],
 ['<li>2.5L Inline-4 Gas</li></ul></d>'],
 ['<li>1.8L Inline-4 Hybrid</li></ul>>'],
 ['<li>3.6L V-6 Gas</li></ul></div></>'],
 ['<li>1.6L Inline-4 Gas</li></ul></d>'],
 ['<li>1.8L Inline-4 Gas</li></ul></d>'],
 ['<li>1.8L Inline-4 Gas</li></ul></d>'],
 ['<li>4.0L V-6 Gas</li></ul></div></>'],
 ['<li>2.0L Inline-4 Gas</li></ul></d>'],
 ['<li>1.6L Inline-4 Gas</li></ul></d>'],
 ['<li>2.0L Inline-4 Gas</li></ul></d>'],
 ['<li>2.5L Inline-4 Gas</li></ul></d>'],
 ['<li>2.5L Inline-4 Gas</li></ul></d>'],
 ['<li>2.0L Inline-4 Gas Turbocharged>'],
 ['<li>2.0L Inline-4 Gas</li></ul></d>'],
 ['<li>2.0L Inline-4 Hybrid</li></ul>>'],
 ['<li>2.5L Inline-4 Gas</li></ul></d>'],
 ['<li>2.5L Inline-4 Gas</li></ul></d>'],
 ['<li>2.0L Inline-4 Hybrid</li></ul>>'],
 ['<li>2.5L Inline-4 Gas</li></ul></d>'],
 ['<li>3.6L V-6 Gas</li></ul></div></>'],
 ['<li>2.0L Inline-4 Hybrid</li></

In [66]:
def engine_from_url(url):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml')
    nth_finding = re.findall('Engine</h4><ul><li>.+</li>', str(nth_soup))
    return re.findall('[^><]+', str(nth_finding))[4]

In [67]:
engines = list(map(engine_from_url, urls))
print(len(engines))
engines

273


['2.5L Inline-4 Gas',
 '2.5L Inline-4 Gas',
 '4.0L V-6 Gas',
 '1.8L Inline-4 Gas',
 '1.5L Inline-4 Gas Turbocharged',
 '1.8L Inline-4 Gas',
 '2.0L Inline-4 Gas Turbocharged',
 '1.8L Inline-4 Gas',
 '1.5L Inline-4 Gas Turbocharged',
 '2.5L Inline-4 Gas',
 '2.0L Inline-4 Gas Turbocharged',
 '2.0L Inline-4 Gas Turbocharged',
 '3.5L V-6 Gas',
 '3.6L V-6 Gas',
 '3.0L Inline-6 Gas Turbocharged',
 '1.8L Inline-4 Gas',
 '3.8L V-6 Gas',
 '2.5L Inline-4 Gas',
 '3.5L V-6 Gas Turbocharged',
 '1.8L Inline-4 Gas',
 '2.5L Inline-4 Gas',
 '1.8L Inline-4 Gas',
 '1.6L Inline-4 Gas',
 '1.8L Inline-4 Gas',
 '2.0L Inline-4 Gas',
 '5.3L V-8 Gas',
 '1.8L Inline-4 Gas',
 '2.5L Inline-4 Gas',
 '1.8L Inline-4 Gas',
 '5.0L V-8 Gas',
 '4.0L V-6 Gas',
 '1.8L Inline-4 Gas',
 '1.8L Inline-4 Gas',
 '3.5L V-6 Gas Turbocharged',
 'L - Electric',
 '2.5L Inline-4 Gas',
 '2.0L Inline-4 Gas Turbocharged',
 '3.7L V-6 Gas',
 '2.0L Inline-4 Gas Turbocharged',
 '1.8L Inline-4 Hybrid',
 '5.6L V-8 Gas',
 '2.4L Inline-4 Gas',
 '1

In [21]:
# VEHICLE MAKES SCRAPING FROM WEB PAGES LISTINGS

In [22]:
def MakeModel_from_url(url, index):
    nth_request = requests.get(url).content
    nth_soup = BeautifulSoup(nth_request, 'lxml')
    nth_finding = nth_soup.find_all('div', {'class': 'text-truncate heading-3 margin-right-2 margin-right-sm-3'})
    nth_regex_finding = re.findall('[0-9]{4} .+<', str(nth_finding))
    make_model = re.findall('[A-z]+ [A-z]+', str(nth_regex_finding))[0]
    return make_model.split()[index]

In [None]:
makes = list(map(MakeModel_from_url, urls, itertools.repeat(0, len(urls))))
print(len(makes))
makes

In [23]:
# VEHICLE MODELS SCRAPING FROM WEB PAGES LISTINGS

In [None]:
models = list(map(MakeModel_from_url, urls, itertools.repeat(1, len(urls))))
print(len(models))
models