In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import re
import requests
import math
from bs4 import BeautifulSoup
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
import seaborn as sb
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

In [6]:
# Fetch the first page of used-car listings and collect its price headings.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces an HTTP error instead of silently parsing
# an error page into an empty result.
page1_response = requests.get('https://www.truecar.com/used-cars-for-sale/listings/', timeout=30)
page1_response.raise_for_status()
page1_listings_soup = BeautifulSoup(page1_response.content, 'lxml')
# Price headings are the <h4> tags carrying data-test="vehicleCardPricingBlockPrice".
page1_prices_soup = page1_listings_soup.find_all('h4', {'data-test': 'vehicleCardPricingBlockPrice'})
page1_prices_soup

[<h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$10,450</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$13,273</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$39,988</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$26,500</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$7,995</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$26,419</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$30,475</h4>,
 <h4 class="heading-3 margin-y-1 font-weight-bold" data-qa="Heading" data-test="vehicleCardPricingBlockPrice">$7,995</h4>,
 <h4 class

In [7]:
# Pull every comma-grouped number (e.g. '10,450') out of the stringified price tags.
# NOTE(review): the pattern requires a comma, so a price rendered without one
# would not be captured -- confirm that is acceptable for this data.
price_pattern = re.compile('[0-9]+,[0-9]+')
pricespage1 = price_pattern.findall(str(page1_prices_soup))
pricespage1

['10,450',
 '13,273',
 '39,988',
 '26,500',
 '7,995',
 '26,419',
 '30,475',
 '7,995',
 '9,991',
 '9,888',
 '28,300',
 '8,799',
 '7,991',
 '29,940',
 '31,945',
 '12,924',
 '30,680',
 '32,977',
 '30,998',
 '32,011',
 '21,997',
 '13,000',
 '27,550',
 '26,988',
 '24,995',
 '29,999',
 '9,991',
 '15,995',
 '27,950',
 '9,000',
 '8,000',
 '25,488',
 '27,888']

In [9]:
# Swap the ',' in each page-1 price string for '.' (e.g. '10,450' -> '10.450').
# NOTE(review): the comma in the scraped text looks like a thousands separator,
# so float('10.450') would give 10.45 rather than 10450 -- confirm how later
# cells interpret these strings.
page1_prices = [str(raw_price).replace(',', '.') for raw_price in pricespage1]
page1_prices

['10.450',
 '13.273',
 '39.988',
 '26.500',
 '7.995',
 '26.419',
 '30.475',
 '7.995',
 '9.991',
 '9.888',
 '28.300',
 '8.799',
 '7.991',
 '29.940',
 '31.945',
 '12.924',
 '30.680',
 '32.977',
 '30.998',
 '32.011',
 '21.997',
 '13.000',
 '27.550',
 '26.988',
 '24.995',
 '29.999',
 '9.991',
 '15.995',
 '27.950',
 '9.000',
 '8.000',
 '25.488',
 '27.888']

In [10]:
# Scrape listing prices from the remaining result pages (2 through 100).

listings_page_url = 'https://www.truecar.com/used-cars-for-sale/listings/?page='
other_pages_prices = []
# A single Session reuses the underlying connection across the 99 requests
# instead of opening a fresh one for every page.
with requests.Session() as session:
    for number in range(2, 101):
        # The timeout stops one stalled page from hanging the whole loop;
        # raise_for_status() fails loudly rather than silently contributing
        # zero prices for a page that returned an HTTP error.
        page_response = session.get(listings_page_url + str(number), timeout=30)
        page_response.raise_for_status()
        page_listings_soup = BeautifulSoup(page_response.content, 'lxml')
        page_prices_soup = page_listings_soup.find_all('h4', {'data-test': 'vehicleCardPricingBlockPrice'})
        # Same comma-grouped pattern as used for page 1 (e.g. '9,900').
        page_prices = re.findall('[0-9]+,[0-9]+', str(page_prices_soup))
        other_pages_prices.extend(page_prices)
other_pages_prices

['9,900',
 '10,000',
 '14,933',
 '10,500',
 '5,500',
 '6,290',
 '8,495',
 '10,200',
 '7,990',
 '19,000',
 '3,500',
 '7,999',
 '31,674',
 '14,419',
 '6,752',
 '5,995',
 '6,991',
 '5,767',
 '8,999',
 '30,990',
 '33,207',
 '16,791',
 '22,995',
 '5,499',
 '11,491',
 '23,590',
 '22,577',
 '9,985',
 '18,499',
 '10,999',
 '11,998',
 '34,980',
 '9,250',
 '8,379',
 '26,995',
 '10,995',
 '11,994',
 '8,995',
 '8,900',
 '8,990',
 '5,999',
 '4,288',
 '10,889',
 '16,000',
 '28,500',
 '26,995',
 '1,500',
 '10,991',
 '8,980',
 '18,537',
 '4,795',
 '28,583',
 '6,995',
 '8,995',
 '15,246',
 '3,999',
 '13,495',
 '13,000',
 '5,300',
 '2,499',
 '12,491',
 '6,400',
 '6,991',
 '13,390',
 '11,292',
 '4,382',
 '5,987',
 '26,795',
 '13,500',
 '5,999',
 '11,990',
 '2,699',
 '4,300',
 '20,000',
 '28,750',
 '19,489',
 '4,995',
 '3,495',
 '4,950',
 '11,979',
 '14,990',
 '11,990',
 '16,950',
 '3,599',
 '12,995',
 '4,599',
 '8,995',
 '25,999',
 '10,387',
 '18,387',
 '19,776',
 '2,850',
 '19,888',
 '12,794',
 '6,895',

In [11]:
# Swap the ',' in each price string from pages 2-100 for '.' (e.g. '9,900' -> '9.900').
# NOTE(review): the comma in the scraped text looks like a thousands separator,
# so float('9.900') would give 9.9 rather than 9900 -- confirm how later
# cells interpret these strings.
rest_pages_prices = [raw_price.replace(',', '.') for raw_price in other_pages_prices]
rest_pages_prices

['9.900',
 '10.000',
 '14.933',
 '10.500',
 '5.500',
 '6.290',
 '8.495',
 '10.200',
 '7.990',
 '19.000',
 '3.500',
 '7.999',
 '31.674',
 '14.419',
 '6.752',
 '5.995',
 '6.991',
 '5.767',
 '8.999',
 '30.990',
 '33.207',
 '16.791',
 '22.995',
 '5.499',
 '11.491',
 '23.590',
 '22.577',
 '9.985',
 '18.499',
 '10.999',
 '11.998',
 '34.980',
 '9.250',
 '8.379',
 '26.995',
 '10.995',
 '11.994',
 '8.995',
 '8.900',
 '8.990',
 '5.999',
 '4.288',
 '10.889',
 '16.000',
 '28.500',
 '26.995',
 '1.500',
 '10.991',
 '8.980',
 '18.537',
 '4.795',
 '28.583',
 '6.995',
 '8.995',
 '15.246',
 '3.999',
 '13.495',
 '13.000',
 '5.300',
 '2.499',
 '12.491',
 '6.400',
 '6.991',
 '13.390',
 '11.292',
 '4.382',
 '5.987',
 '26.795',
 '13.500',
 '5.999',
 '11.990',
 '2.699',
 '4.300',
 '20.000',
 '28.750',
 '19.489',
 '4.995',
 '3.495',
 '4.950',
 '11.979',
 '14.990',
 '11.990',
 '16.950',
 '3.599',
 '12.995',
 '4.599',
 '8.995',
 '25.999',
 '10.387',
 '18.387',
 '19.776',
 '2.850',
 '19.888',
 '12.794',
 '6.895',

In [12]:
# Combine the page-1 prices with those from the remaining pages, page 1 first,
# and show how many prices were collected in total.
prices = [*page1_prices, *rest_pages_prices]
len(prices)

3003