In [4]:
import itertools
import pandas as pd
import numpy as np
import scipy.stats
import re
import requests
import math
from bs4 import BeautifulSoup
from selenium import webdriver
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.backends.backend_agg
import matplotlib.figure
import seaborn as sb
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import roc_curve, roc_auc_score

### VEHICLES PRICES SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Prototype on the first listings page: download it, parse it with lxml, and
# pull the price strings out of the <h4> pricing elements.
fst_page_url = 'https://www.truecar.com/used-cars-for-sale/listings/'
url_requested = requests.get(fst_page_url)
fst_page_soup = BeautifulSoup(url_requested.content, 'lxml')
fst_page_prices = fst_page_soup.find_all('h4', {'data-test': 'vehicleCardPricingBlockPrice'})
# The pattern requires a comma between two digit runs, so only numbers written
# like "12,345" are matched; a number without a comma is not picked up.
re.findall('[0-9]+,[0-9]+', str(fst_page_prices))

### HTML soups from all 300 web pages

In [2]:
def get_soups(website_number):
    """Download one used-car listings page and parse it.

    Parameters
    ----------
    website_number : int or str
        Page number, appended to the listings URL as ``?page=<n>``.

    Returns
    -------
    BeautifulSoup
        The page's HTML parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.Timeout
        If the server stops responding for 30 seconds.
    """
    # Without a timeout, one stalled connection blocks the whole download
    # loop until the kernel is interrupted by hand.
    get_url = requests.get(
        'https://www.truecar.com/used-cars-for-sale/listings/?page=' + str(website_number),
        timeout=30,
    )
    return BeautifulSoup(get_url.content, 'lxml')

In [3]:
# Download and parse listing pages 1 through 300, in page order.
soups = [get_soups(page_number) for page_number in range(1, 301)]

KeyboardInterrupt: 

In [None]:
def pricesscraper(soup):
    """Return the price strings found on one parsed listings page.

    The <h4> pricing elements of ``soup`` are rendered to text and every
    "digits,digits" run in that text is returned, in document order.
    """
    price_tags = soup.find_all('h4', {'data-test': 'vehicleCardPricingBlockPrice'})
    price_pattern = re.compile('[0-9]+,[0-9]+')
    return price_pattern.findall(str(price_tags))

In [None]:
# Scrape every downloaded page, then flatten the per-page lists into one flat
# list of price strings. Flattening with itertools keeps each match exactly as
# the regex returned it; going through str(list) and split(', ') instead adds
# an empty-string entry for every page that produced no matches.
prices = list(itertools.chain.from_iterable(map(pricesscraper, soups)))
print(len(prices))
prices

### VEHICLES YEARS SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Prototype on the first page: year text is looked up in the
# <span class="vehicle-card-year"> elements.
fst_page_years_soup = fst_page_soup.find_all('span', {'class': 'vehicle-card-year'})
# Matches a 1 or 2 followed by three more digits.
re.findall('[12][0-9]{3}', str(fst_page_years_soup))

In [None]:
def scraper(tag, element, element_description, regex, pages=None):
    """Scrape one feature from every parsed page and return a flat list.

    Parameters
    ----------
    tag : str
        HTML tag name to look up (e.g. ``'span'``).
    element : str
        Attribute name used to filter the tags (e.g. ``'class'``).
    element_description : str
        Required value of that attribute.
    regex : str
        Pattern applied with ``re.findall`` to the text of the matching tags.
    pages : iterable of soups, optional
        Parsed pages to scan. Defaults to the notebook-level ``soups`` list.

    Returns
    -------
    list
        Every regex match from every page, in page order then document order.
        A page with no matches contributes nothing.
    """
    if pages is None:
        pages = soups

    def features_scraper(soup):
        matching_tags = soup.find_all(tag, {element: element_description})
        return re.findall(regex, str(matching_tags))

    # Flatten the per-page lists directly. Flattening through str(list) and
    # split(', ') would add a '' entry for every empty page and would mangle
    # any match that itself contains '[', ']' or ', '.
    return list(itertools.chain.from_iterable(map(features_scraper, pages)))

In [None]:
# Apply the year pattern to every downloaded page via scraper().
years = scraper('span', 'class', 'vehicle-card-year', '[12][0-9]{3}')
print(len(years))
years

### VEHICLES LOCATIONS STATES SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Both lookups below read the <div data-test="vehicleCardLocation"> elements.
fst_page_states_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardLocation'})
# This call is not the last expression of the cell, so its result is not shown.
# NOTE(review): the first-page check uses '[A-W][A-Y]' while the full scrape
# below uses '[A-Z]{2}'; the two patterns accept different letter pairs —
# confirm which one is intended.
re.findall('[A-W][A-Y]', str(fst_page_states_soup))
states = scraper('div', 'data-test', 'vehicleCardLocation', '[A-Z]{2}')
print(len(states))
states

### VEHICLES LOCATIONS CITIES SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Prototype on the first page, reading the same location <div> elements.
fst_page_cities_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardLocation'})
# The pattern matches a capitalised word optionally followed by up to two more
# words, each separated by dots and/or spaces (e.g. names shaped like "St. Xyz").
re.findall('[A-Z][a-z]+[. ]*[A-Z]*[a-z]*[. ]*[A-Z]*[a-z]*', str(fst_page_cities_soup))

In [None]:
cities_unf = scraper('div', 'data-test', 'vehicleCardLocation', '[A-Z][a-z]+[. ]*[A-Z]*[a-z]*[. ]*[A-Z]*[a-z]*')
# Keep the matches at positions 3, 7, 11, ... (every fourth match, starting at
# index 3). Slicing selects by position; looking positions up with
# list.index() would return the FIRST occurrence of each string, so repeated
# city names would be kept or dropped based on the wrong position.
cities = cities_unf[3::4]
print(len(cities))
cities

### VEHICLES EXTERIOR COLORS SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Prototype on the first page: colour text is looked up in the
# <div data-test="vehicleCardColors"> elements.
fst_page_colors_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardColors'})
# Matches '->' followed by one capitalised word.
# NOTE(review): the full exterior scrape uses the prefix 'g>' rather than '->'
# — confirm this prototype is checking the intended pattern.
re.findall('->[A-Z][a-z]+', str(fst_page_colors_soup))

In [None]:
# Collect 'g>' followed by one capitalised word from every page; the 'g>'
# prefix is still part of each match at this point.
exterior_colors_unf = scraper('div', 'data-test', 'vehicleCardColors', 'g>[A-Z][a-z]+')
len(exterior_colors_unf)

In [None]:
# Drop the first two characters of every raw match, leaving the colour word.
exterior_colors = [color[2:] for color in exterior_colors_unf]
print(len(exterior_colors))
exterior_colors

### VEHICLES INTERIOR COLORS SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Prototype on the first page, reusing fst_page_colors_soup: matches '->'
# followed by one capitalised word.
re.findall('->[A-Z][a-z]+', str(fst_page_colors_soup))

In [None]:
# Raw matches still carry the two-character '->' prefix.
interior_colors_unf = scraper('div', 'data-test', 'vehicleCardColors', '->[A-Z][a-z]+')
# Strip that prefix so only the colour word remains.
interior_colors = [color[2:] for color in interior_colors_unf]
print(len(interior_colors))
interior_colors

### VEHICLES CONDITION (NUMBER OF ACCIDENTS) SCRAPING FROM WEB PAGE LISTINGS

In [None]:
# Prototype on the first page: condition text is looked up in the
# <div data-test="vehicleCardCondition"> elements.
fst_page_accidents_soup = fst_page_soup.find_all('div', {'data-test': 'vehicleCardCondition'})
# [A-Za-z] rather than [A-z]: the A-z range also admits the six punctuation
# characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
re.findall('[0-9]*[A-Za-z]* accident[s]*', str(fst_page_accidents_soup))

In [None]:
# Collect the "<count or word> accident(s)" text from every page.
# [A-Za-z] rather than [A-z]: the A-z range also admits the six punctuation
# characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
accidents = scraper('div', 'data-test', 'vehicleCardCondition', '[0-9]*[A-Za-z]* accident[s]*')
print(len(accidents))
accidents

#### URLs Scraping

In [None]:
# Look the listing anchors up once instead of re-scanning the page on every
# iteration, and collect the matches in a plain list so the array is built in
# a single step rather than re-allocated by np.append on each pass.
fst_page_listings = soups[0].find_all('a', {'data-test': 'usedListing'})
fst_page_hrefs = []

# The first 33 anchors are read; indexing (rather than slicing) keeps the
# IndexError if the page holds fewer than that.
for ind in range(33):
    finding = fst_page_listings[ind]
    # Only the first 280 characters of the anchor markup are searched.
    fst_page_hrefs.extend(re.findall('href=".+" style', str(finding)[:280]))

fst_page_urls = np.array(fst_page_hrefs)

In [None]:
def urls_scraper(soup, listings_per_page=30):
    """Return the raw href matches for the listings on one parsed page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listings page.
    listings_per_page : int, optional
        Number of listing anchors to read from the page (default 30).

    Returns
    -------
    list of str
        One string per listing, shaped like ``href="/<path>" style``.

    Raises
    ------
    IndexError
        If the page has fewer than ``listings_per_page`` listing anchors, or
        an anchor's first 280 characters contain no matching href.
    """
    # The anchor lookup does not depend on the loop variable, so do it once
    # instead of re-scanning the whole page for every listing.
    listings = soup.find_all('a', {'data-test': 'usedListing'})
    nth_urls = []
    for nth in range(listings_per_page):
        anchor_markup = str(listings[nth])[:280]
        nth_urls.append(re.findall('href="/.+" style', anchor_markup)[0])
    return nth_urls

# One list of raw href matches per page, for every page after the first.
rest_urls_list = [urls_scraper(page_soup) for page_soup in soups[1:]]

In [None]:
# Flatten the per-page lists into one 1-D array and put the first page's
# matches in front of them.
rest_pages_urls = np.array(rest_urls_list).flatten()
all_urls = np.append(fst_page_urls, rest_pages_urls)
# Each raw match is shaped like 'href="<path>" style'. url[6:-7] drops the
# leading 'href="' (6 characters) and the trailing '" style' (7 characters),
# leaving the path that is appended to the site root.
url_formatter = np.vectorize(lambda url: 'https://truecar.com' + url[6: -7])
urls = url_formatter(all_urls)
print(len(urls))
urls

### VEHICLE STYLES SCRAPING FROM WEB PAGES LISTINGS

In [None]:
def feature_scraper_from_url(feature_as_argument):
    """Read one named feature from every page listed in ``urls``.

    For each URL the page is downloaded and searched for
    ``<feature></h4><ul><li>VALUE</li``; ``VALUE`` is the text of the first
    list item that follows the feature heading.

    Parameters
    ----------
    feature_as_argument : str
        Heading text that precedes the value (e.g. ``'Style'``).

    Returns
    -------
    list of str
        One entry per URL, in ``urls`` order; ``''`` where the page has no
        such feature.

    Raises
    ------
    requests.exceptions.Timeout
        If a server stops responding for 30 seconds.
    """
    # The heading is escaped so it is matched literally, and the value is
    # captured non-greedily so it stops at the first closing </li instead of
    # running on to the last one on the line.
    feature_pattern = re.compile(re.escape(feature_as_argument) + r'</h4><ul><li>(.+?)</li')

    def feature_from_url(url):
        nth_request = requests.get(url, timeout=30)
        nth_soup = BeautifulSoup(nth_request.content, 'lxml')
        nth_search = feature_pattern.search(str(nth_soup))
        # Read the captured text from the match itself: str() of a match
        # object is its repr, which shortens the matched text, so long values
        # would otherwise come back cut off or empty.
        return nth_search.group(1) if nth_search else ''

    return list(map(feature_from_url, urls))

In [None]:
# Read the 'Style' value from each page in `urls` (one HTTP request per URL).
styles = feature_scraper_from_url('Style')
print(len(styles))
styles

### VEHICLE OPTIONS LEVELS SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'Options Level' value from each page in `urls` (one HTTP request per URL).
options_level = feature_scraper_from_url('Options Level')
print(len(options_level))
options_level

### VEHICLE BED LENGTHS SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'Bed Length' value from each page in `urls` (one HTTP request per URL).
bed_lengths = feature_scraper_from_url('Bed Length')
print(len(bed_lengths))
bed_lengths

### VEHICLES MILES PER GALLON (MPG) SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'MPG' value from each page in `urls` (one HTTP request per URL).
MPGs = feature_scraper_from_url('MPG')
print(len(MPGs))
MPGs

### VEHICLES DRIVE TYPES SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'Drive Type' value from each page in `urls` (one HTTP request per URL).
drive_types = feature_scraper_from_url('Drive Type')
print(len(drive_types))
drive_types

### VEHICLES FUEL TYPES SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'Fuel Type' value from each page in `urls` (one HTTP request per URL).
fuel_types = feature_scraper_from_url('Fuel Type')
print(len(fuel_types))
fuel_types

### VEHICLES TRANSMISSIONS SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'Transmission' value from each page in `urls` (one HTTP request per URL).
transmissions = feature_scraper_from_url('Transmission')
print(len(transmissions))
transmissions

### VEHICLES MILEAGES SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Read the 'Mileage' value from each page in `urls` (one HTTP request per URL).
mileages = feature_scraper_from_url('Mileage')
print(len(mileages))
mileages

### VEHICLES ENGINES SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# NOTE(review): `b` is not defined in any cell visible here, so this scratch
# cell presumably relies on leftover kernel state and would raise NameError on
# a fresh kernel — confirm, then define `b` or remove the cell.
re.findall('<li>.+', str(b))

In [None]:
def engines_scraper(url):
    """Return the raw ``<li>...</li>`` engine item from one page.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    list of str
        A one-element list holding the first list item that follows the
        ``Engine`` heading, or ``[]`` when the page has no such section.

    Raises
    ------
    requests.exceptions.Timeout
        If the server stops responding for 30 seconds.
    """
    nth_request = requests.get(url, timeout=30).content
    nth_soup = BeautifulSoup(nth_request, 'lxml')
    # Non-greedy, so the match ends at the first </li> after the heading.
    nth_finding = re.search('Engine</h4><ul><li>.+?</li>', str(nth_soup))
    if nth_finding is None:
        return []
    # Search the matched text itself; str() of a match object is its repr,
    # which shortens the matched text.
    return re.findall('<li>.+', nth_finding.group(0))

In [None]:
# Raw engine list items, one result per page in `urls`.
engines_unf = [engines_scraper(page_url) for page_url in urls]

def engine_from_url(url):
    """Return the engine description from one page.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str
        Text of the first list item that follows the ``Engine`` heading, or
        ``''`` when the page has no such section.

    Raises
    ------
    requests.exceptions.Timeout
        If the server stops responding for 30 seconds.
    """
    nth_request = requests.get(url, timeout=30).content
    nth_soup = BeautifulSoup(nth_request, 'lxml')
    # Capture the value directly rather than tokenising the markup and
    # picking a fixed position: a page without an Engine section then yields
    # '' instead of an IndexError that aborts the whole scrape.
    nth_finding = re.search('Engine</h4><ul><li>(.+?)</li>', str(nth_soup))
    return nth_finding.group(1) if nth_finding else ''

In [None]:
# Engine description for each page in `urls`, in the same order.
engines = [engine_from_url(page_url) for page_url in urls]
print(len(engines))
engines

### VEHICLE MAKES SCRAPING FROM WEB PAGES LISTINGS

In [None]:
def make_model_from_url(url, index):
    """Return one word of the "<year> <make> <model> ..." heading of a page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    index : int
        Position of the wanted word after the year: 0 for the first word
        (make), 1 for the second (model).

    Returns
    -------
    str
        The requested whitespace-separated word, or ``''`` when the page has
        no heading with a four-digit year or the heading has too few words.
        Note that a make written as two words contributes only its first word
        at index 0.

    Raises
    ------
    requests.exceptions.Timeout
        If the server stops responding for 30 seconds.
    """
    nth_request = requests.get(url, timeout=30).content
    nth_soup = BeautifulSoup(nth_request, 'lxml')
    nth_finding = nth_soup.find_all('div', {'class': 'text-truncate heading-3 margin-right-2 margin-right-sm-3'})

    for heading in nth_finding:
        # Replace markup with spaces so only the visible text is searched and
        # digits inside attribute values cannot be mistaken for the year.
        heading_text = re.sub(r'<[^>]*>', ' ', str(heading))
        year_and_rest = re.search(r'[0-9]{4}\s+(\S.*)', heading_text)
        if year_and_rest is None:
            continue
        # Split on whitespace rather than on letter runs so hyphenated or
        # alphanumeric names stay whole ("Mercedes-Benz", "F-150").
        words = year_and_rest.group(1).split()
        try:
            return words[index]
        except IndexError:
            return ''
    return ''

In [None]:
# First word after the year (index 0) for each page in `urls`.
makes = [make_model_from_url(page_url, 0) for page_url in urls]
print(len(makes))
makes

### VEHICLE MODELS SCRAPING FROM WEB PAGES LISTINGS

In [None]:
# Second word after the year (index 1) for each page in `urls`.
models = [make_model_from_url(page_url, 1) for page_url in urls]
print(len(models))
models

## CREATING THE DATAFRAME

In [None]:
# Column name -> scraped list, used to build the vehicles DataFrame.
# NOTE(review): `options_level` and `accidents` are scraped in earlier cells
# but are not included here — confirm whether that is intentional.
columns = {
    'Make': makes, 'Model': models, 'Year': years, 'Price': prices, 'Engine': engines, 'Mileage': mileages,
    'Interior Color': interior_colors, 'Exterior Color': exterior_colors, 'Drive Type': drive_types, 
    'Fuel Type': fuel_types, 'Transmission': transmissions, 'MPG': MPGs, 'Style': styles, 
    'Bed Length': bed_lengths, 'Location (City)': cities, 'Location (State)': states
}

In [None]:
# Build the table from the column lists and preview its first rows.
# NOTE(review): presumably every list must have the same length for this to
# succeed — verify the lengths printed by the scraping cells agree.
vehicles = pd.DataFrame(columns)
vehicles.head()