# **Imports and defining variables**

In [56]:
import pandas as pd
import re
import requests
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [90]:
# User-Agent header value sent with every `requests` call in this notebook.
navigator = 'Microsoft Edge (Scrapping; This is my system)'
# All-brands listing page of auto-data.net (the same address the Selenium cell opens).
url_allbrands_homepage = 'https://www.auto-data.net/en/allbrands'

In [91]:
# Download the all-brands listing page.
# The address comes from `url_allbrands_homepage` (defined in the configuration
# cell) so that this cell works on a fresh kernel; the bare name `url` is only
# assigned much further down and holds the site root, not the listing page.
# The timeout (seconds) keeps a stalled connection from hanging the notebook.
html = requests.get(url_allbrands_homepage, headers={'User-Agent': navigator}, timeout=30)
html

<Response [200]>

# **Scraping**

## Creating the soup and cleaning the information to get only text

In [76]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html.text, features='html.parser')

In [60]:
# Every brand on the listing page is an <a> tag carrying the CSS class "marki_blok".
brands = soup.find_all('a', class_='marki_blok')
brands[:5]

[<a class="marki_blok" href="/en/abarth-brand-200" title="Abarth - Technical Specs, Fuel consumption, Dimensions"><img alt="Abarth - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Abarth.png"/><strong>Abarth</strong></a>,
 <a class="marki_blok" href="/en/ac-brand-1" title="AC - Technical Specs, Fuel consumption, Dimensions"><img alt="AC - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/AC.png"/><strong>AC</strong></a>,
 <a class="marki_blok" href="/en/acura-brand-6" title="Acura - Technical Specs, Fuel consumption, Dimensions"><img alt="Acura - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Acura.png"/><strong>Acura</strong></a>,
 <a class="marki_blok" href="/en/aiways-brand-301" title="Aiways - Technical Specs, Fuel consumption, Dimensions"><img alt="Aiways - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Aiways.png"/><strong>Aiways</strong></a>,
 <a class="marki_blok" href="/en/aixam-brand-255" title="Aixam - Technical Sp

In [61]:
# Collect the site-relative link of every brand (the value of each tag's href).
# The attribute is read straight from the parsed tags instead of running a regex
# over the serialised HTML, which depended on attribute order and quoting.
brand_list = [link['href'] for link in brands if link.has_attr('href')]
brand_list[0:5]

['/en/abarth-brand-200',
 '/en/ac-brand-1',
 '/en/acura-brand-6',
 '/en/aiways-brand-301',
 '/en/aixam-brand-255']

In [62]:
# Strip the leading "/en/" language segment so that only the brand slug remains
# (e.g. '/en/abarth-brand-200' -> 'abarth-brand-200').
# The pattern is a raw string: '\S' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
brand_list_final = [
    slug_match.group(1)
    for slug_match in (re.search(r'en/(\S+)', href) for href in brand_list)
    if slug_match
]
brand_list_final[0:5]

['abarth-brand-200',
 'ac-brand-1',
 'acura-brand-6',
 'aiways-brand-301',
 'aixam-brand-255']

## Creating the loops for the actual scraping

In [149]:
# Build the absolute URL of every brand page from its site-relative path.
# `url` (the site root) never changes, so it is assigned once instead of on
# every loop iteration; this also keeps it defined when `brand_list` is empty.
url = 'https://www.auto-data.net'
urls_list = [url + brand_path for brand_path in brand_list]

urls_list[0:5]

['https://www.auto-data.net/en/abarth-brand-200',
 'https://www.auto-data.net/en/ac-brand-1',
 'https://www.auto-data.net/en/acura-brand-6',
 'https://www.auto-data.net/en/aiways-brand-301',
 'https://www.auto-data.net/en/aixam-brand-255']

In [93]:
# Show the value `url` holds after the URL-building cell above (the site root).
url

'https://www.auto-data.net'

# **Selenium attempt**

In [None]:
!pip install webdriver-manager

In [133]:
from getpass import getpass
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

In [137]:
# Start a Chrome session and open the all-brands listing page.
# Selenium 4 expects the chromedriver location wrapped in a Service object;
# passing the path as the first positional argument is deprecated and is
# rejected by current releases.
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Same address as the requests-based cells, taken from the configuration cell.
driver.get(url_allbrands_homepage)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [C:\Users\Utilizador\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


In [140]:
# Parse the browser-rendered HTML, then close the browser: once the page source
# has been captured the Chrome process would otherwise stay open.
# NOTE(review): assumes no later cell reuses `driver` — confirm before merging.
soup_selenium = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()


In [163]:
# Inside each brand page, collect the slug of every model it links to
# (the part of the link that follows "en/").
models_list = []
# One session for the whole crawl reuses the connection between brand pages.
with requests.Session() as session:
    session.headers.update({'User-Agent': navigator})
    for page2 in urls_list:
        # The timeout (seconds) stops one stalled page from hanging the crawl.
        html2 = session.get(page2, timeout=30)
        soup2 = BeautifulSoup(html2.text, 'html.parser')
        models = soup2.find_all('a', {'class': 'modeli'})
        for model in models:
            # Match against the href value only: running the pattern over the
            # serialised tag also swallows the closing quote and can hit other
            # "en/" fragments inside the tag.
            slug_match = re.search(r'en/(\S+)', model.get('href', ''))
            if slug_match:
                models_list.append(slug_match.group(1))

# Preview only; the full list is long.
models_list[0:5]

In [66]:
# Example page for the generation/series level of the crawl.
# NOTE(review): this URL uses the /fr/ locale while every other URL in the
# notebook uses /en/ — confirm this is intended.
page3 = 'https://www.auto-data.net/fr/acura-cl-ii-generation-7196'    #Inside each model, find the correct generation/series

In [67]:
# Final page of the crawl (a single engine variant): the desired information
# is retrieved from here.
page4 = 'https://www.auto-data.net/en/alfa-romeo-mito-1.6-jtdm-120hp-16680'

html4 = requests.get(url=page4, headers={'User-Agent': navigator})
soup4 = BeautifulSoup(markup=html4.text, features='html.parser')

# Tables whose class attribute is exactly "cardetailsout car2".
specs = soup4.find_all('table', class_='cardetailsout car2')
specs[:5]

[<table class="cardetailsout car2"><caption><h2 class="car">Alfa Romeo MiTo 1.6 JTDm (120 Hp) 2009, 2010, 2011, 2012, 2013 Specs</h2></caption><tr class="no"><th class="no" colspan="2"><strong class="car" id="_general">General information</strong></th></tr>
 <tr><th>Brand</th><td><a href="/en/alfa-romeo-brand-11">Alfa Romeo</a></td></tr>
 <tr><th>Model </th><td><a href="/en/alfa-romeo-mito-model-178">MiTo</a></td></tr>
 <tr><th>Generation  </th><td><a href="/en/alfa-romeo-mito-generation-363">MiTo</a></td></tr>
 <tr><th>Modification (Engine) </th><td>1.6 JTDm (120 Hp) </td></tr>
 <tr><th>Start of production </th><td>2009 year </td></tr>
 <tr><th>End of production </th><td>2013 year </td></tr>
 <tr><th>Powertrain Architecture </th><td>Internal Combustion engine </td></tr>
 <tr><th>Body type</th><td>Hatchback </td></tr>
 <tr><th>Seats </th><td>5 </td></tr>
 <tr><th>Doors </th><td>2/3 </td></tr>
 <tr class="no"><th class="no" colspan="2"><strong class="car" id="_performance">Performance s

In [68]:
!pip install html5lib



In [55]:
# Build a one-row summary of selected specs for the variant at `page4`.
# pandas downloads and parses the page itself; index 1 selects the table whose
# label column is headed 'General information'.
what = pd.read_html(page4)[1]

# Spec rows that feed the summary, matched on the table's label column.
wanted_specs = [
    'CO2 emissions',
    'Body type',
    'Fuel tank capacity',
    'Fuel consumption (economy) - combined',
    'Fuel Type',
    'Power',
]
# Index the kept rows by their label so that the transpose below yields one
# column per spec.
what2 = what[what['General information'].isin(wanted_specs)].set_index('General information', drop=False)

# After the transpose the label row only repeats the column names, so drop it.
what3 = what2.T.drop(index='General information')

# Keep only the leading number of each "<number> <unit>" value.
what3['Fuel tank(l)'] = what3['Fuel tank capacity'].str.split(' ').str[0]
what3['Fuel consumption(l/100km)'] = what3['Fuel consumption (economy) - combined'].str.split(' ').str[0]
what3['Power(Hp)'] = what3['Power'].str.split(' ').str[0]

# Range in km = tank volume / consumption per 100 km * 100.
# Columns are addressed by name: positional item[0] / item[1] on a labelled
# Series is deprecated in pandas 2.x. Both operands are read as floats so a
# fractional tank capacity does not raise.
what3['Range (Km)'] = (
    what3['Fuel tank(l)'].astype(float)
    / what3['Fuel consumption(l/100km)'].astype(float)
    * 100
)

# Final table: the raw text columns that were split above are no longer needed.
# A new frame is returned instead of dropping in place, so `what3` is not
# silently modified through an alias.
what4 = what3.drop(columns=['Fuel consumption (economy) - combined', 'Fuel tank capacity'])
what4

General information,Body type,CO2 emissions,Fuel Type,Power,Fuel tank(l),Fuel consumption(l/100km),Power(Hp),Range (Km)
General information.1,Hatchback,122 g/km,Diesel,120 Hp @ 3750 rpm.,45,4.8,120,937.5
