# **Imports and defining variables**

In [1]:
import pandas as pd
import re
import requests
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [2]:
navigator = 'Microsoft Edge (Scrapping; This is my system)'
url_allbrands_homepage = 'https://www.auto-data.net/en/allbrands'
url = 'https://www.auto-data.net'

In [3]:
html = requests.get(url_allbrands_homepage, headers={'User-Agent': navigator})
html

<Response [200]>

# **Scraping**

## Creating the soup and cleaning the information to get only text

In [4]:
soup = BeautifulSoup(html.text, 'html.parser')

In [5]:
brands = soup.find_all('a', {'class' : 'marki_blok'})
brands[0:5]

[<a class="marki_blok" href="/en/abarth-brand-200" title="Abarth - Technical Specs, Fuel consumption, Dimensions"><img alt="Abarth - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Abarth.png"/><strong>Abarth</strong></a>,
 <a class="marki_blok" href="/en/ac-brand-1" title="AC - Technical Specs, Fuel consumption, Dimensions"><img alt="AC - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/AC.png"/><strong>AC</strong></a>,
 <a class="marki_blok" href="/en/acura-brand-6" title="Acura - Technical Specs, Fuel consumption, Dimensions"><img alt="Acura - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Acura.png"/><strong>Acura</strong></a>,
 <a class="marki_blok" href="/en/aiways-brand-301" title="Aiways - Technical Specs, Fuel consumption, Dimensions"><img alt="Aiways - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Aiways.png"/><strong>Aiways</strong></a>,
 <a class="marki_blok" href="/en/aixam-brand-255" title="Aixam - Technical Sp

In [6]:
#This loop is to get only the part to use in link ie whatever is after 'href='
brand_list = re.findall('href="(\S+)"',str(brands))
brand_list[0:5]

['/en/abarth-brand-200',
 '/en/ac-brand-1',
 '/en/acura-brand-6',
 '/en/aiways-brand-301',
 '/en/aixam-brand-255']

In [7]:
#Looping to remove the /en since the url already has it and removing it makes it so it will default to another language
brand_list_final = []
for i in range(len(brand_list)):
    brand_list_final += re.findall('en/(\S+)', brand_list[i])
brand_list_final[0:5]

['abarth-brand-200',
 'ac-brand-1',
 'acura-brand-6',
 'aiways-brand-301',
 'aixam-brand-255']

## Creating the loops for the actual scraping

In [8]:
#Looping to get the actual url for each brand
urls_list = []
for urls in brand_list:
    url = 'https://www.auto-data.net'
    urls_list += [url + urls]
    

urls_list[0:5]
# " ".join(urls_list)
# urls_list

['https://www.auto-data.net/en/abarth-brand-200',
 'https://www.auto-data.net/en/ac-brand-1',
 'https://www.auto-data.net/en/acura-brand-6',
 'https://www.auto-data.net/en/aiways-brand-301',
 'https://www.auto-data.net/en/aixam-brand-255']

In [9]:
url

'https://www.auto-data.net'

In [10]:
###Looping for the model inside each brand
models_list = []
models_list_final = []
for page2 in urls_list:                                               
     html2 = requests.get(page2, headers={'User-Agent': navigator})
     soup2 = BeautifulSoup(html2.text, 'html.parser')
     models = soup2.find_all('a', {'class' : 'modeli'}) 
     models_list += models
     for model in models_list:
        models_list_final += re.findall('en/(\S+)', str(model))

models_list_final                 

['abarth-124-spider-model-2152"',
 'abarth-500-model-2872"',
 'abarth-595-model-2149"',
 'abarth-695-model-2150"',
 'abarth-124-spider-model-2152"',
 'abarth-500-model-2872"',
 'abarth-595-model-2149"',
 'abarth-695-model-2150"',
 'ac-ace-model-1"',
 'ac-aceca-model-2"',
 'ac-cobra-model-3"',
 'abarth-124-spider-model-2152"',
 'abarth-500-model-2872"',
 'abarth-595-model-2149"',
 'abarth-695-model-2150"',
 'ac-ace-model-1"',
 'ac-aceca-model-2"',
 'ac-cobra-model-3"',
 'acura-cl-model-138"',
 'acura-csx-model-143"',
 'acura-el-model-148"',
 'acura-ilx-model-1994"',
 'acura-integra-model-2258"',
 'acura-mdx-model-144"',
 'acura-nsx-model-149"',
 'acura-rdx-model-140"',
 'acura-rl-model-145"',
 'acura-rlx-model-1982"',
 'acura-rsx-model-141"',
 'acura-slx-model-146"',
 'acura-tl-model-142"',
 'acura-tlx-model-2241"',
 'acura-tsx-model-147"',
 'acura-zdx-model-1995"',
 'abarth-124-spider-model-2152"',
 'abarth-500-model-2872"',
 'abarth-595-model-2149"',
 'abarth-695-model-2150"',
 'ac-ac

In [11]:
#Removing the quotes from the end
models_list_final2 = [x[:-1] for x in models_list_final]
models_list_final2

['abarth-124-spider-model-2152',
 'abarth-500-model-2872',
 'abarth-595-model-2149',
 'abarth-695-model-2150',
 'abarth-124-spider-model-2152',
 'abarth-500-model-2872',
 'abarth-595-model-2149',
 'abarth-695-model-2150',
 'ac-ace-model-1',
 'ac-aceca-model-2',
 'ac-cobra-model-3',
 'abarth-124-spider-model-2152',
 'abarth-500-model-2872',
 'abarth-595-model-2149',
 'abarth-695-model-2150',
 'ac-ace-model-1',
 'ac-aceca-model-2',
 'ac-cobra-model-3',
 'acura-cl-model-138',
 'acura-csx-model-143',
 'acura-el-model-148',
 'acura-ilx-model-1994',
 'acura-integra-model-2258',
 'acura-mdx-model-144',
 'acura-nsx-model-149',
 'acura-rdx-model-140',
 'acura-rl-model-145',
 'acura-rlx-model-1982',
 'acura-rsx-model-141',
 'acura-slx-model-146',
 'acura-tl-model-142',
 'acura-tlx-model-2241',
 'acura-tsx-model-147',
 'acura-zdx-model-1995',
 'abarth-124-spider-model-2152',
 'abarth-500-model-2872',
 'abarth-595-model-2149',
 'abarth-695-model-2150',
 'ac-ace-model-1',
 'ac-aceca-model-2',
 'ac-

In [12]:
urls_list2 = []
for urls2 in models_list_final2:
    url = 'https://www.auto-data.net/en'
    urls_list2 += [url + "/" +urls2]

urls_list2

['https://www.auto-data.net/en/abarth-124-spider-model-2152',
 'https://www.auto-data.net/en/abarth-500-model-2872',
 'https://www.auto-data.net/en/abarth-595-model-2149',
 'https://www.auto-data.net/en/abarth-695-model-2150',
 'https://www.auto-data.net/en/abarth-124-spider-model-2152',
 'https://www.auto-data.net/en/abarth-500-model-2872',
 'https://www.auto-data.net/en/abarth-595-model-2149',
 'https://www.auto-data.net/en/abarth-695-model-2150',
 'https://www.auto-data.net/en/ac-ace-model-1',
 'https://www.auto-data.net/en/ac-aceca-model-2',
 'https://www.auto-data.net/en/ac-cobra-model-3',
 'https://www.auto-data.net/en/abarth-124-spider-model-2152',
 'https://www.auto-data.net/en/abarth-500-model-2872',
 'https://www.auto-data.net/en/abarth-595-model-2149',
 'https://www.auto-data.net/en/abarth-695-model-2150',
 'https://www.auto-data.net/en/ac-ace-model-1',
 'https://www.auto-data.net/en/ac-aceca-model-2',
 'https://www.auto-data.net/en/ac-cobra-model-3',
 'https://www.auto-data

In [14]:
###Looping for the correct generation/series in each model
series_list = []
series_list_final = []
for page3 in urls_list2:
    html3 = requests.get(page3, headers={'User-Agent': navigator})
    soup3 = BeautifulSoup(html3.text, 'html.parser')
    series = soup3.find_all('a', {'class' : 'position'}) 
    for serie in series:
        series_list_final = re.findall('href="(\S+)"',str(serie))
series_list_final

KeyboardInterrupt: 

In [15]:
series_list_final

['/en/fiat-egea-356-generation-8497']

In [None]:
#This is the final page, retrieve the desired information from here
for page4 in urls_list3:
    html4 = requests.get(page4, headers={'User-Agent': navigator})
    soup4 = BeautifulSoup(html4.text, 'html.parser')
    specs = soup4.find_all('table', {'class' : 'cardetailsout car2'})
    specs[0:5]

In [None]:
#Need to install this library to work
!pip install html5lib

## Final dataframe

In [None]:
what = pd.read_html(page4)[1]  
what

what2 = what[(what['General information'] == 'CO2 emissions') 
|(what['General information'] == 'Body type')
|(what['General information'] == 'Fuel tank capacity')
|(what['General information'] == 'Fuel consumption (economy) - combined')
|(what['General information'] == 'Fuel Type')
|(what['General information'] == 'Power')]
what2.index = what2['General information']
what3 = what2.T
what3.drop(axis = 0, labels = 'General information', inplace = True)
what3['Fuel tank(l)'] = what3['Fuel tank capacity'].apply(lambda x : x.split(' ')[0])
what3['Fuel consumption(l/100km)'] = what3['Fuel consumption (economy) - combined'].apply(lambda x : x.split(' ')[0])
what3['Power(Hp)'] = what3['Power'].apply(lambda x: x.split(' ')[0])
what3['Range (Km)'] = what3[['Fuel tank(l)','Fuel consumption(l/100km)']].apply(lambda item: int(item[0])/float(item[1])*100,axis=1)

what4 = what3
what4.drop(columns = ['Fuel consumption (economy) - combined', 'Fuel tank capacity'], inplace = True)
what4

General information,Body type,CO2 emissions,Fuel Type,Power,Fuel tank(l),Fuel consumption(l/100km),Power(Hp),Range (Km)
General information.1,Hatchback,122 g/km,Diesel,120 Hp @ 3750 rpm.,45,4.8,120,937.5
