# **Imports and defining variables**

In [15]:
import pandas as pd
import re
import requests
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from random import randint
from time import sleep

In [16]:
navigator = 'Microsoft Edge (Scrapping; This is my system)'
url_allbrands_homepage = 'https://www.auto-data.net/en/allbrands'
url = 'https://www.auto-data.net'

In [17]:
html = requests.get(url_allbrands_homepage, headers={'User-Agent': navigator})
html

<Response [200]>

# **Scraping**

## Creating the soup and cleaning the information to get only text

In [18]:
soup = BeautifulSoup(html.text, 'html.parser')

In [19]:
brands = soup.find_all('a', {'class' : 'marki_blok'})
brands[0:5]

[<a class="marki_blok" href="/en/abarth-brand-200" title="Abarth - Technical Specs, Fuel consumption, Dimensions"><img alt="Abarth - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Abarth.png"/><strong>Abarth</strong></a>,
 <a class="marki_blok" href="/en/ac-brand-1" title="AC - Technical Specs, Fuel consumption, Dimensions"><img alt="AC - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/AC.png"/><strong>AC</strong></a>,
 <a class="marki_blok" href="/en/acura-brand-6" title="Acura - Technical Specs, Fuel consumption, Dimensions"><img alt="Acura - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Acura.png"/><strong>Acura</strong></a>,
 <a class="marki_blok" href="/en/aiways-brand-301" title="Aiways - Technical Specs, Fuel consumption, Dimensions"><img alt="Aiways - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Aiways.png"/><strong>Aiways</strong></a>,
 <a class="marki_blok" href="/en/aixam-brand-255" title="Aixam - Technical Sp

In [20]:
#This loop is to get only the part to use in link ie whatever is after 'href='
brand_list = re.findall('href="(\S+)"',str(brands))
brand_list[0:5]

['/en/abarth-brand-200',
 '/en/ac-brand-1',
 '/en/acura-brand-6',
 '/en/aiways-brand-301',
 '/en/aixam-brand-255']

In [21]:
#Looping to remove the /en since the url already has it and removing it makes it so it will default to another language
brand_list_final = []
for i in range(len(brand_list)):
    brand_list_final += re.findall('en/(\S+)', brand_list[i])
brand_list_final[0:5]
#Starting to create dataframe for the links
df = pd.DataFrame(brand_list_final)
df['letter'] = df[0].apply(lambda x: x[0])
df['letter'].unique()

array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z'],
      dtype=object)

In [22]:
#Creating CSV files for each starting letter of the brand names so we can save it more often
dicto = {}
for letter in df['letter'].unique():
    a = df[df['letter'] == letter].drop(columns = 'letter')
    a.to_csv('../BEEV/letter_csvs/' + letter + '.csv')
    dicto[letter] = a

## Creating the loops for the actual scraping

In [23]:
#Looping to create a column with url brand names
url = 'https://www.auto-data.net/en/'
for csvs in dicto.values():
    csvs[0] = csvs[0].apply(lambda x: url + x)

In [24]:
#Renaming the column to make it easier to understand
for csvs in dicto.values():
    csvs.rename(columns={0 : 'brands_url'}, inplace = True)

In [25]:
#Dataframe with url for each brand
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [26]:
#Function to get the models inside each brand
def model_of_each_brand(brand_url):
    models_list = []
    html2 = requests.get(brand_url, headers={'User-Agent': navigator})
    soup2 = BeautifulSoup(html2.text, 'html.parser')
    models = soup2.find_all('a', {'class' : 'modeli'}) 
    models_list_final = re.findall('en/(\S+)', str(models))
    return [x[:-1] for x in models_list_final]
brand_url = 'https://www.auto-data.net/en/abarth-brand-200'
model_of_each_brand(brand_url)

['abarth-124-spider-model-2152',
 'abarth-500-model-2872',
 'abarth-595-model-2149',
 'abarth-695-model-2150']

In [27]:
#Applying the function to get the models for each brand(displayed in lists inside the model_url column)
for csvs in dicto.values():
    csvs['models'] = csvs['brands_url'].apply(lambda x: model_of_each_brand(x))


In [28]:
#Saving to keep a column with models names
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [29]:
#Loop to explode the models name and get 1 name per each line
for letter, letter_csvs in dicto.items():
    dicto[letter] = dicto[letter].explode('models')

In [30]:
dicto['a'].head()         #Just checking

Unnamed: 0,brands_url,models
0,https://www.auto-data.net/en/abarth-brand-200,abarth-124-spider-model-2152
0,https://www.auto-data.net/en/abarth-brand-200,abarth-500-model-2872
0,https://www.auto-data.net/en/abarth-brand-200,abarth-595-model-2149
0,https://www.auto-data.net/en/abarth-brand-200,abarth-695-model-2150
1,https://www.auto-data.net/en/ac-brand-1,ac-ace-model-1


In [31]:
#Loop to create a column with models_url
for csvs in dicto.values():
    csvs['models_url'] = csvs['models'].apply(lambda x: url + x)

In [32]:
#Saving the models_url column
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [33]:
#Creating function to scrap the generation names for each models
def generation_of_each_model(model_url):
    gens_list = []
    html3 = requests.get(model_url, headers={'User-Agent': navigator})
    soup3 = BeautifulSoup(html3.text, 'html.parser')
    gens = soup3.find_all('a', {'class' : 'position'}) 
    gens_list_final = re.findall('en/(\S+)', str(gens))
    return [x[:-1] for x in gens_list_final]
model_url = 'https://www.auto-data.net/en/abarth-124-spider-model-2152'
generation_of_each_model(model_url)

['abarth-124-gt-generation-6774',
 'abarth-124-gt-generation-6774',
 'abarth-124-spider-generation-5004',
 'abarth-124-spider-generation-5004']

In [34]:
#Applying function to get the generation names for each models
for csvs in dicto.values():
    csvs['generations'] = csvs['models_url'].apply(lambda x: generation_of_each_model(x))

In [35]:
#Saving the generations columns
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [36]:
#Loop to explode the generations names
for letter, letter_csvs in dicto.items():
    dicto[letter] = dicto[letter].explode('generations')

In [37]:
#Saving the exploded generations names
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [38]:
#Dropping duplicates that were created
for letter, letter_csvs in dicto.items():
    dicto[letter].drop_duplicates(inplace = True)
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [39]:
#Looping to create a column for the generations url
for csvs in dicto.values():
    csvs['generations_url'] = csvs['generations'].apply(lambda x: url + x)

In [40]:
#Saving the column with the generations urls
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [41]:
#Creating a function to scrap for the modifications of each generation
def modifications_of_each_generation(gen_url):
    mods_list = []
    html4 = requests.get(gen_url, headers={'User-Agent': navigator})
    soup4 = BeautifulSoup(html4.text, 'html.parser')
    mods = soup4.find_all('table', {'class' : 'carlist'}) 
    gens_list_final = re.findall('en/(\S+)', str(mods))
    return [x[:-1] for x in gens_list_final]
gen_url = 'https://www.auto-data.net/en/alfa-romeo-mito-generation-363'
modifications_of_each_generation(gen_url)          

['alfa-romeo-mito-1.6-jtdm-120hp-16680',
 'alfa-romeo-mito-1.6-jtdm-120hp-16680',
 'alfa-romeo-mito-1.4-tb-multiair-170hp-44386',
 'alfa-romeo-mito-1.4-tb-multiair-170hp-44386',
 'alfa-romeo-mito-1.4-tb-155hp-16679',
 'alfa-romeo-mito-1.4-tb-155hp-16679',
 'alfa-romeo-mito-1.4-tb-multiair-135hp-tct-44377',
 'alfa-romeo-mito-1.4-tb-multiair-135hp-tct-44377',
 'alfa-romeo-mito-1.4-t-jet-120hp-16683',
 'alfa-romeo-mito-1.4-t-jet-120hp-16683',
 'alfa-romeo-mito-1.4-multiair-135hp-start-stop-16681',
 'alfa-romeo-mito-1.4-multiair-135hp-start-stop-16681',
 'alfa-romeo-mito-1.4-multiair-105hp-start-stop-16684',
 'alfa-romeo-mito-1.4-multiair-105hp-start-stop-16684',
 'alfa-romeo-mito-1.4-mpi-78hp-16682',
 'alfa-romeo-mito-1.4-mpi-78hp-16682',
 'alfa-romeo-mito-1.4-70hp-44378',
 'alfa-romeo-mito-1.4-70hp-44378',
 'alfa-romeo-mito-1.3-jtdm-90hp-40499',
 'alfa-romeo-mito-1.3-jtdm-90hp-40499',
 'alfa-romeo-mito-1.3-jtdm-eco-85hp-44387',
 'alfa-romeo-mito-1.3-jtdm-eco-85hp-44387',
 'alfa-romeo-mit

In [42]:
#Applying function to get the modifications names for each generation
for csvs in dicto.values():
    csvs['modifications'] = csvs['generations_url'].apply(lambda x: modifications_of_each_generation(x))

In [43]:
#Saving the mods column
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [44]:
#Exploding the mods column to get one per row
for letter, letter_csvs in dicto.items():
    dicto[letter] = dicto[letter].explode('modifications')

In [45]:
#Saving the exploded generations names
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [46]:
#Dropping duplicates that were created
for letter, letter_csvs in dicto.items():
    dicto[letter].drop_duplicates(inplace = True)
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [47]:
#Loop to create a column for the mods url
for csvs in dicto.values():
    csvs['modifications_url'] = csvs['modifications'].apply(lambda x: url + str(x))

In [48]:
#Saving the column for the mods urls
for letter, letter_csvs in dicto.items():
    letter_csvs.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [49]:
#This is the final loop to get the details for the cars
#specs = soup4.find_all('table', {'class' : 'cardetailsout car2'})

In [50]:
#Need to install this library to work
!pip install html5lib



## Final dataframe

In [96]:
#Creating the final function 

def specifications_of_each_car(mods_url):
    global specs3
    specs = pd.read_html(mods_url)[1]
    specs2 = specs[(specs['General information'] == 'Brand')
    |(specs['General information'] == 'Model')
    |(specs['General information'] == 'Generation')
    |(specs['General information'] == 'Modification (Engine)')
    |(specs['General information'] == 'CO2 emissions') 
    |(specs['General information'] == 'Body type')
    |(specs['General information'] == 'Fuel tank capacity')
    |(specs['General information'] == 'Fuel consumption (economy) - combined')
    |(specs['General information'] == 'Fuel Type')
    |(specs['General information'] == 'Power')]
    specs2.rename(columns = {'General information' : ''}, inplace = True)
    specs2.set_index([''], inplace = True)
    specs3 = specs2.T
    specs3.reset_index(inplace = True)
    specs3.drop(columns = ['index'], inplace = True)
    specs3['Fuel tank(l)'] = specs3['Fuel tank capacity'].apply(lambda x : x.split(' ')[0])
    specs3['Fuel consumption(l/100km)'] = specs3['Fuel consumption (economy) - combined'].apply(lambda x : x.split(' ')[0])
    specs3['Power(Hp)'] = specs3['Power'].apply(lambda x: x.split(' ')[0])
    specs3['Range (Km)'] = specs3[['Fuel tank(l)','Fuel consumption(l/100km)']].apply(lambda item: int(item[0])/float(item[1])*100,axis=1)
    #specs3.drop(columns = ['Fuel consumption (economy) - combined', 'Fuel tank capacity', 'Power'], inplace = True)
    return specs3
    

specifications_of_each_car('https://www.auto-data.net/en/abarth-124-gt-1.4-multiair-170hp-automatic-35172')

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Body type,Fuel consumption (economy) - combined,CO2 emissions,Fuel Type,Power,Fuel tank capacity,Fuel tank(l),Fuel consumption(l/100km),Power(Hp),Range (Km)
0,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp) Automatic,Roadster,6.6 l/100 km 35.64 US mpg42.8 UK mpg15.15 km/l,153 g/km,Petrol (Gasoline),170 Hp @ 5500 rpm.,45 l 11.89 US gal | 9.9 UK gal,45,6.6,170,681.818182


In [97]:
#Creating an empty dataframe
df = pd.DataFrame(columns = ['Brand', 'Model', 'Generation', 'Modification(Engine)','Body type', 'Fuel consumption (economy) - combined', 'CO2 emissions', 'Fuel type','Power','Fuel tank capacity', 'Fuel tank(l)', 'Fuel consumption(l/100km)', 'Power(Hp)','Range (km)'])
df.reset_index()


Unnamed: 0,index,Brand,Model,Generation,Modification(Engine),Body type,Fuel consumption (economy) - combined,CO2 emissions,Fuel type,Power,Fuel tank capacity,Fuel tank(l),Fuel consumption(l/100km),Power(Hp),Range (km)


In [99]:
#Testing for 'a' dataframe:
#for csvs in dicto['a']:
#    specifications_of_each_car(csvs['modifications_url'])
 #   df = pd.concat([df,specs3], ignore_index=True)

#Taking the shape to loop
#dicto['a']['modifications_url'].shape[0]
#dicto['a'].reset_index(inplace = True)
#global specs3
for i in range(0,dicto['b']['modifications_url'].shape[0]+1):
    specifications_of_each_car(dicto['a']['modifications_url'][i])
    df = pd.concat([df,specs3], ignore_index=True)
    

df.head()

KeyError: 'Fuel consumption (economy) - combined'

In [82]:
#Looping to create the final dataframe

#for i in range(0,10):
#    print(dicto['a']['modifications_url'][i])
#dicto['b'].reset_index()
dicto['a']#.drop(columns = ['level_0','index'],inplace = True)

Unnamed: 0,brands_url,models,models_url,generations,generations_url,modifications,modifications_url
0,https://www.auto-data.net/en/abarth-brand-200,abarth-124-spider-model-2152,https://www.auto-data.net/en/abarth-124-spider...,abarth-124-gt-generation-6774,https://www.auto-data.net/en/abarth-124-gt-gen...,abarth-124-gt-1.4-multiair-170hp-automatic-35172,https://www.auto-data.net/en/abarth-124-gt-1.4...
1,https://www.auto-data.net/en/abarth-brand-200,abarth-124-spider-model-2152,https://www.auto-data.net/en/abarth-124-spider...,abarth-124-gt-generation-6774,https://www.auto-data.net/en/abarth-124-gt-gen...,abarth-124-gt-1.4-multiair-170hp-35171,https://www.auto-data.net/en/abarth-124-gt-1.4...
2,https://www.auto-data.net/en/abarth-brand-200,abarth-124-spider-model-2152,https://www.auto-data.net/en/abarth-124-spider...,abarth-124-spider-generation-5004,https://www.auto-data.net/en/abarth-124-spider...,abarth-124-spider-1.4-multiair-170hp-automatic...,https://www.auto-data.net/en/abarth-124-spider...
3,https://www.auto-data.net/en/abarth-brand-200,abarth-124-spider-model-2152,https://www.auto-data.net/en/abarth-124-spider...,abarth-124-spider-generation-5004,https://www.auto-data.net/en/abarth-124-spider...,abarth-124-spider-1.4-multiair-170hp-25192,https://www.auto-data.net/en/abarth-124-spider...
4,https://www.auto-data.net/en/abarth-brand-200,abarth-500-model-2872,https://www.auto-data.net/en/abarth-500-model-...,abarth-500c-generation-8152,https://www.auto-data.net/en/abarth-500c-gener...,abarth-500c-1.4-t-jet-140hp-mta-42379,https://www.auto-data.net/en/abarth-500c-1.4-t...
...,...,...,...,...,...,...,...
3870,https://www.auto-data.net/en/autobianchi-brand-51,autobianchi-a-112-model-605,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-generation-1299,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-1.0-abarth-58hp-5765,https://www.auto-data.net/en/autobianchi-a-112...
3871,https://www.auto-data.net/en/autobianchi-brand-51,autobianchi-a-112-model-605,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-generation-1299,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-0.9-junior-42hp-5764,https://www.auto-data.net/en/autobianchi-a-112...
3872,https://www.auto-data.net/en/autobianchi-brand-51,autobianchi-a-112-model-605,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-generation-1299,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-0.9-e-47hp-5763,https://www.auto-data.net/en/autobianchi-a-112...
3873,https://www.auto-data.net/en/autobianchi-brand-51,autobianchi-a-112-model-605,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-generation-1299,https://www.auto-data.net/en/autobianchi-a-112...,autobianchi-a-112-0.9-45hp-5762,https://www.auto-data.net/en/autobianchi-a-112...


In [None]:
# #This is the dataframe with all the information we want
# page4 = 'https://www.auto-data.net/en/alfa-romeo-mito-1.6-jtdm-120hp-16680'
# what = pd.read_html(page4)[1]  
# what

# what2 = what[(what['General information'] == 'Brand')
# |(what['General information'] == 'Model')
# |(what['General information'] == 'Generation')
# |(what['General information'] == 'Modification (Engine)')
# |(what['General information'] == 'CO2 emissions') 
# |(what['General information'] == 'Body type')
# |(what['General information'] == 'Fuel tank capacity')
# |(what['General information'] == 'Fuel consumption (economy) - combined')
# |(what['General information'] == 'Fuel Type')
# |(what['General information'] == 'Power')]
# what2.index = what2['General information']
# what3 = what2.T
# what3.drop(axis = 0, labels = 'General information', inplace = True)
# what3['Fuel tank(l)'] = what3['Fuel tank capacity'].apply(lambda x : x.split(' ')[0])
# what3['Fuel consumption(l/100km)'] = what3['Fuel consumption (economy) - combined'].apply(lambda x : x.split(' ')[0])
# what3['Power(Hp)'] = what3['Power'].apply(lambda x: x.split(' ')[0])
# what3['Range (Km)'] = what3[['Fuel tank(l)','Fuel consumption(l/100km)']].apply(lambda item: int(item[0])/float(item[1])*100,axis=1)

# what4 = what3
# what4.drop(columns = ['Fuel consumption (economy) - combined', 'Fuel tank capacity'], inplace = True)
# what4                               #We need to remove this General information column still