# **Imports and defining variables**

In [1]:
import re
from io import StringIO
from pathlib import Path
from random import randint
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [None]:
# Site root. Note that `url` is rebound to the '/en/' prefix further down,
# before it is first used to build links.
url = 'https://www.auto-data.net'
# Index page listing every brand; the starting point of the scrape.
url_allbrands_homepage = 'https://www.auto-data.net/en/allbrands'
# Value sent as the User-Agent header by the requests.get calls below.
navigator = 'Microsoft Edge (Scrapping; This is my system)'

In [None]:
# Fetch the all-brands index page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() fails loudly on an
# HTTP error instead of letting an error page be parsed as if it listed brands.
html = requests.get(url_allbrands_homepage, headers={'User-Agent': navigator}, timeout=30)
html.raise_for_status()
html

# **Scraping**

## Creating the soup and cleaning the information to get only text

In [18]:
# Parse the downloaded index page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html.text, features='html.parser')

In [19]:
# Each brand tile on the index page is an <a> tag with the 'marki_blok' class.
brands = soup.find_all('a', class_='marki_blok')
brands[:5]

[<a class="marki_blok" href="/en/abarth-brand-200" title="Abarth - Technical Specs, Fuel consumption, Dimensions"><img alt="Abarth - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Abarth.png"/><strong>Abarth</strong></a>,
 <a class="marki_blok" href="/en/ac-brand-1" title="AC - Technical Specs, Fuel consumption, Dimensions"><img alt="AC - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/AC.png"/><strong>AC</strong></a>,
 <a class="marki_blok" href="/en/acura-brand-6" title="Acura - Technical Specs, Fuel consumption, Dimensions"><img alt="Acura - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Acura.png"/><strong>Acura</strong></a>,
 <a class="marki_blok" href="/en/aiways-brand-301" title="Aiways - Technical Specs, Fuel consumption, Dimensions"><img alt="Aiways - Technical Specs, Fuel consumption, Dimensions" src="/img/logos/Aiways.png"/><strong>Aiways</strong></a>,
 <a class="marki_blok" href="/en/aixam-brand-255" title="Aixam - Technical Sp

In [20]:
# Collect the relative link of every brand tile, e.g. '/en/abarth-brand-200'.
# The attribute is read straight from the parsed tag rather than by running a
# regex over the stringified HTML (whose '\S' also sat in a non-raw string).
brand_list = [tag['href'] for tag in brands if tag.has_attr('href')]
brand_list[:5]

['/en/abarth-brand-200',
 '/en/ac-brand-1',
 '/en/acura-brand-6',
 '/en/aiways-brand-301',
 '/en/aixam-brand-255']

In [21]:
# Strip the leading '/en/' from every brand link. The base url used for the
# requests already ends in '/en/'; leaving the language part out altogether
# would make the site default to another language.
brand_list_final = [
    slug
    for href in brand_list
    for slug in re.findall(r'en/(\S+)', href)
]

# One row per brand slug, plus its first character so the scrape can be
# split into per-letter batches.
df = pd.DataFrame(brand_list_final)
df['letter'] = df[0].str[0]
df['letter'].unique()

array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z'],
      dtype=object)

In [22]:
# Split the brand table into one frame per starting letter and write each
# batch to its own CSV, so progress can be saved more often.
# to_csv does not create missing folders, so make sure the target exists first.
Path('../BEEV/letter_csvs').mkdir(parents=True, exist_ok=True)

dicto = {}
for letter in df['letter'].unique():
    letter_brands = df[df['letter'] == letter].drop(columns='letter')
    letter_brands.to_csv('../BEEV/letter_csvs/' + letter + '.csv')
    dicto[letter] = letter_brands

## Creating the loops for the actual scraping

In [23]:
# Turn every brand slug into an absolute URL by prefixing the English site root.
url = 'https://www.auto-data.net/en/'
for letter_frame in dicto.values():
    letter_frame[0] = [url + slug for slug in letter_frame[0]]

In [24]:
# Give the positional column 0 a descriptive label, in place, on every frame.
for letter_frame in dicto.values():
    letter_frame.columns = [
        'brands_url' if label == 0 else label
        for label in letter_frame.columns
    ]

In [25]:
# Checkpoint: write each per-letter frame, now holding the brand URLs, to its CSV.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [26]:
#Function to get the models inside each brand
def model_of_each_brand(brand_url):
    """Return the URL slugs of the models listed on a brand page.

    Parameters
    ----------
    brand_url : str
        Absolute URL of a brand page, e.g.
        'https://www.auto-data.net/en/abarth-brand-200'.

    Returns
    -------
    list of str
        One slug per 'en/...' link found inside the <a class="modeli"> tags,
        e.g. 'abarth-124-spider-model-2152'. Empty when no such tag exists.
    """
    html2 = requests.get(brand_url, headers={'User-Agent': navigator})
    soup2 = BeautifulSoup(html2.text, 'html.parser')
    models = soup2.find_all('a', {'class' : 'modeli'})
    # Stop the capture at the attribute's closing quote instead of grabbing
    # the whole whitespace-delimited token and chopping its last character,
    # which was only right when the quote happened to be that last character.
    return re.findall(r'en/([^\s"]+)', str(models))
brand_url = 'https://www.auto-data.net/en/abarth-brand-200'
model_of_each_brand(brand_url)

['abarth-124-spider-model-2152',
 'abarth-500-model-2872',
 'abarth-595-model-2149',
 'abarth-695-model-2150']

In [27]:
# Scrape the model slugs for every brand; each cell of 'models' holds a list.
for letter_frame in dicto.values():
    letter_frame['models'] = letter_frame['brands_url'].apply(model_of_each_brand)


In [28]:
# Checkpoint: persist the frames now that they carry the 'models' column.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [29]:
# Unpack the lists in 'models' so that every model gets a row of its own.
for letter in list(dicto):
    dicto[letter] = dicto[letter].explode('models')

In [30]:
dicto['a'].head()         #Just checking

Unnamed: 0,brands_url,models
0,https://www.auto-data.net/en/abarth-brand-200,abarth-124-spider-model-2152
0,https://www.auto-data.net/en/abarth-brand-200,abarth-500-model-2872
0,https://www.auto-data.net/en/abarth-brand-200,abarth-595-model-2149
0,https://www.auto-data.net/en/abarth-brand-200,abarth-695-model-2150
1,https://www.auto-data.net/en/ac-brand-1,ac-ace-model-1


In [31]:
# Build the absolute URL of every model page from its slug.
for letter_frame in dicto.values():
    letter_frame['models_url'] = [url + slug for slug in letter_frame['models']]

In [32]:
# Checkpoint: persist the frames with the new 'models_url' column.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [33]:
#Creating function to scrap the generation names for each models
def generation_of_each_model(model_url):
    """Return the URL slugs of the generations listed on a model page.

    Parameters
    ----------
    model_url : str
        Absolute URL of a model page, e.g.
        'https://www.auto-data.net/en/abarth-124-spider-model-2152'.

    Returns
    -------
    list of str
        One slug per 'en/...' link found inside the <a class="position">
        tags, e.g. 'abarth-124-gt-generation-6774'. A slug can appear more
        than once; no de-duplication is done here.
    """
    html3 = requests.get(model_url, headers={'User-Agent': navigator})
    soup3 = BeautifulSoup(html3.text, 'html.parser')
    gens = soup3.find_all('a', {'class' : 'position'})
    # Stop the capture at the attribute's closing quote instead of grabbing
    # the whole whitespace-delimited token and chopping its last character,
    # which was only right when the quote happened to be that last character.
    return re.findall(r'en/([^\s"]+)', str(gens))
model_url = 'https://www.auto-data.net/en/abarth-124-spider-model-2152'
generation_of_each_model(model_url)

['abarth-124-gt-generation-6774',
 'abarth-124-gt-generation-6774',
 'abarth-124-spider-generation-5004',
 'abarth-124-spider-generation-5004']

In [34]:
# Scrape the generation slugs for every model; each cell of 'generations' holds a list.
for letter_frame in dicto.values():
    letter_frame['generations'] = letter_frame['models_url'].apply(generation_of_each_model)

In [35]:
# Checkpoint: persist the frames with the new 'generations' column.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [36]:
# Unpack the lists in 'generations' so that every generation gets its own row.
for letter in list(dicto):
    dicto[letter] = dicto[letter].explode('generations')

In [37]:
# Checkpoint: persist the frames after the generations were exploded into rows.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [38]:
# Remove the repeated rows in place (the generation scrape can return the
# same slug more than once), then checkpoint the result to CSV.
for letter in dicto:
    letter_frame = dicto[letter]
    letter_frame.drop_duplicates(inplace=True)
    letter_frame.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [39]:
# Build the absolute URL of every generation page from its slug.
for letter_frame in dicto.values():
    letter_frame['generations_url'] = [url + slug for slug in letter_frame['generations']]

In [40]:
# Checkpoint: persist the frames with the new 'generations_url' column.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [41]:
#Creating a function to scrap for the modifications of each generation
def modifications_of_each_generation(gen_url):
    """Return the URL slugs of the modifications listed on a generation page.

    Parameters
    ----------
    gen_url : str
        Absolute URL of a generation page, e.g.
        'https://www.auto-data.net/en/alfa-romeo-mito-generation-363'.

    Returns
    -------
    list of str
        One slug per 'en/...' link found inside the <table class="carlist">
        elements, e.g. 'alfa-romeo-mito-1.6-jtdm-120hp-16680'. A slug can
        appear more than once; no de-duplication is done here.
    """
    html4 = requests.get(gen_url, headers={'User-Agent': navigator})
    soup4 = BeautifulSoup(html4.text, 'html.parser')
    mods = soup4.find_all('table', {'class' : 'carlist'})
    # Stop the capture at the attribute's closing quote instead of grabbing
    # the whole whitespace-delimited token and chopping its last character,
    # which was only right when the quote happened to be that last character.
    return re.findall(r'en/([^\s"]+)', str(mods))
gen_url = 'https://www.auto-data.net/en/alfa-romeo-mito-generation-363'
modifications_of_each_generation(gen_url)          

['alfa-romeo-mito-1.6-jtdm-120hp-16680',
 'alfa-romeo-mito-1.6-jtdm-120hp-16680',
 'alfa-romeo-mito-1.4-tb-multiair-170hp-44386',
 'alfa-romeo-mito-1.4-tb-multiair-170hp-44386',
 'alfa-romeo-mito-1.4-tb-155hp-16679',
 'alfa-romeo-mito-1.4-tb-155hp-16679',
 'alfa-romeo-mito-1.4-tb-multiair-135hp-tct-44377',
 'alfa-romeo-mito-1.4-tb-multiair-135hp-tct-44377',
 'alfa-romeo-mito-1.4-t-jet-120hp-16683',
 'alfa-romeo-mito-1.4-t-jet-120hp-16683',
 'alfa-romeo-mito-1.4-multiair-135hp-start-stop-16681',
 'alfa-romeo-mito-1.4-multiair-135hp-start-stop-16681',
 'alfa-romeo-mito-1.4-multiair-105hp-start-stop-16684',
 'alfa-romeo-mito-1.4-multiair-105hp-start-stop-16684',
 'alfa-romeo-mito-1.4-mpi-78hp-16682',
 'alfa-romeo-mito-1.4-mpi-78hp-16682',
 'alfa-romeo-mito-1.4-70hp-44378',
 'alfa-romeo-mito-1.4-70hp-44378',
 'alfa-romeo-mito-1.3-jtdm-90hp-40499',
 'alfa-romeo-mito-1.3-jtdm-90hp-40499',
 'alfa-romeo-mito-1.3-jtdm-eco-85hp-44387',
 'alfa-romeo-mito-1.3-jtdm-eco-85hp-44387',
 'alfa-romeo-mit

In [42]:
# Scrape the modification slugs for every generation; each cell of
# 'modifications' holds a list.
for letter_frame in dicto.values():
    letter_frame['modifications'] = letter_frame['generations_url'].apply(modifications_of_each_generation)

In [43]:
# Checkpoint: persist the frames with the new 'modifications' column.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [44]:
# Unpack the lists in 'modifications' so that every modification gets its own row.
for letter in list(dicto):
    dicto[letter] = dicto[letter].explode('modifications')

In [45]:
# Checkpoint: persist the frames after the modifications were exploded into rows.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [46]:
# Remove the repeated rows in place (the modification scrape can return the
# same slug more than once), then checkpoint the result to CSV.
for letter in dicto:
    letter_frame = dicto[letter]
    letter_frame.drop_duplicates(inplace=True)
    letter_frame.to_csv('../BEEV/letter_csvs/' + letter + '.csv')

In [47]:
# Build the absolute URL of every modification page. str() keeps this working
# for missing values too, which end up as a URL ending in 'nan'.
for letter_frame in dicto.values():
    letter_frame['modifications_url'] = [url + str(slug) for slug in letter_frame['modifications']]

In [177]:
# Checkpoint: persist the frames with the new 'modifications_url' column.
for letter in dicto:
    csv_path = '../BEEV/letter_csvs/' + letter + '.csv'
    dicto[letter].to_csv(csv_path)

In [49]:
#This is the final loop to get the details for the cars
#specs = soup4.find_all('table', {'class' : 'cardetailsout car2'})

In [50]:
#Need to install this library to work
!pip install html5lib



## Final dataframe

In [222]:
#Creating the final function 

def specifications_of_each_car(mods_url):
    """Scrape the specification table of one modification page into a one-row frame.

    Parameters
    ----------
    mods_url : str
        Absolute URL of a modification page. A value ending in 'nan' (as
        produced when a missing slug is passed through str()) is skipped
        without making any request.

    Returns
    -------
    pandas.DataFrame
        The page's table, de-duplicated and transposed so that each spec
        label becomes a column, with the row index reset. Empty when the URL
        is a 'nan' placeholder or when the page contains no table.
    """
    print(mods_url)
    if mods_url[-3:] == 'nan':
        return pd.DataFrame()

    try:
        tables = pd.read_html(mods_url)
    except ValueError:
        # read_html raises ValueError when a page has no tables rather than
        # returning an empty list, so the original "no tables" fallback was
        # never reached and such a page aborted the whole scrape.
        return pd.DataFrame()

    if not tables:
        return pd.DataFrame()
    if len(tables) == 1:
        return tables[0].drop_duplicates().T.reset_index(drop=True)

    # With two or more tables on the page, the second one is used as the spec sheet.
    specs = tables[1].drop_duplicates()
    # Blank out the label column's name and make it the index so that, once
    # transposed, the spec labels become the column headers.
    specs = specs.rename(columns={'General information': ''}).set_index([''])
    return specs.T.reset_index(drop=True)
    

specifications_of_each_car('https://www.auto-data.net/en/abarth-124-gt-1.4-multiair-170hp-automatic-35172')

https://www.auto-data.net/en/abarth-124-gt-1.4-multiair-170hp-automatic-35172


Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,Powertrain Architecture,Body type,Seats,Doors,Performance specs,...,Number of Gears (automatic transmission),Front suspension,Rear suspension,Front brakes,Rear brakes,Assisting systems,Steering type,Power steering,Tires size,Wheel rims size
0,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp) Automatic,2018 year,Internal Combustion engine,Roadster,2,2,Performance specs,...,6,Double wishbone,Multi-link independent,Ventilated discs,Disc,ABS (Anti-lock braking system),Steering rack and pinion,Electric Steering,205/45 R17 W84,17


In [218]:
#Creating the function to clean each DF because while scraping we do have duplicates and cannot concat with twice the same name for a column
def drop_dupli(df):
    """Return a copy of ``df`` that keeps only the first column for each repeated label.

    Scraped spec sheets can repeat a label, and frames with repeated column
    names cannot be concatenated. The selection is done directly on the
    column index, so it no longer requires that index to be named ``''``
    (the transpose / reset_index round-trip raised KeyError otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels may contain duplicates.

    Returns
    -------
    pandas.DataFrame
        New frame with the first occurrence of every column label, in the
        original order; ``df`` itself is left untouched.
    """
    return df.loc[:, ~df.columns.duplicated()].copy()

In [None]:
# For every letter: scrape one spec frame per modification URL and keep it in
# the 'dataframe' column, then strip repeated column labels from each frame
# so they can be concatenated. Empty frames (the scraper can return one,
# depending on the page) are left alone because drop_dupli would fail on them.
# Finally stack the letter's frames and export them to '<letter>_scraped.csv'.
for letter, letter_frame in dicto.items():
    scraped_frames = letter_frame['modifications_url'].apply(specifications_of_each_car)
    letter_frame['dataframe'] = scraped_frames
    letter_frame['dataframe'] = letter_frame['dataframe'].apply(
        lambda item: item if not item.shape[0] > 0 else drop_dupli(item)
    )
    combined = pd.concat(list(letter_frame['dataframe']))
    combined.to_csv(letter + '_scraped.csv', index=False)

In [124]:
# #This is the dataframe with all the information we want
# page4 = 'https://www.auto-data.net/en/alfa-romeo-mito-1.6-jtdm-120hp-16680'
# what = pd.read_html(page4)[1]  
# what

# what2 = what[(what['General information'] == 'Brand')
# |(what['General information'] == 'Model')
# |(what['General information'] == 'Generation')
# |(what['General information'] == 'Modification (Engine)')
# |(what['General information'] == 'CO2 emissions') 
# |(what['General information'] == 'Body type')
# |(what['General information'] == 'Fuel tank capacity')
# |(what['General information'] == 'Fuel consumption (economy) - combined')
# |(what['General information'] == 'Fuel Type')
# |(what['General information'] == 'Power')]
# what2.index = what2['General information']
# what3 = what2.T
# what3.drop(axis = 0, labels = 'General information', inplace = True)
# what3['Fuel tank(l)'] = what3['Fuel tank capacity'].apply(lambda x : x.split(' ')[0])
# what3['Fuel consumption(l/100km)'] = what3['Fuel consumption (economy) - combined'].apply(lambda x : x.split(' ')[0])
# what3['Power(Hp)'] = what3['Power'].apply(lambda x: x.split(' ')[0])
# what3['Range (Km)'] = what3[['Fuel tank(l)','Fuel consumption(l/100km)']].apply(lambda item: int(item[0])/float(item[1])*100,axis=1)

# what4 = what3
# what4.drop(columns = ['Fuel consumption (economy) - combined', 'Fuel tank capacity'], inplace = True)
# what4                               #We need to remove this General information column still

In [None]:
# Rebuild dicto from the per-letter CSV checkpoints saved earlier.
# There is no 'y' entry: the scraped brand list had no brand starting with it.
letters = list('abcdefghijklmnopqrstuvwxz')
dicto = {}
for letter in letters:
    dicto[letter] = pd.read_csv("../BEEV/letter_csvs/"+ letter +".csv", index_col=None)

### **Debugging**

In [277]:
# Scratch check: pick two of the per-car frames stored for letter 'a', then
# confirm the whole 'dataframe' column stacks into a single table.
testdf1 = dicto['a']['dataframe'][0]
testdf2 = dicto['a']['dataframe'][10]
result = pd.concat(list(dicto['a']['dataframe']))
result

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,Powertrain Architecture,Body type,Seats,Doors,Performance specs,...,CO2 emissions (CNG) (NEDC),All-electric range (NEDC),Net (usable) battery capacity,Battery technology,Acceleration 0 - 200 km/h,Recuperation output,"All-electric range (NEDC, WLTP equivalent)","Average Energy consumption (NEDC, WLTP equivalent)",Battery voltage,Climb angle
0,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp) Automatic,2018 year,Internal Combustion engine,Roadster,2,2,Performance specs,...,,,,,,,,,,
0,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp),2018 year,Internal Combustion engine,Roadster,2,2,Performance specs,...,,,,,,,,,,
0,Abarth,124 Spider,124 Spider,1.4 MultiAir (170 Hp) Automatic,2016 year,Internal Combustion engine,Cabriolet,2,2,Performance specs,...,,,,,,,,,,
0,Abarth,124 Spider,124 Spider,1.4 MultiAir (170 Hp),2016 year,Internal Combustion engine,Cabriolet,2,2,Performance specs,...,,,,,,,,,,
0,Abarth,500,500C,1.4 T-Jet (140 Hp) MTA,2010 year,Internal Combustion engine,Cabriolet,4,2,Performance specs,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Autobianchi,A 112,A 112,1.0 Abarth (58 Hp),1971 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
0,Autobianchi,A 112,A 112,0.9 Junior (42 Hp),1975 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
0,Autobianchi,A 112,A 112,0.9 E (47 Hp),1973 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
0,Autobianchi,A 112,A 112,0.9 (45 Hp),1976 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,


In [72]:
print(len(dicto['a']['dataframe'][10].reset_index(drop =True).columns))
#dicto['a']['dataframe'][10].columns

70


In [73]:
print(len(dicto['a']['dataframe'][0].reset_index(drop =True).columns))
#dicto['a']['dataframe'][0].columns

61


In [228]:
for values in dicto['a']['dataframe'][114].reset_index(drop =True).columns:
    if values not in dicto['a']['dataframe'][115].reset_index(drop =True).columns:
        print(values)

Electric cars and hybrids specs
Electric motor 1
Electric motor power
Electric motor Torque
Electric motor 2
Electric motor power
Electric motor Torque
Electric motor 3
System power
System torque
Internal combustion engine specs
Drivetrain Architecture


In [223]:
dicto['a']['dataframe'][10].reset_index(drop =True)#['(adsbygoogle = window.adsbygoogle || []).push({});']

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,Powertrain Architecture,Body type,Seats,Doors,Performance specs,...,Number of Gears (automatic transmission),Front suspension,Rear suspension,Front brakes,Rear brakes,Assisting systems,Steering type,Power steering,Tires size,Wheel rims size
0,Abarth,595,595C (facelift 2016),Turismo 1.4 T-Jet (165 Hp) Automatic,2016 year,Internal Combustion engine,Cabriolet,4,3,Performance specs,...,5,Independent type McPherson,"Semi-independent, coil spring","Ventilated discs, 284 mm","Ventilated discs, 240 mm",ABS (Anti-lock braking system),Steering rack and pinion,Electric Steering,205/40 R17,17


In [None]:
# Write one combined CSV per letter from the per-car frames already stored in
# the 'dataframe' column. Each file is built from its own letter's frames;
# indexing dicto with the literal 'a' put letter-a data into every file.
for letter in dicto.keys():
    pd.concat(list(dicto[letter]['dataframe'])).to_csv(letter + '_scraped.csv', index=False)

In [135]:
#dicto['a']['dataframe'][115].T[dicto['a']['dataframe'][115].T.duplicated() == True]
dicto['a']['dataframe'].iloc[114]#.columns

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,End of production,Powertrain Architecture,Body type,Seats,Doors,...,Number of Gears (automatic transmission),Front suspension,Rear suspension,Front brakes,Rear brakes,Assisting systems,Steering type,Power steering,Tires size,Wheel rims size
0,Acura,MDX,MDX III (facelift 2017),3.0 V6 (321 Hp) Hybrid SH-AWD Automatic,2017 year,2020 year,FHEV (Full Hybrid Electric Vehicle),SUV,6-7,5,...,7 DCT,"Independent, Spring McPherson, with stabilizer",Multi-link independent,Ventilated discs,Disc,ABS (Anti-lock braking system),Steering rack and pinion,Electric Steering,245/60 R18; 245/50 R20,8J x 18; 8J x 20


In [177]:
len(dicto['a']['dataframe'].iloc[114].columns)
#len(set(dicto['a']['dataframe'].iloc[114].columns))

73

In [None]:
# Print the first ten per-car frames of letter 'a'. A plain loop avoids
# building a throwaway list of None values that would be echoed as cell output.
for car_frame in dicto['a']['dataframe'].iloc[:10]:
    print(car_frame)

In [188]:
dicto['a']['dataframe'].iloc[114].T.reset_index()[""][dicto['a']['dataframe'].iloc[114].T.reset_index()[""].duplicated() == True]#.drop_duplicates().set_index("")


22     Electric motor power
23    Electric motor Torque
24          Engine location
32          Engine location
Name: , dtype: object

In [206]:
test = dicto['a']['dataframe'].iloc[114].T.reset_index()

In [207]:
test.drop_duplicates(inplace=True,subset = '')

In [210]:
test

Unnamed: 0,Unnamed: 1,0
0,Brand,Acura
1,Model,MDX
2,Generation,MDX III (facelift 2017)
3,Modification (Engine),3.0 V6 (321 Hp) Hybrid SH-AWD Automatic
4,Start of production,2017 year
...,...,...
68,Assisting systems,ABS (Anti-lock braking system)
69,Steering type,Steering rack and pinion
70,Power steering,Electric Steering
71,Tires size,245/60 R18; 245/50 R20


In [212]:
test.set_index('').T

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,End of production,Powertrain Architecture,Body type,Seats,Doors,...,Number of Gears (automatic transmission),Front suspension,Rear suspension,Front brakes,Rear brakes,Assisting systems,Steering type,Power steering,Tires size,Wheel rims size
0,Acura,MDX,MDX III (facelift 2017),3.0 V6 (321 Hp) Hybrid SH-AWD Automatic,2017 year,2020 year,FHEV (Full Hybrid Electric Vehicle),SUV,6-7,5,...,7 DCT,"Independent, Spring McPherson, with stabilizer",Multi-link independent,Ventilated discs,Disc,ABS (Anti-lock braking system),Steering rack and pinion,Electric Steering,245/60 R18; 245/50 R20,8J x 18; 8J x 20


In [250]:
drop_dupli(dicto['a']['dataframe'][0])

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,Powertrain Architecture,Body type,Seats,Doors,Performance specs,...,Number of Gears (automatic transmission),Front suspension,Rear suspension,Front brakes,Rear brakes,Assisting systems,Steering type,Power steering,Tires size,Wheel rims size
0,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp) Automatic,2018 year,Internal Combustion engine,Roadster,2,2,Performance specs,...,6,Double wishbone,Multi-link independent,Ventilated discs,Disc,ABS (Anti-lock braking system),Steering rack and pinion,Electric Steering,205/45 R17 W84,17


In [248]:
dicto['a']['dataframe'].iloc[116]

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,End of production,Powertrain Architecture,Body type,Seats,Doors,...,Number of Gears (automatic transmission),Front suspension,Rear suspension,Front brakes,Rear brakes,Assisting systems,Steering type,Power steering,Tires size,Wheel rims size
0,Acura,MDX,MDX III,3.5 V6 (290 Hp) Automatic,2013 year,2016 year,Internal Combustion engine,SUV,7,5,...,6 SportShift,"Independent, Spring McPherson, with stabilizer",Multi-link independent,Ventilated discs,Disc,ABS (Anti-lock braking system),Steering rack and pinion,Electric Steering,245/60 R18; 245/55 R19,8J x 18; 8J x 19


In [None]:
#dicto['a']['dataframe'] = dicto['a']['dataframe'].apply(lambda item: drop_dupli(item))
#for i in range(dicto['a']['dataframe'].shape[0]):
 #   if dicto['a']['dataframe'][i].shape[0]>0:
  #      dicto['a']['dataframe'][i] = drop_dupli(dicto['a']['dataframe'][i])
   # else: pass
    #print(i)

In [254]:
dicto['a']['dataframe'][53]

Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,End of production,Powertrain Architecture,Body type,Performance specs,Fuel Type,Engine specs,Power,Engine aspiration,(adsbygoogle = window.adsbygoogle || []).push({});,"Drivetrain, brakes and suspension specs",Wheel rims size
0,AC,ACE,Ace II,2.9 i V6 24V (195 Hp),1992 year,1996 year,Internal Combustion engine,Cabriolet,Performance specs,Petrol (Gasoline),Engine specs,195 Hp,Naturally aspirated engine,(adsbygoogle = window.adsbygoogle || []).push(...,"Drivetrain, brakes and suspension specs",4


In [259]:
dicto['a']['dataframe'][54].shape

(0, 0)

In [279]:
pd.read_csv("a_scraped.csv")


Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,Powertrain Architecture,Body type,Seats,Doors,Performance specs,...,CO2 emissions (CNG) (NEDC),All-electric range (NEDC),Net (usable) battery capacity,Battery technology,Acceleration 0 - 200 km/h,Recuperation output,"All-electric range (NEDC, WLTP equivalent)","Average Energy consumption (NEDC, WLTP equivalent)",Battery voltage,Climb angle
0,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp) Automatic,2018 year,Internal Combustion engine,Roadster,2,2,Performance specs,...,,,,,,,,,,
1,Abarth,124 Spider,124 GT,1.4 MultiAir (170 Hp),2018 year,Internal Combustion engine,Roadster,2,2,Performance specs,...,,,,,,,,,,
2,Abarth,124 Spider,124 Spider,1.4 MultiAir (170 Hp) Automatic,2016 year,Internal Combustion engine,Cabriolet,2,2,Performance specs,...,,,,,,,,,,
3,Abarth,124 Spider,124 Spider,1.4 MultiAir (170 Hp),2016 year,Internal Combustion engine,Cabriolet,2,2,Performance specs,...,,,,,,,,,,
4,Abarth,500,500C,1.4 T-Jet (140 Hp) MTA,2010 year,Internal Combustion engine,Cabriolet,4,2,Performance specs,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3856,Autobianchi,A 112,A 112,1.0 Abarth (58 Hp),1971 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
3857,Autobianchi,A 112,A 112,0.9 Junior (42 Hp),1975 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
3858,Autobianchi,A 112,A 112,0.9 E (47 Hp),1973 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
3859,Autobianchi,A 112,A 112,0.9 (45 Hp),1976 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,


In [281]:
pd.read_csv("b_scraped.csv")


Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,End of production,Powertrain Architecture,Body type,Seats,Doors,...,"Average Energy consumption (NEDC, WLTP equivalent)",Average Energy consumption (NEDC),Recuperation output,Average Energy consumption (WLTP),Electric motor 2,Ramp angle,Wading depth,Acceleration 0 - 200 km/h,Acceleration 0 - 300 km/h,100 km/h - 0
0,B.Engineering,Edonis,Edonis,3.8 V12 (690 Hp),2002 year,2006 year,Internal Combustion engine,Coupe,2,2.0,...,,,,,,,,,,
1,BAC,Mono,Mono R,2.5 (340 Hp) Automatic,2019 year,,Internal Combustion engine,Roadster,1,,...,,,,,,,,,,
2,BAC,Mono,Mono,2.5 (305 Hp) Automatic,2016 year,2020 year,Internal Combustion engine,Roadster,1,,...,,,,,,,,,,
3,BAIC Motor,BJ 2020,BJ 2020,2.4 (87 Hp),1989 year,2005 year,Internal Combustion engine,Station wagon (estate),5,5.0,...,,,,,,,,,,
4,BAIC Motor,BJ 2020,BJ 2020,2.4 (106 Hp),1989 year,2005 year,Internal Combustion engine,Station wagon (estate),5,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2862,BYD,F3,F3,1.6 i (100 Hp),2005 year,2013 year,Internal Combustion engine,Sedan,5,4.0,...,,,,,,,,,,
2863,BYD,F6,F6,2.4 (165 Hp),2007 year,2012 year,Internal Combustion engine,Sedan,5,4.0,...,,,,,,,,,,
2864,BYD,F6,F6,2.0 (140 Hp),2007 year,2012 year,Internal Combustion engine,Sedan,5,4.0,...,,,,,,,,,,
2865,BYD,F8,F8,2.0 16V (140 Hp),2009 year,2010 year,Internal Combustion engine,Cabriolet,2,2.0,...,,,,,,,,,,


In [283]:
pd.read_csv("z_scraped.csv")


Unnamed: 0,Brand,Model,Generation,Modification (Engine),Start of production,Powertrain Architecture,Body type,Seats,Doors,Performance specs,...,Valvetrain,Trunk (boot) space - maximum,Steering type,Max. weight,Max load,Acceleration 0 - 200 km/h,Ride height (ground clearance),Average Energy consumption,Approach angle,Departure angle
0,Zacua,MX2,MX2,18 kWh (46 Hp) Electric,"July, 2017 year",BEV (Electric Vehicle),Coupe,2.0,2,Performance specs,...,,,,,,,,,,
1,Zacua,MX3,MX3,18 kWh (46 Hp) Electric,"July, 2017 year",BEV (Electric Vehicle),Hatchback,2.0,3,Performance specs,...,,,,,,,,,,
2,Zastava,10,10,1.2 8V (60 Hp),"October, 2005 year",Internal Combustion engine,Hatchback,5.0,5,Performance specs,...,,,,,,,,,,
3,Zastava,101,101 (1100),1.1 Super (64 Hp),1979 year,Internal Combustion engine,Hatchback,,,Performance specs,...,,,,,,,,,,
4,Zastava,101,101 (1100),1.1 (56 Hp),1975 year,Internal Combustion engine,Hatchback,5.0,3/5,Performance specs,...,SOHC,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,ZX,GrandTiger,GrandTiger,2.4 (126 Hp) 4x4,2007 year,Internal Combustion engine,Pick-up,5.0,4,Performance specs,...,DOHC,,,2110 kg 4651.75 lbs.,500 kg 1102.31 lbs.,,,,,
69,ZX,GrandTiger,GrandTiger,2.4 (126 Hp) 4x2,2007 year,Internal Combustion engine,Pick-up,5.0,4,Performance specs,...,DOHC,,,2110 kg 4651.75 lbs.,500 kg 1102.31 lbs.,,,,,
70,ZX,Landmark,Landmark,2.4 (126 Hp) 4x4,2006 year,Internal Combustion engine,SUV,7.0,5,Performance specs,...,DOHC,,,2235 kg 4927.33 lbs.,455 kg 1003.1 lbs.,,,,,
71,ZX,Landmark,Landmark,2.4 (126 Hp) 4x2 Automatic,2006 year,Internal Combustion engine,SUV,7.0,5,Performance specs,...,DOHC,,,2165 kg 4773.01 lbs.,455 kg 1003.1 lbs.,,,,,
