In [1]:
from requests import get
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

### We want to scrape all 333 car model names, car types, and reliability indexes from https://www.reliabilityindex.com. The URL for each car has the format "https://www.reliabilityindex.com/reliability/search/{i}", where {i} is replaced by the desired car number. To scrape the information for the 333 cars, we first create two empty lists: car_model and reliability_index. Then we run a for loop that builds the URL for each car number, sends a GET request to it, and passes the response to BeautifulSoup to create a bs4 object. We then search the soup object for the required information, format the strings, and append each car's information to the corresponding list.

In [2]:
# Scrape one page per car number (1..333) and collect each car's model name
# and reliability index into two parallel lists.
car_model=[]
reliability_index=[]
for i in range(1,334):
    url = "https://www.reliabilityindex.com/reliability/search/{i}".format(i=i)
    # requests applies no timeout by default, so a single stalled connection
    # would block the whole loop indefinitely.
    response = get(url, timeout=30)
    # Stop on 4xx/5xx responses instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Text of the <h2 id="car_name"> heading; only its first line is kept.
    table = soup.find('h2', {'id': 'car_name'}).get_text()
    splitted_string = table.split("\n")
    # Car model
    car_model.append(splitted_string[0])
    # Reliability index: text of the first <tr> nested in the first <tr> of the
    # first <div> in <body>, keeping the piece after the first "x".
    # NOTE(review): presumably that text reads "... Index <value>", so splitting
    # on "x" isolates the value - confirm against the page markup.
    splitted_string2 = soup.body.div.tr.tr.get_text().split("x")
    reliability_index.append(splitted_string2[1])

In [3]:
# Sanity check: the loop appends one entry per requested page, so 333 are expected
len(reliability_index)

333

In [49]:
# Assemble the two scraped lists into a two-column frame in a single step
my_df = pd.DataFrame({
    'car_model': car_model,
    'reliability_index': reliability_index,
})

In [50]:
# Inspect the column dtypes and non-null counts of the scraped frame
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   car_model          333 non-null    object
 1   reliability_index  333 non-null    object
dtypes: object(2)
memory usage: 5.3+ KB


In [51]:
# Convert the scraped reliability_index strings to 64-bit integers.
# pd.to_numeric parses the strings first; Series.astype then casts the parsed
# values in one vectorized step. This replaces the element-wise
# DataFrame.applymap call, which is deprecated as of pandas 2.1.
my_df['reliability_index'] = pd.to_numeric(my_df['reliability_index']).astype(np.int64)

In [52]:
# Display the frame after the integer conversion
my_df

Unnamed: 0,car_model,reliability_index
0,Alfa Romeo 147,131
1,Alfa Romeo 159,210
2,Alfa Romeo Brera,233
3,Alfa Romeo GIULIETTA,78
4,Alfa Romeo GT,204
...,...,...
328,Volvo V70,99
329,Volvo V70,79
330,Volvo XC60,112
331,Volvo XC70,167


We want to calculate the average reliability rating for each make. To do this, we will create a boolean mask for each make by searching the car model names for that make's name.

In [64]:
# Lower-case the model names once, then flag the rows whose name contains each make
models_lower = my_df.car_model.str.lower()
audi_mask = models_lower.str.contains('audi')
bmw_mask = models_lower.str.contains('bmw')
ford_mask = models_lower.str.contains('ford')
hyundai_mask = models_lower.str.contains('hyundai')
mercedes_mask = models_lower.str.contains('mercedes')
toyota_mask = models_lower.str.contains('toyota')
vw_mask = models_lower.str.contains('volkswagen')

In [66]:
# Make labels and their row masks, kept in matching order
make_list = ['Audi', 'BMW', 'Ford', 'Hyundai', 'Mercedes', 'Toyota', 'Volkswagen']
mask_list = [audi_mask, bmw_mask, ford_mask, hyundai_mask,
             mercedes_mask, toyota_mask, vw_mask]

# Mean reliability index of the rows selected by each mask, rounded to 0 decimals
avg_reliability_rating = []
for mask in mask_list:
    make_mean = my_df.loc[mask, 'reliability_index'].mean()
    avg_reliability_rating.append(round(make_mean, 0))

In [67]:
# Pair every make with its rounded mean, then build the summary frame from those rows
rating_rows = list(zip(make_list, avg_reliability_rating))
df_reliability_index = pd.DataFrame(rating_rows, columns=['make', 'avg_reliability_rating'])
df_reliability_index

Unnamed: 0,make,avg_reliability_rating
0,Audi,219.0
1,BMW,249.0
2,Ford,83.0
3,Hyundai,72.0
4,Mercedes,215.0
5,Toyota,64.0
6,Volkswagen,121.0
