In [1]:
# Analysis of a Vehicle Dataset

import pandas as pd

In [2]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this exact file exists.
cars_csv_path = 'C:/Users/phpunsal/Documents/Data_Analytics/Vehicle_Dataset/data/cars.csv'
# Load the vehicle dataset and preview its first rows.
df = pd.read_csv(cars_csv_path)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [3]:
# Start with the mean vehicle weight across the whole dataset.
mean_weight = df['weight'].mean()
# The weight values (e.g. 3504 for a 1970 Chevelle Malibu) are pounds, not kilograms,
# so the figure is labelled in lb. NOTE(review): confirm against the data dictionary.
print(f'The mean weight is {round(mean_weight, 2)} lb')

The mean weight is 2973.95 kg


In [4]:
# Largest horsepower value in the dataset (missing values are skipped by max()).
horsepower_values = df['horsepower']
max_horsepower = horsepower_values.max()
max_horsepower

230.0

In [5]:
# Next, count the cars whose weight is equal to or greater than 3500.
# Summing the boolean mask gives the number of matching rows.
is_heavy = df['weight'] >= 3500
heavy_cars = int(is_heavy.sum())
heavy_cars

109

In [7]:
# Build a new frame with an extra `ratio` column: horsepower divided by weight.
# Both operands come from the same frame, and assign() returns a new DataFrame,
# so the original `df` is left untouched and no separate copy() is needed.
df_ratio = df.assign(ratio=df['horsepower'] / df['weight'])
df_ratio.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,ratio
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0.0371
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0.044679
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0.043655
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0.043694
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0.040591


In [9]:
# Next, build a separate frame holding only the cars whose origin is 'usa'.
usa_mask = df['origin'] == 'usa'
df_usa = df.loc[usa_mask].copy()
df_usa.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [10]:
# Mean mpg of the USA-only frame.
usa_mpg = df_usa['mpg']
mean_mpg_usa = usa_mpg.mean()
mean_mpg_usa

20.04308943089431

In [11]:
# Number of USA cars that have 8 cylinders.
usa_eight_cyl_rows = df_usa.loc[df_usa['cylinders'].eq(8)]
eight_cyl_usa = usa_eight_cyl_rows.shape[0]
eight_cyl_usa

103

In [12]:
# df.info() reports the non-null count per column; horsepower has fewer non-null
# values than there are rows, i.e. it is the column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           395 non-null    float64
 1   cylinders     395 non-null    int64  
 2   displacement  395 non-null    float64
 3   horsepower    390 non-null    float64
 4   weight        395 non-null    int64  
 5   acceleration  395 non-null    float64
 6   model_year    395 non-null    int64  
 7   origin        395 non-null    object 
 8   name          395 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 27.9+ KB


In [13]:
# Create a new frame without the rows that are missing a horsepower value.
# subset= restricts the check to that column; a bare dropna() would also discard
# rows with a null in any other column. dropna() returns a new DataFrame, so
# `df` itself is not modified.
df_horsepower = df.dropna(subset=['horsepower'])
df_horsepower.info()

<class 'pandas.core.frame.DataFrame'>
Index: 390 entries, 0 to 394
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           390 non-null    float64
 1   cylinders     390 non-null    int64  
 2   displacement  390 non-null    float64
 3   horsepower    390 non-null    float64
 4   weight        390 non-null    int64  
 5   acceleration  390 non-null    float64
 6   model_year    390 non-null    int64  
 7   origin        390 non-null    object 
 8   name          390 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 30.5+ KB


In [14]:
# mode() returns every most-frequent value; take the first one as the horsepower mode.
horsepower_modes = df_horsepower['horsepower'].mode()
mode_hp = horsepower_modes.iloc[0]
mode_hp

150.0

In [16]:
# Keep only the cars whose horsepower is greater than or equal to mode_hp.
at_least_mode = df_horsepower['horsepower'].ge(mode_hp)
df_high_hp = df_horsepower.loc[at_least_mode].copy()
df_high_hp.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala


In [17]:
# Percentage of the high-horsepower cars that have 8 cylinders.
high_hp_eight_cyl = df_high_hp.loc[df_high_hp['cylinders'].eq(8)]
share_eight_cyl = len(high_hp_eight_cyl) / len(df_high_hp)
percentage_eight_cyl = share_eight_cyl * 100
percentage_eight_cyl

98.50746268656717

In [18]:
# value_counts() tallies the rows per car name, most frequent first,
# which shows that several names appear more than once.
name_counts = df['name'].value_counts()
name_counts

name
toyota corolla         5
amc matador            5
ford maverick          5
toyota corona          4
chevrolet chevette     4
                      ..
chevrolet monza 2+2    1
ford mustang ii        1
pontiac astro          1
amc pacer              1
chevy s-10             1
Name: count, Length: 306, dtype: int64

In [19]:
# Build a unique label per car by combining the name with its model year.
# model_year is stored as two digits (e.g. 70), so ' - 19' is prepended to give
# labels such as 'ford torino - 1970'. The separator is a plain string: adding a
# one-element list to a Series only works through accidental length-1 broadcasting.
# assign() returns a new DataFrame, leaving `df` untouched.
df_name = df.assign(
    name_year=df['name'] + ' - 19' + df['model_year'].astype(str)
)
df_name.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,name_year
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,chevrolet chevelle malibu - 1970
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,buick skylark 320 - 1970
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,plymouth satellite - 1970
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,amc rebel sst - 1970
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,ford torino - 1970


In [20]:
# Check the new column for duplicates: counts are sorted in descending order,
# so if the first count is 1 every label occurs exactly once.
name_year_counts = df_name['name_year'].value_counts()
name_year_counts

name_year
chevrolet chevelle malibu - 1970    1
datsun 200-sx - 1978                1
plymouth sapporo - 1978             1
toyota celica gt liftback - 1978    1
dodge omni - 1978                   1
                                   ..
ford pinto - 1974                   1
datsun b210 - 1974                  1
chevrolet nova - 1974               1
amc hornet - 1974                   1
chevy s-10 - 1982                   1
Name: count, Length: 395, dtype: int64

In [22]:
# Use the unique name_year labels as the row index.
# set_index() already returns a new DataFrame, so neither copy() nor inplace=True
# is needed. verify_integrity=True raises a ValueError if any label is duplicated,
# which keeps later label-based lookups returning a single row.
df_car_index = df_name.set_index('name_year', verify_integrity=True)
df_car_index.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
name_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
chevrolet chevelle malibu - 1970,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
buick skylark 320 - 1970,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
plymouth satellite - 1970,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
amc rebel sst - 1970,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
ford torino - 1970,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [23]:
def acceleration(name_year, cars=None):
    """Return the acceleration value of the car labelled ``name_year``.

    Parameters
    ----------
    name_year : str
        Row label of the car, e.g. ``'ford torino - 1970'``.
    cars : pandas.DataFrame, optional
        Frame indexed by the name_year labels and containing an
        ``'acceleration'`` column. When omitted, the notebook-level
        ``df_car_index`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    The value stored in the ``'acceleration'`` column for that row.

    Raises
    ------
    KeyError
        If ``name_year`` is not a label in the frame's index.
    """
    if cars is None:
        # Looked up at call time so the function follows the current df_car_index.
        cars = df_car_index
    if name_year not in cars.index:
        raise KeyError(f'No car labelled {name_year!r} in the index')
    return cars.loc[name_year, 'acceleration']
    

In [24]:
# Example lookup: acceleration of the 1970 Ford Torino.
acceleration(name_year='ford torino - 1970')

10.5