In [5]:
import numpy as np
import pandas as pd

# Load the Auto MPG data; the column labels are supplied explicitly through names=
data = pd.read_csv(
    'datasets/auto-mpg.csv',
    names=[
        "mpg",
        "cylinders",
        "displacement",
        "horsepower",
        "weight",
        "acceleration",
        "model year",
        "origin",
        "car name",
    ],
)

# Dimensions of the loaded frame as (rows, columns)
data.shape

(398, 9)

In [6]:
# Preview the first five rows to sanity-check how the columns were parsed
data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [7]:
# List the column labels of the frame (the ones passed via names= when loading)
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [8]:
# Per-column non-null counts and dtypes.
# Note: horsepower comes back as `object` rather than a numeric dtype, which
# hints that the column contains non-numeric text.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [9]:
# Mean of every numeric column.
# `data` still contains object columns at this point (horsepower, car name).
# Relying on mean() to drop them implicitly is deprecated and raises a
# TypeError on pandas >= 2.0, so restrict the reduction to numeric columns
# explicitly. The result is the same set of seven means.
data.mean(numeric_only=True)

  data.mean()


mpg               23.514573
cylinders          5.454774
displacement     193.425879
weight          2970.424623
acceleration      15.568090
model year        76.010050
origin             1.572864
dtype: float64

In [10]:
# The means above omit horsepower and car name. Inspect the dtypes to see why:
# both columns are `object`, so they are not treated as numeric.
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [17]:
# Dealing with missing values: unknown entries are marked with '?'.
# Replace the marker with a real missing value (np.nan). The *string* 'NaN'
# would just be more text: isnull() would not flag it and the column would
# stay non-numeric.
# With the marker gone, horsepower can be parsed into a numeric column.
data = (
    data
    .replace({'?': np.nan})
    .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower']))
)
# Row 126 shows NaN for horsepower
data.loc[120:130, :]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
120,19.0,4,121.0,112.0,2868,15.5,73,2,volvo 144ea
121,15.0,8,318.0,150.0,3399,11.0,73,1,dodge dart custom
122,24.0,4,121.0,110.0,2660,14.0,73,2,saab 99le
123,20.0,6,156.0,122.0,2807,13.5,73,3,toyota mark ii
124,11.0,8,350.0,180.0,3664,11.0,73,1,oldsmobile omega
125,20.0,6,198.0,95.0,3102,16.5,74,1,plymouth duster
126,21.0,6,200.0,,2875,17.0,74,1,ford maverick
127,19.0,6,232.0,100.0,2901,16.0,74,1,amc hornet
128,15.0,6,250.0,100.0,3336,17.0,74,1,chevrolet nova
129,31.0,4,79.0,67.0,1950,19.0,74,3,datsun b210


In [25]:
# Reload the data, telling the parser that '?' marks a missing value.
# The markers are then read as NaN at load time, so horsepower is parsed as
# float64 and no post-hoc replacement is needed.
data = pd.read_csv(
    'datasets/auto-mpg.csv',
    names=["mpg", "cylinders", "displacement", "horsepower", "weight",
           "acceleration", "model year", "origin", "car name"],
    na_values='?',
)
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [26]:
# Summary statistics for the numeric columns. Missing values are skipped,
# which is why horsepower reports a count of 392 instead of 398.
data.describe() # ignores NaN

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [27]:
# Summary statistics of mpg for each model year.
# Select the 'mpg' column *before* describe() so statistics are computed for
# that one column only, instead of describing every column per group and
# then discarding all but mpg.
data.groupby(by='model year')['mpg'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
70,29.0,17.689655,5.339231,9.0,14.0,16.0,22.0,27.0
71,28.0,21.25,6.591942,12.0,15.5,19.0,27.0,35.0
72,28.0,18.714286,5.435529,11.0,13.75,18.5,23.0,28.0
73,40.0,17.1,4.700245,11.0,13.0,16.0,20.0,29.0
74,27.0,22.703704,6.42001,13.0,16.0,24.0,27.0,32.0
75,30.0,20.266667,4.940566,13.0,16.0,19.5,23.0,33.0
76,34.0,21.573529,5.889297,13.0,16.75,21.0,26.375,33.0
77,28.0,23.375,6.675862,15.0,17.375,21.75,30.0,36.0
78,36.0,24.061111,6.898044,16.2,19.35,20.7,28.0,43.1
79,29.0,25.093103,6.794217,15.5,19.2,23.9,31.8,37.3


In [28]:
# Boolean mask with the same shape as `data`: True wherever a value is missing
data.isna()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
393,False,False,False,False,False,False,False,False,False
394,False,False,False,False,False,False,False,False,False
395,False,False,False,False,False,False,False,False,False
396,False,False,False,False,False,False,False,False,False


In [29]:
# Count the missing values in each column (True sums as 1)
data.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [30]:
# Per-column non-null counts and dtypes after the reload:
# horsepower is now float64 with 392 non-null entries out of 398.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 28.1+ KB


In [31]:
# Fill each column's missing entries with that column's minimum and summarise.
# The filled frame is not assigned, so `data` itself still contains the NaNs.
data.fillna(value=data.min()).describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,103.58794,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,38.859575,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,92.0,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [33]:
# Frequency of each distinct mpg value, most common first
data.loc[:, 'mpg'].value_counts()

13.0    20
14.0    19
18.0    17
15.0    16
26.0    14
        ..
31.9     1
16.9     1
18.2     1
22.3     1
44.0     1
Name: mpg, Length: 129, dtype: int64