In [1]:
import os
import pandas as pd

In [2]:
# Load the Auto MPG dataset straight from its hosted CSV file.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv"
)
# Plain-text preview of the first five rows.
print(df.head(5))

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0        130    3504          12.0    70   
1  15.0          8         350.0        165    3693          11.5    70   
2  18.0          8         318.0        150    3436          11.0    70   
3  16.0          8         304.0        150    3433          12.0    70   
4  17.0          8         302.0        140    3449          10.5    70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [3]:
# Limit the rendered table to at most 7 columns and 5 rows, then show the
# frame with the notebook's rich display.
pd.options.display.max_columns = 7
pd.options.display.max_rows = 5
display(df)

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,18.0,8,307.0,...,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,...,70,1,buick skylark 320
...,...,...,...,...,...,...,...
396,28.0,4,120.0,...,82,1,ford ranger
397,31.0,4,119.0,...,82,1,chevy s-10


In [4]:
# Build per-column summary statistics (mean, variance, standard deviation)
# for the numeric columns of the data frame.
#
# 'number' selects every numeric dtype. The previous ['int', 'float'] filter
# depends on how the platform/NumPy build resolves 'int', and can skip int64
# columns where the default integer is 32-bit.
# Note: this rebinds `df` to the numeric-only view of the data.
df = df.select_dtypes(include='number')
headers = list(df.columns.values)

# One record per numeric column; var() and std() are pandas' defaults
# (sample estimators, missing values skipped).
fields = [
    {
        'name': column,
        'mean': df[column].mean(),
        'var': df[column].var(),
        'sdev': df[column].std(),
    }
    for column in headers
]

for field in fields:
    print(field)

{'name': 'mpg', 'mean': 23.514572864321607, 'var': 61.089610774274405, 'sdev': 7.815984312565782}
{'name': 'cylinders', 'mean': 5.454773869346734, 'var': 2.893415439920003, 'sdev': 1.7010042445332119}
{'name': 'displacement', 'mean': 193.42587939698493, 'var': 10872.199152247384, 'sdev': 104.26983817119591}
{'name': 'weight', 'mean': 2970.424623115578, 'var': 717140.9905256763, 'sdev': 846.8417741973268}
{'name': 'acceleration', 'mean': 15.568090452261307, 'var': 7.604848233611383, 'sdev': 2.757688929812676}
{'name': 'year', 'mean': 76.01005025125629, 'var': 13.672442818627143, 'sdev': 3.697626646732623}
{'name': 'origin', 'mean': 1.5728643216080402, 'var': 0.6432920268850549, 'sdev': 0.8020548777266148}


In [5]:
# Turn the list of per-column statistics into a table and render it.
df2 = pd.DataFrame(data=fields)

# Both display limits are set to 0 before rendering.
pd.options.display.max_columns = 0
pd.options.display.max_rows = 0
display(df2)

Unnamed: 0,name,mean,var,sdev
0,mpg,23.514573,61.089611,7.815984
1,cylinders,5.454774,2.893415,1.701004
2,displacement,193.425879,10872.199152,104.269838
3,weight,2970.424623,717140.990526,846.841774
4,acceleration,15.56809,7.604848,2.757689
5,year,76.01005,13.672443,3.697627
6,origin,1.572864,0.643292,0.802055


In [6]:
# Reload the dataset, this time parsing the placeholders 'NA' and '?' as
# missing values (NaN).
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
)
# Report whether any horsepower entry is missing.
# (Label corrected: it previously misspelled the column as "hosepower".)
print(f"horsepower has na? {df['horsepower'].isna().any()}")


hosepower has na? True


In [7]:
# Fill missing horsepower values with the column median.
med = df['horsepower'].median()  # median() skips missing values by default
df['horsepower'] = df['horsepower'].fillna(med)
# Alternative: df = df.dropna() removes every row that contains a missing value.
# (Label corrected: it previously misspelled the column as "hosepower".)
print(f"horsepower has na? {df['horsepower'].isna().any()}")

hosepower has na? False


In [8]:
# Render the data frame (horsepower gaps already filled) with the notebook's rich display.
display(df)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425,10.0,70,1,pontiac catalina
...,...,...,...,...,...,...,...,...,...


In [9]:
# Function to remove outliers: drops rows whose value is 'sd' or more standard deviations away from the mean
import numpy as np
def remove_outliers(df, name, sd):
    """Drop, in place, the rows of ``df`` whose ``name`` value is an outlier.

    A row counts as an outlier when its value lies ``sd`` or more standard
    deviations (pandas' default sample standard deviation) away from the
    column mean. Matching rows are removed by index label and ``df`` itself
    is modified; nothing is returned.

    Args:
        df: pandas DataFrame to filter in place.
        name: label of the numeric column to test.
        sd: number of standard deviations used as the cut-off.
    """
    column = df[name]
    spread = column.std()
    # A constant column has zero spread, so every row would satisfy
    # "distance >= 0" and the whole frame would be dropped. A NaN spread
    # (empty or single-row column) can never match either. Neither case
    # contains outliers, so leave the frame untouched.
    if not spread > 0:
        return
    distance = np.abs(column - column.mean())
    drop_rows = df.index[distance >= sd * spread]
    df.drop(drop_rows, axis=0, inplace=True)

In [10]:
#Dropping every row which is 2 or more sd away from the mean
from sklearn import metrics
from scipy.stats import zscore

#Recall we have replaced missing values already, now we drop the name column
df.drop(columns='name', inplace=True)

#Drop outliers in MPG
print(f"Length before MPG outliers dropped: {len(df)}")
remove_outliers(df,'mpg',2)
# Label corrected: this count is taken after the outlier rows were removed
# (it previously said "before" a second time).
print(f"Length after MPG outliers dropped: {len(df)}")
display(df)

Length before MPG outliers dropped: 398
Length before MPG outliers dropped: 388


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
5,15.0,8,429.0,198.0,4341,10.0,70,1
6,14.0,8,454.0,220.0,4354,9.0,70,1
7,14.0,8,440.0,215.0,4312,8.5,70,1
8,14.0,8,455.0,225.0,4425,10.0,70,1
...,...,...,...,...,...,...,...,...


In [11]:
# Column-wise concatenation: put the name and horsepower Series side by side.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
)

col_name = df["name"]
col_horsepower = df["horsepower"]
result = pd.concat([col_name, col_horsepower], axis="columns")

# Display limits for the rendered result: 0 for columns, 5 for rows.
pd.options.display.max_columns = 0
pd.options.display.max_rows = 5
display(result)

Unnamed: 0,name,horsepower
0,chevrolet chevelle malibu,130.0
1,buick skylark 320,165.0
...,...,...
396,ford ranger,79.0
397,chevy s-10,82.0


In [12]:
# Row-wise concatenation: stack the first two rows on top of the last two.
result = pd.concat([df.head(2), df.tail(2)], axis="index")

# Display limits for the rendered result: 0 for columns, 5 for rows.
pd.options.display.max_columns = 0
pd.options.display.max_rows = 5
display(result)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,1,chevy s-10


In [14]:
#Splitting a data frame into train and validation sets
# A seeded generator makes the shuffle and the split identical on every run;
# the unseeded np.random calls produced a different split each execution.
rng = np.random.default_rng(42)
df = df.reindex(rng.permutation(df.index)) #randomly shuffling the rows
# Each row independently lands in the training set with probability 0.8,
# so the split is only approximately 80/20.
mask = rng.random(len(df)) < 0.8
trainDF = df[mask].copy()
validationDF = df[~mask].copy()

# Every row must end up in exactly one of the two frames.
assert len(trainDF) + len(validationDF) == len(df)

print(f"Training DF: {len(trainDF)}")
print(f"Validation DF: {len(validationDF)}")

Training DF: 311
Validation DF: 87
