# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
# Remote location of the automobile data set.
filename = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"

# Column names to assign to the CSV columns, in file order.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

# Read the file with the names above, then turn every "?" placeholder into NaN
# so pandas recognizes those entries as missing values.
df = pd.read_csv(filename, names=headers).replace("?", np.nan)

<h3>Identify and handle missing values</h3>
<p>When a value is missing (NaN), we can either remove the affected rows or replace the missing value with the column's mean or mode.</p>

In [2]:
# Remove: drop every row that has no price.
df.dropna(subset=['price'], axis=0, inplace=True)
# Reset the index so it is contiguous again after the rows were dropped.
df.reset_index(drop=True, inplace=True)

# Replace: fill the remaining gaps with the column average (the mode is the alternative).
# Convert to float first so the mean can be computed and the column ends up numeric.
normalized_losses = df['normalized-losses'].astype('float')
avg = normalized_losses.mean(axis=0)
# Most frequent value of the unconverted column; kept for comparison, not used for the fill.
mode = df['normalized-losses'].value_counts().idxmax()
# Assign the filled column back to the frame. Calling replace(..., inplace=True) on a
# selected column is chained assignment and is not guaranteed to modify df itself.
df['normalized-losses'] = normalized_losses.fillna(avg)
print("Replaced normalized-losses NaN with media, avg:", avg)

Replaced normalized-losses NaN with media, avg: 122.0


<h3>Data standardization</h3>
<p>Change the units or format of a column (data wrangling).</p>

In [3]:
# Fuel consumption in L/100km is obtained by dividing this factor by the mpg value.
MPG_TO_L_PER_100KM = 235

# Add the converted city fuel consumption as a new column.
df['city-L/100km'] = MPG_TO_L_PER_100KM / df['city-mpg']

<h3>Data Normalization</h3>
<p>Techniques: simple feature scaling, min-max scaling, and z-score standardization.</p>

In [4]:
# Compute every technique from the same untouched column so that each result
# shows what its formula does on its own, instead of feeding one into the next.
length_raw = df['length']

# Simple feature scaling: divide by the maximum value.
length_scaled = length_raw / length_raw.max()
# Min-Max: shift and rescale so the values span the range [0, 1].
length_min_max = (length_raw - length_raw.min()) / (length_raw.max() - length_raw.min())
# z-score: center on the mean and express in units of standard deviation.
length_z_score = (length_raw - length_raw.mean()) / length_raw.std()

# Store the z-score version. Chaining the three steps on the column gives the
# same values, because the z-score is unaffected by the earlier positive rescalings.
df['length'] = length_z_score

<h3>Binning</h3>
<p>Convert continuous data into discrete categorical groups (bins).</p>

In [5]:
from matplotlib import pyplot

# Binning needs numeric values: drop the rows without horsepower, then cast to int.
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype(int)

# Four equally spaced edges between the smallest and largest value give three equal-width bins.
bins = np.linspace(df['horsepower'].min(), df['horsepower'].max(), 4)
names = ['Low', 'Medium', 'High']
df['horsepower-binned'] = pd.cut(df['horsepower'], bins, labels=names, include_lowest=True)

# value_counts() orders its result by frequency, not by bin, so look each count
# up by its label to guarantee every bar is drawn above the matching name.
bin_counts = df['horsepower-binned'].value_counts()
pyplot.bar(names, [bin_counts[label] for label in names])
# set x/y labels and plot title
pyplot.xlabel("horsepower")
pyplot.ylabel("count")
pyplot.title("horsepower bins")

# or you could just use pyplot.hist(df["horsepower"], bins = 3) to visualize

Text(0.5, 1.0, 'horsepower bins')

<h3>Indicator variable (or dummy variable)</h3>

In [6]:
# One indicator column per fuel type. dtype=int keeps the indicators as 0/1
# regardless of the pandas version (newer versions default to True/False).
dummy_variable_1 = pd.get_dummies(df["fuel-type"], dtype=int)
# Append the indicator columns to "df" and drop the original "fuel-type" column,
# which they now replace.
df = pd.concat([df, dummy_variable_1], axis=1).drop(columns="fuel-type")
df

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,-0.438315,...,9.0,111,5000,21,27,13495,11.190476,Low,0,1
1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,-0.438315,...,9.0,111,5000,21,27,16500,11.190476,Low,0,1
2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,-0.243544,...,9.0,154,5000,19,26,16500,12.368421,Medium,0,1
3,2,164,audi,std,four,sedan,fwd,front,99.8,0.194690,...,10.0,102,5500,24,30,13950,9.791667,Low,0,1
4,2,164,audi,std,four,sedan,4wd,front,99.4,0.194690,...,8.0,115,5500,18,22,17450,13.055556,Low,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95,volvo,std,four,sedan,rwd,front,109.1,1.184775,...,9.5,114,5400,23,28,16845,10.217391,Low,0,1
197,-1,95,volvo,turbo,four,sedan,rwd,front,109.1,1.184775,...,8.7,160,5300,19,25,19045,12.368421,Medium,0,1
198,-1,95,volvo,std,four,sedan,rwd,front,109.1,1.184775,...,8.8,134,5500,18,23,21485,13.055556,Medium,0,1
199,-1,95,volvo,turbo,four,sedan,rwd,front,109.1,1.184775,...,23.0,106,4800,26,27,22470,9.038462,Low,1,0
