In [3]:
import numpy as np
import pandas as pd
from sklearn import datasets
from io import StringIO

In [4]:
csv_data = '''A,B,C,D,E
            5.0,2.0,3.0,,6
            1.0,6.0,,8.0,5
            0.0,11.0,12.0,4.0,5
            3.0,,3.0,5.0,
            5.0,1.0,4.0,2.0,4
           '''

In [5]:
df = pd.read_csv(StringIO(csv_data))

In [6]:
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


# Missing data(空值資料處理)

In [7]:
#任一有空值即刪除
df.dropna()

Unnamed: 0,A,B,C,D,E
2,0.0,11.0,12.0,4.0,5.0
4,5.0,1.0,4.0,2.0,4.0


In [8]:
#都有空值才刪除
df.dropna(how='all')

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [11]:
#條件符合才刪除
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [12]:
df.fillna(0)

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,0.0,6.0
1,1.0,6.0,0.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,0.0,3.0,5.0,0.0
4,5.0,1.0,4.0,2.0,4.0


In [13]:
#補平均值
df['B'] = df['B'].fillna(df['B'].mean())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [14]:
#補眾數
df['C'] = df['C'].fillna(df['C'].mode())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [15]:
#補中位數
df['D'] = df['D'].fillna(df['D'].median())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [16]:
#補最小值
df['E'] = df['E'].fillna(df['E'].min())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,4.0
4,5.0,1.0,4.0,2.0,4.0


# Categorical Data(類別資料處理)

In [32]:
df2 = pd.DataFrame(
    [['green', 'M', 10.1, 1],
    ['red', 'L', 13.5, 2],
    ['blue', 'XL', 15.3, 1]]
)
df2.columns = ['color', 'size', 'price', 'classlabel']
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,2
2,blue,XL,15.3,1


# One-hot encoding

In [33]:
size_mapping={
    'XL':3,
    'L':2,
    'M':1
}
df2['size']=df2['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,2
2,blue,3,15.3,1


In [34]:
#One-hot encoding(get_dummies()函式)
pd.get_dummies(df2['color'])

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [35]:
onehot_encoding = pd.get_dummies(df2['color'], prefix = 'color')

In [36]:
df2=df2.drop('color',1)
df2

Unnamed: 0,size,price,classlabel
0,1,10.1,1
1,2,13.5,2
2,3,15.3,1


In [37]:
pd.concat([onehot_encoding, df2],axis=1)

Unnamed: 0,color_blue,color_green,color_red,size,price,classlabel
0,0,1,0,1,10.1,1
1,0,0,1,2,13.5,2
2,1,0,0,3,15.3,1


# Feature Scaling

## 資料正規化(normalization)

In [38]:
from IPython.display import Math

In [39]:
Math(r'x^{(i)}_{norm}=\frac{x^{(i)}-x_{min}}{x_{max}-x_{min}}')

<IPython.core.display.Math object>

In [40]:
iris = datasets.load_iris()
x = pd.DataFrame(iris['data'], columns=iris['feature_names'])
print("target_names: "+str(iris['target_names']))
y = pd.DataFrame(iris['target'], columns=['target_names'])
data = pd.concat([x,y], axis=1)
data.head(3)

target_names: ['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [41]:
data['sepal length (cm)']=(data['sepal length (cm)']-data['sepal length (cm)'].min())/\
                          (data['sepal length (cm)'].max()-data['sepal length (cm)'].min())
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,0.222222,3.5,1.4,0.2,0
1,0.166667,3.0,1.4,0.2,0
2,0.111111,3.2,1.3,0.2,0
3,0.083333,3.1,1.5,0.2,0
4,0.194444,3.6,1.4,0.2,0


## 標準化(Standardization)

In [43]:
Math(r'x^{(i)}_{std}=\frac{x^{(i)}-mu_{x}}{\sigma_{min}}')

<IPython.core.display.Math object>

In [42]:
data['sepal width (cm)'] = (data['sepal width (cm)'] - data['sepal width (cm)'].mean())/\
                            (data['sepal width (cm)'].std())
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,0.222222,1.015602,1.4,0.2,0
1,0.166667,-0.131539,1.4,0.2,0
2,0.111111,0.327318,1.3,0.2,0
3,0.083333,0.097889,1.5,0.2,0
4,0.194444,1.24503,1.4,0.2,0
