# 데이터 표준화
## 단위환산

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Load the auto-mpg data set. header=None tells pandas that the first line of
# the file is data, so the columns initially get integer labels.
df = pd.read_csv('./auto-mpg.csv', header=None)

In [2]:
# Give the columns descriptive names, in file order.
df.columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'name',
]

In [3]:
# Conversion factor from miles-per-gallon to kilometres-per-litre:
# 1 mile = 1.60934 km and 1 US gallon = 3.78541 L.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.78541
mpg_to_kpl = KM_PER_MILE / LITERS_PER_GALLON

In [4]:
# Add a 'kpl' column holding the fuel economy converted from mpg, then show
# the frame with numeric columns rounded to two decimals.
df['kpl'] = mpg_to_kpl * df['mpg']
df.round(decimals=2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,kpl
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,7.65
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.38
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.65
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.80
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.23
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl,11.48
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup,18.71
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage,13.60
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger,11.90


In [5]:
# List the distinct values stored in the horsepower column.
df['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [6]:
# Missing horsepower values are encoded as the string '?'. Turn them into NaN,
# drop the affected rows, then convert the column from strings to floats.
#
# The replacement is assigned back to the column rather than done with
# df['horsepower'].replace(..., inplace=True): that form mutates an
# intermediate Series, is deprecated in pandas 2.x, and leaves df unchanged
# when Copy-on-Write is enabled (so the later astype('float') would fail on '?').
df['horsepower'] = df['horsepower'].replace('?', np.nan)
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')

df['horsepower']

0      130.0
1      165.0
2      150.0
3      150.0
4      140.0
       ...  
393     86.0
394     52.0
395     84.0
396     79.0
397     82.0
Name: horsepower, Length: 392, dtype: float64

In [7]:
# Inspect the origin codes (not displayed, since it is not the cell's last
# expression).
df['origin'].unique()

# Map the numeric origin codes to region labels. The result is assigned back
# to the column because a chained df['origin'].replace(..., inplace=True)
# works on an intermediate Series, is deprecated in pandas 2.x, and does not
# update df when Copy-on-Write is enabled.
df['origin'] = df['origin'].replace({1: 'USA', 2: 'EU', 3: 'JPN'})

In [8]:
# Store origin as a pandas categorical and display the column.
df['origin'] = df['origin'].astype(dtype='category')
df['origin']

0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: category
Categories (3, object): ['EU', 'JPN', 'USA']

In [9]:
# Convert origin from categorical back to plain strings and display it.
df['origin'] = df['origin'].astype(str)
df['origin']

0      USA
1      USA
2      USA
3      USA
4      USA
      ... 
393    USA
394     EU
395    USA
396    USA
397    USA
Name: origin, Length: 392, dtype: object

In [10]:
# Treat model year as a categorical value rather than a number, then display it.
df['model year'] = df['model year'].astype(dtype='category')
df['model year']

0      70
1      70
2      70
3      70
4      70
       ..
393    82
394    82
395    82
396    82
397    82
Name: model year, Length: 392, dtype: category
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]

## 범주형(카테고리) 데이터 처리

In [11]:
# Split the horsepower range into 3 equal-width bins. np.histogram returns
# the number of rows per bin and the bin edges; the edges are shown below.
count, bin_dividers = np.histogram(df['horsepower'], bins=3)
bin_dividers

array([ 46.        , 107.33333333, 168.66666667, 230.        ])

In [12]:
# Per-bin row counts returned by np.histogram for the 3 horsepower bins.
count

array([257, 103,  32], dtype=int64)

In [13]:
# Labels for the three bins, in ascending order of horsepower
# (low / medium / high output).
bin_names = ['저출력', '보통출력', '고출력']

# Assign each row to a bin using the edges computed by np.histogram.
# include_lowest=True closes the first interval on the left so that the
# minimum horsepower value is binned instead of becoming NaN.
df['hp_bin'] = pd.cut(
    df['horsepower'],
    bin_dividers,
    include_lowest=True,
    labels=bin_names,
)

# Show a random sample of 15 rows to check the binning.
df.loc[:, ['horsepower', 'hp_bin']].sample(n=15)

Unnamed: 0,horsepower,hp_bin
179,98.0,저출력
226,105.0,저출력
230,170.0,고출력
140,150.0,보통출력
43,170.0,고출력
201,110.0,보통출력
210,108.0,보통출력
116,230.0,고출력
211,120.0,보통출력
165,110.0,보통출력


## 더미 변수

In [14]:
# One-hot encode the horsepower bins: one indicator column per bin label.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; since pandas 2.0 the default is bool, which would show True/False.
horsepower_dummies = pd.get_dummies(df['hp_bin'], dtype=np.uint8)
horsepower_dummies.head(15)

Unnamed: 0,저출력,보통출력,고출력
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,0,1


## 정규화

In [15]:
# Summary statistics of horsepower before any scaling.
df['horsepower'].describe()

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64

In [16]:
# Max-abs scaling: divide by the largest absolute value so the column lies in
# [-1, 1]. The divisor is max(|x|), not |max(x)|; the two agree for
# all-positive data such as horsepower but differ once negatives are present.
df['horsepower'] = df['horsepower'] / df['horsepower'].abs().max()

In [17]:
# First rows of horsepower after dividing by the maximum.
df['horsepower'].head()

0    0.565217
1    0.717391
2    0.652174
3    0.652174
4    0.608696
Name: horsepower, dtype: float64

In [18]:
# Summary statistics of horsepower after dividing by the maximum.
df['horsepower'].describe()

count    392.000000
mean       0.454215
std        0.167353
min        0.200000
25%        0.326087
50%        0.406522
75%        0.547826
max        1.000000
Name: horsepower, dtype: float64

In [19]:
# Min-max scaling: (x - min) / (max - min) maps the column onto [0, 1].
min_x = df['horsepower'] - df['horsepower'].min()      # offset from the minimum
min_max = df['horsepower'].max() - df['horsepower'].min()  # full value range
df['horsepower'] = min_x / min_max

In [20]:
# First rows of horsepower after min-max scaling.
df['horsepower'].head()

0    0.456522
1    0.646739
2    0.565217
3    0.565217
4    0.510870
Name: horsepower, dtype: float64

In [21]:
# Summary statistics of horsepower after min-max scaling.
df['horsepower'].describe()

count    392.000000
mean       0.317768
std        0.209191
min        0.000000
25%        0.157609
50%        0.258152
75%        0.434783
max        1.000000
Name: horsepower, dtype: float64