# 원-핫 인코딩과 Min-Max 스케일링

In [1]:
import pandas as pd

# Toy single-column frame of appliance names; '선풍기' and '믹서' each appear
# twice, so the column has 8 rows but only 6 distinct categories.
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']
df = pd.DataFrame(data=items, columns=['item'])
df.head()

Unnamed: 0,item
0,TV
1,냉장고
2,전자렌지
3,컴퓨터
4,선풍기


In [2]:
# One-hot encode every column of `df` (here just 'item'), producing one
# indicator column per distinct value, named '<column>_<value>'.
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every
# pandas version; since pandas 2.0 the default dummy dtype is bool, which
# would display True/False instead.
one_hot_result = pd.get_dummies(df, dtype='uint8')
one_hot_result.head()

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자렌지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0


# Sales Data 원-핫 인코딩

In [3]:
import numpy as np 
import pandas as pd 

In [4]:
# Load the sales table from a CSV given by relative path, then display it.
sales_csv_path = 'sales_na.csv'
sales_df = pd.read_csv(sales_csv_path)
sales_df

Unnamed: 0,flag,gender,education,house_val,age,online,customer_psy,marriage,child,occupation,mortgage,house_owner,region,car_prob,fam_income
0,Y,M,4. Grad,756460,1_Unk,N,B,Unknown,U,Professional,1Low,Unknown,Midwest,1,L
1,N,F,3. Bach,213171,7_>65,N,E,Unknown,U,Professional,1Low,Owner,Northeast,3,G
2,N,M,2. Some College,111147,2_<=25,Y,C,Unknown,Y,Professional,1Low,Owner,Midwest,1,J
3,Y,M,2. Some College,354151,2_<=25,Y,B,Single,U,Sales/Service,1Low,Unknown,West,2,L
4,Y,F,2. Some College,117087,1_Unk,Y,J,Married,Y,Sales/Service,1Low,Unknown,South,7,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,Y,F,3. Bach,0,7_>65,Y,C,Unknown,U,Retired,1Low,Unknown,South,3,F
39996,N,F,1. HS,213596,4_<=45,N,I,Married,U,Blue Collar,1Low,Owner,South,1,D
39997,Y,M,0. <HS,134070,3_<=35,Y,F,Married,U,Sales/Service,1Low,Owner,Midwest,4,E
39998,N,M,1. HS,402210,7_>65,Y,E,Unknown,Y,Sales/Service,1Low,Unknown,West,2,B


In [5]:
# Count the missing (NaN/None) entries in each column.
sales_df.isna().sum()

flag            0
gender          0
education       0
house_val       0
age             0
online          0
customer_psy    0
marriage        0
child           0
occupation      0
mortgage        0
house_owner     0
region          0
car_prob        0
fam_income      0
dtype: int64

## Data Encoding - One-Hot Encoding

In [6]:
# Print each column name followed by the distinct values it contains, to see
# which columns are categorical and what levels they take.
for col_name, col_values in sales_df.items():
    print(col_name, col_values.unique())

flag ['Y' 'N']
gender ['M' 'F' 'U']
education ['4. Grad' '3. Bach' '2. Some College' '1. HS' '0. <HS' 'Unknown']
house_val [756460 213171 111147 ... 120630 603554 213596]
age ['1_Unk' '7_>65' '2_<=25' '6_<=65' '5_<=55' '4_<=45' '3_<=35']
online ['N' 'Y']
customer_psy ['B' 'E' 'C' 'J' 'A' 'G' 'F' 'I' 'D' 'H' 'U']
marriage ['Unknown' 'Single' 'Married']
child ['U' 'Y' 'N' '0']
occupation ['Professional' 'Sales/Service' 'Blue Collar' 'Others' 'Retired' 'Farm']
mortgage ['1Low' '2Med' '3High']
house_owner ['Unknown' 'Owner' 'Renter']
region ['Midwest' 'Northeast' 'West' 'South' 'Rest']
car_prob [1 3 2 7 5 6 9 8 4 0]
fam_income ['L' 'G' 'J' 'H' 'C' 'I' 'D' 'E' 'A' 'F' 'B' 'K' 'U']


In [7]:
# Encode the two binary Y/N columns as integers (N -> 0, Y -> 1).
#
# The result is assigned back to the frame instead of calling
# `sales_df[col].replace(..., inplace=True)`: an in-place call on a selected
# column is chained assignment, which recent pandas warns about and which
# does not update `sales_df` when copy-on-write is active.
#
# `.astype('int64')` makes the cell fail loudly if a column holds anything
# other than 'N'/'Y' (unmapped values become NaN, which cannot be cast).
yes_no_codes = {'N': 0, 'Y': 1}
for binary_col in ('flag', 'online'):
    sales_df[binary_col] = sales_df[binary_col].map(yes_no_codes).astype('int64')

In [8]:
# Re-inspect the distinct values per column to check the Y/N recoding.
for col_name in sales_df.columns:
    distinct_values = sales_df[col_name].unique()
    print(col_name, distinct_values)

flag [1 0]
gender ['M' 'F' 'U']
education ['4. Grad' '3. Bach' '2. Some College' '1. HS' '0. <HS' 'Unknown']
house_val [756460 213171 111147 ... 120630 603554 213596]
age ['1_Unk' '7_>65' '2_<=25' '6_<=65' '5_<=55' '4_<=45' '3_<=35']
online [0 1]
customer_psy ['B' 'E' 'C' 'J' 'A' 'G' 'F' 'I' 'D' 'H' 'U']
marriage ['Unknown' 'Single' 'Married']
child ['U' 'Y' 'N' '0']
occupation ['Professional' 'Sales/Service' 'Blue Collar' 'Others' 'Retired' 'Farm']
mortgage ['1Low' '2Med' '3High']
house_owner ['Unknown' 'Owner' 'Renter']
region ['Midwest' 'Northeast' 'West' 'South' 'Rest']
car_prob [1 3 2 7 5 6 9 8 4 0]
fam_income ['L' 'G' 'J' 'H' 'C' 'I' 'D' 'E' 'A' 'F' 'B' 'K' 'U']


In [9]:
# Print each column's dtype and non-null count, to see which columns are
# still non-numeric (object) and need encoding.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   flag          40000 non-null  int64 
 1   gender        40000 non-null  object
 2   education     40000 non-null  object
 3   house_val     40000 non-null  int64 
 4   age           40000 non-null  object
 5   online        40000 non-null  int64 
 6   customer_psy  40000 non-null  object
 7   marriage      40000 non-null  object
 8   child         40000 non-null  object
 9   occupation    40000 non-null  object
 10  mortgage      40000 non-null  object
 11  house_owner   40000 non-null  object
 12  region        40000 non-null  object
 13  car_prob      40000 non-null  int64 
 14  fam_income    40000 non-null  object
dtypes: int64(4), object(11)
memory usage: 4.6+ MB


In [10]:
# Columns to one-hot encode: every column except the ones that are already
# numeric (the 0/1 flags and the two integer-valued columns).
already_numeric = {'flag', 'house_val', 'online', 'car_prob'}
features = [col for col in sales_df.columns if col not in already_numeric]
features

['gender',
 'education',
 'age',
 'customer_psy',
 'marriage',
 'child',
 'occupation',
 'mortgage',
 'house_owner',
 'region',
 'fam_income']

In [11]:
# Replace each column listed in `features` with its one-hot indicator
# columns (named '<column>_<value>'); all other columns are carried over.
sales_df = pd.get_dummies(data=sales_df, columns=features, prefix_sep='_')
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 68 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   flag                       40000 non-null  int64
 1   house_val                  40000 non-null  int64
 2   online                     40000 non-null  int64
 3   car_prob                   40000 non-null  int64
 4   gender_F                   40000 non-null  uint8
 5   gender_M                   40000 non-null  uint8
 6   gender_U                   40000 non-null  uint8
 7   education_0. <HS           40000 non-null  uint8
 8   education_1. HS            40000 non-null  uint8
 9   education_2. Some College  40000 non-null  uint8
 10  education_3. Bach          40000 non-null  uint8
 11  education_4. Grad          40000 non-null  uint8
 12  education_Unknown          40000 non-null  uint8
 13  age_1_Unk                  40000 non-null  uint8
 14  age_2_<=25            

In [12]:
# Cast every column to float64 so the whole frame has a single numeric dtype,
# then confirm the dtypes and look at the raw spread of `house_val`.
sales_df = sales_df.astype(float)
sales_df.info()
sales_df.loc[:, ['house_val']].describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 68 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   flag                       40000 non-null  float64
 1   house_val                  40000 non-null  float64
 2   online                     40000 non-null  float64
 3   car_prob                   40000 non-null  float64
 4   gender_F                   40000 non-null  float64
 5   gender_M                   40000 non-null  float64
 6   gender_U                   40000 non-null  float64
 7   education_0. <HS           40000 non-null  float64
 8   education_1. HS            40000 non-null  float64
 9   education_2. Some College  40000 non-null  float64
 10  education_3. Bach          40000 non-null  float64
 11  education_4. Grad          40000 non-null  float64
 12  education_Unknown          40000 non-null  float64
 13  age_1_Unk                  40000 non-null  flo

Unnamed: 0,house_val
count,40000.0
mean,307213.8
std,422214.6
min,0.0
25%,80657.25
50%,214872.0
75%,393762.0
max,9999999.0


## Min-Max Scaling for numeric variables

In [13]:
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on every column of `sales_df` and rescale in one step.
# `scaler` stays fitted and in scope; the result is a NumPy array, not a
# DataFrame.
scaler = MinMaxScaler()
sales_df_scaled = scaler.fit_transform(sales_df)

In [14]:
# Display the scaled values. The scaler returned a plain NumPy array, so no
# column labels are shown here.
sales_df_scaled

array([[1.        , 0.07564601, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.0213171 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.0111147 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.013407  , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.040221  , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.08360301, 1.        , ..., 0.        , 0.        ,
        0.        ]])

In [15]:
# List the column labels of the encoded frame, in order.
list(sales_df.columns)

['flag',
 'house_val',
 'online',
 'car_prob',
 'gender_F',
 'gender_M',
 'gender_U',
 'education_0. <HS',
 'education_1. HS',
 'education_2. Some College',
 'education_3. Bach',
 'education_4. Grad',
 'education_Unknown',
 'age_1_Unk',
 'age_2_<=25',
 'age_3_<=35',
 'age_4_<=45',
 'age_5_<=55',
 'age_6_<=65',
 'age_7_>65',
 'customer_psy_A',
 'customer_psy_B',
 'customer_psy_C',
 'customer_psy_D',
 'customer_psy_E',
 'customer_psy_F',
 'customer_psy_G',
 'customer_psy_H',
 'customer_psy_I',
 'customer_psy_J',
 'customer_psy_U',
 'marriage_Married',
 'marriage_Single',
 'marriage_Unknown',
 'child_0',
 'child_N',
 'child_U',
 'child_Y',
 'occupation_Blue Collar',
 'occupation_Farm',
 'occupation_Others',
 'occupation_Professional',
 'occupation_Retired',
 'occupation_Sales/Service',
 'mortgage_1Low',
 'mortgage_2Med',
 'mortgage_3High',
 'house_owner_Owner',
 'house_owner_Renter',
 'house_owner_Unknown',
 'region_Midwest',
 'region_Northeast',
 'region_Rest',
 'region_South',
 'region_

In [16]:
# Wrap the scaled array back into a DataFrame, reusing the column labels of
# the frame it was computed from, and check its dimensions.
column_names = list(sales_df.columns)
sales_df_scaled_df = pd.DataFrame(sales_df_scaled, columns=column_names)
sales_df_scaled_df.shape

(40000, 68)

In [17]:
# Summary statistics of `house_val` after scaling.
sales_df_scaled_df.loc[:, ['house_val']].describe()

Unnamed: 0,house_val
count,40000.0
mean,0.030721
std,0.042221
min,0.0
25%,0.008066
50%,0.021487
75%,0.039376
max,1.0


In [18]:
# Write the scaled frame to 'sales_scaled.csv'; index=False keeps the row
# index out of the file.
sales_df_scaled_df.to_csv('sales_scaled.csv',index=False)