# 레이블 인코딩

In [29]:
from sklearn.preprocessing import LabelEncoder

In [30]:
# Sample product names (duplicates included on purpose) used to demonstrate
# how LabelEncoder assigns one integer code per distinct label.
items = [
    'TV',
    '냉장고',
    '전자렌지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [31]:
# Learn the label -> integer mapping and encode the list in a single step;
# `encoder` stays fitted afterwards so it can be reused for decoding.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
print('인코딩 변환값:', labels)

인코딩 변환값: [0 1 4 5 3 3 2 2]


In [32]:
# classes_ lists the distinct labels seen during fitting; the position of a
# label in this array is the integer code it was encoded as.
fitted_classes = encoder.classes_
print('인코딩 클래스:', fitted_classes)

인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']


In [33]:
# Turn integer codes back into the original string labels.
codes_to_decode = [4, 5, 2, 0, 1, 1, 3, 3]
decoded_items = encoder.inverse_transform(codes_to_decode)
print('디코딩 원본 값:', decoded_items)

디코딩 원본 값: ['전자렌지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


# sales data 레이블 인코딩

In [2]:
import numpy as np 
import pandas as pd 

In [4]:
# Load the sales dataset from 'sales_na.csv' (path is relative to the
# current working directory) into a DataFrame.
sales_df = pd.read_csv('sales_na.csv')
# A bare name as the last expression of a notebook cell displays its value.
sales_df

Unnamed: 0,flag,gender,education,house_val,age,online,customer_psy,marriage,child,occupation,mortgage,house_owner,region,car_prob,fam_income
0,Y,M,4. Grad,756460,1_Unk,N,B,Unknown,U,Professional,1Low,Unknown,Midwest,1,L
1,N,F,3. Bach,213171,7_>65,N,E,Unknown,U,Professional,1Low,Owner,Northeast,3,G
2,N,M,2. Some College,111147,2_<=25,Y,C,Unknown,Y,Professional,1Low,Owner,Midwest,1,J
3,Y,M,2. Some College,354151,2_<=25,Y,B,Single,U,Sales/Service,1Low,Unknown,West,2,L
4,Y,F,2. Some College,117087,1_Unk,Y,J,Married,Y,Sales/Service,1Low,Unknown,South,7,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,Y,F,3. Bach,0,7_>65,Y,C,Unknown,U,Retired,1Low,Unknown,South,3,F
39996,N,F,1. HS,213596,4_<=45,N,I,Married,U,Blue Collar,1Low,Owner,South,1,D
39997,Y,M,0. <HS,134070,3_<=35,Y,F,Married,U,Sales/Service,1Low,Owner,Midwest,4,E
39998,N,M,1. HS,402210,7_>65,Y,E,Unknown,Y,Sales/Service,1Low,Unknown,West,2,B


In [5]:
# Shape of the DataFrame as a (row count, column count) tuple.
sales_df.shape

(40000, 15)

In [7]:
# dtype of every column; `object` columns hold the string categories that
# are converted to integer codes further below.
sales_df.dtypes

flag            object
gender          object
education       object
house_val        int64
age             object
online          object
customer_psy    object
marriage        object
child           object
occupation      object
mortgage        object
house_owner     object
region          object
car_prob         int64
fam_income      object
dtype: object

In [8]:
# Count missing values per column. isnull() only flags NaN/None, so
# placeholder strings such as 'Unknown' or 'U' are not counted as missing.
sales_df.isnull().sum()

flag            0
gender          0
education       0
house_val       0
age             0
online          0
customer_psy    0
marriage        0
child           0
occupation      0
mortgage        0
house_owner     0
region          0
car_prob        0
fam_income      0
dtype: int64

## Data Encoding - Label Encoding

In [39]:
# List the distinct values of every column to see which ones are
# categorical and what categories they contain.
for cat, column_values in sales_df.items():
    print(cat, column_values.unique())

flag ['Y' 'N']
gender ['M' 'F' 'U']
education ['4. Grad' '3. Bach' '2. Some College' '1. HS' '0. <HS' 'Unknown']
house_val [756460 213171 111147 ... 120630 603554 213596]
age ['1_Unk' '7_>65' '2_<=25' '6_<=65' '5_<=55' '4_<=45' '3_<=35']
online ['N' 'Y']
customer_psy ['B' 'E' 'C' 'J' 'A' 'G' 'F' 'I' 'D' 'H' 'U']
marriage ['Unknown' 'Single' 'Married']
child ['U' 'Y' 'N' '0']
occupation ['Professional' 'Sales/Service' 'Blue Collar' 'Others' 'Retired' 'Farm']
mortgage ['1Low' '2Med' '3High']
house_owner ['Unknown' 'Owner' 'Renter']
region ['Midwest' 'Northeast' 'West' 'South' 'Rest']
car_prob [1 3 2 7 5 6 9 8 4 0]
fam_income ['L' 'G' 'J' 'H' 'C' 'I' 'D' 'E' 'A' 'F' 'B' 'K' 'U']


In [40]:
# Binary-encode the Y/N columns as 1/0.
#
# The result is assigned back to the column instead of calling
# `sales_df[col].replace(..., inplace=True)`: that form runs an inplace
# method on a Series obtained through chained indexing, which raises a
# FutureWarning on pandas 2.x and leaves `sales_df` unmodified once
# Copy-on-Write is active (the default from pandas 3.0).
#
# Values that are not 'N'/'Y' are passed through unchanged, so the cell can
# be re-run safely after the columns have already been converted.
yn_to_int = {"N": 0, "Y": 1}
for binary_col in ("flag", "online"):
    sales_df[binary_col] = sales_df[binary_col].map(
        lambda value: yn_to_int.get(value, value)
    )

In [41]:
# Re-inspect the distinct values of every column to confirm the effect of
# the previous encoding step.
for cat, column_values in sales_df.items():
    print(cat, column_values.unique())

flag [1 0]
gender ['M' 'F' 'U']
education ['4. Grad' '3. Bach' '2. Some College' '1. HS' '0. <HS' 'Unknown']
house_val [756460 213171 111147 ... 120630 603554 213596]
age ['1_Unk' '7_>65' '2_<=25' '6_<=65' '5_<=55' '4_<=45' '3_<=35']
online [0 1]
customer_psy ['B' 'E' 'C' 'J' 'A' 'G' 'F' 'I' 'D' 'H' 'U']
marriage ['Unknown' 'Single' 'Married']
child ['U' 'Y' 'N' '0']
occupation ['Professional' 'Sales/Service' 'Blue Collar' 'Others' 'Retired' 'Farm']
mortgage ['1Low' '2Med' '3High']
house_owner ['Unknown' 'Owner' 'Renter']
region ['Midwest' 'Northeast' 'West' 'South' 'Rest']
car_prob [1 3 2 7 5 6 9 8 4 0]
fam_income ['L' 'G' 'J' 'H' 'C' 'I' 'D' 'E' 'A' 'F' 'B' 'K' 'U']


In [42]:
# Columns that are already numeric and therefore skipped by label encoding.
already_numeric = ('flag', 'house_val', 'online', 'car_prob')
# Every other column, in original column order, is a categorical feature.
features = [column for column in sales_df.columns if column not in already_numeric]
features

['gender',
 'education',
 'age',
 'customer_psy',
 'marriage',
 'child',
 'occupation',
 'mortgage',
 'house_owner',
 'region',
 'fam_income']

In [43]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every categorical column listed in `features`, in place.
#
# One fitted encoder is kept per column in `label_encoders`. Refitting a
# single shared encoder would discard each column's code -> label mapping as
# soon as the next column is fitted, leaving no way to decode the integers
# later, e.g. label_encoders['region'].inverse_transform(codes).
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even when `features` is empty
for feature in features:
    le = LabelEncoder()
    sales_df[feature] = le.fit_transform(sales_df[feature])
    label_encoders[feature] = le
# Preview the first rows of the encoded DataFrame.
sales_df.head()

Unnamed: 0,flag,gender,education,house_val,age,online,customer_psy,marriage,child,occupation,mortgage,house_owner,region,car_prob,fam_income
0,1,1,4,756460,0,0,1,2,2,3,0,2,0,1,11
1,0,0,3,213171,6,0,4,2,2,3,0,0,1,3,6
2,0,1,2,111147,1,1,2,2,3,3,0,0,0,1,9
3,1,1,2,354151,1,1,1,1,2,5,0,2,4,2,11
4,1,0,2,117087,0,1,9,0,3,5,0,2,3,7,7


In [44]:
# Write the encoded DataFrame to 'sales_l_encoding.csv'; index=False leaves
# the row index out of the file.
sales_df.to_csv('sales_l_encoding.csv',index=False)