In [1]:
from sklearn.preprocessing import OneHotEncoder

In [2]:
data = {'Feature1': ['A', 'B', 'A', 'C'],
        'Feature2': ['High', 'Low', 'Medium', 'Low']}

In [3]:
import pandas as pd 

#### Encoding with One Hot Encoding 
- 범주형 데이터를 연속형 숫자 데이터로 변환
- 범주형 항목만큼 벡터화

In [4]:
df_data = pd.DataFrame(data)
df_data

Unnamed: 0,Feature1,Feature2
0,A,High
1,B,Low
2,A,Medium
3,C,Low


In [5]:
df_data.index, df_data.columns, df_data.values

(RangeIndex(start=0, stop=4, step=1),
 Index(['Feature1', 'Feature2'], dtype='object'),
 array([['A', 'High'],
        ['B', 'Low'],
        ['A', 'Medium'],
        ['C', 'Low']], dtype=object))

In [6]:
type(df_data.values) # array

numpy.ndarray

In [7]:
oneHotEncoder = OneHotEncoder() # 인스턴스화

In [8]:
oneHotEncoder.fit(df_data[['Feature1']]) # 훈련

In [9]:
oneHotEncoder.categories_ # 분류된 범주 찾기
, oneHotEncoder.categories_

'oneHotEncoder.categories_'

In [10]:
encoder_array = oneHotEncoder.transform(df_data[['Feature1']]).toarray() #범부형이 연속형으로 변경이 이루어짐 
encoder_array

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [11]:
#pd.DataFrame(encoder_array, columns= oneHotEncoder.get_feature_names_out(['Feature1']))
## columns= oneHotEncoder.get_feature_names_out() # 임의로 생성

In [12]:
df_encoder = pd.DataFrame(encoder_array, columns= oneHotEncoder.categories_) 
##columns= oneHotEncoder.categories_  #categories로 컬럼명 생성 

#### concat

In [13]:
pd.concat([df_data, df_encoder], axis=1)

Unnamed: 0,Feature1,Feature2,"(A,)","(B,)","(C,)"
0,A,High,1.0,0.0,0.0
1,B,Low,0.0,1.0,0.0
2,A,Medium,1.0,0.0,0.0
3,C,Low,0.0,0.0,1.0


## Imbalanced Data Sampling

### Under Sampling : Tomek's Link 
- 데이터의 비율이 적은 쪽으로 한쪽의 데이터를 줄여주는 기법
- (Random Under Sampling / Tomek's Link / KNN ....)

In [16]:
from imblearn.under_sampling import TomekLinks
# 결과값은 numpy로 뱉어냄

In [17]:
from sklearn.datasets import make_classification

In [27]:
features, target = make_classification(n_classes=2, class_sep=2,
                    weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0,
                    n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

In [28]:
features.shape, target.shape

((1000, 20), (1000,))

from collections import Counter

In [20]:
from collections import Counter

In [29]:
Counter(target)

Counter({0: 300, 1: 700})

In [30]:
tomekLinks = TomekLinks() #인스턴스화
features_resaple, target_resample = tomekLinks.fit_resample(features, target) #학습

In [31]:
features_resaple.shape, target_resample.shape

((996, 20), (996,))

In [32]:
Counter(target_resample)

Counter({0: 300, 1: 696})

### Over Sampling : SMOTE
- 데이터의 비율이 많은 쪽으로 한쪽의 데이터를 늘려주는 기법
- (Random Over Sampling / SMOTE / ADASYN ...)
- 가상의 데이터를 넣는 작업이기에 위험함.

In [33]:
from imblearn.over_sampling import SMOTE

In [36]:
smote = SMOTE() #인스턴스화
features_over_sampling, target_over_sampling = smote.fit_resample(features, target) # 학습
features_over_sampling.shape, target_over_sampling.shape

((1400, 20), (1400,))

In [35]:
Counter(target_over_sampling)

Counter({0: 700, 1: 700})