In [4]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [5]:
# Load the raw auto-mpg table. The file has no header row, so the column
# names are assigned explicitly after reading.
auto_mpg = pd.read_csv('../data/auto-mpg.csv', header=None)
auto_mpg.columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model year', 'origin', 'name',
]

# Missing horsepower values are recorded as '?'. Turn them into NaN and make
# the column numeric. The result is assigned back to the column instead of
# calling replace(..., inplace=True) on the selection: that chained in-place
# form updates a temporary object under pandas Copy-on-Write and leaves the
# frame unchanged.
auto_mpg['horsepower'] = (
    auto_mpg['horsepower']
    .replace('?', np.nan)
    .astype('float')
)

# Discard the rows that have no horsepower value (original index is kept).
auto_mpg = auto_mpg.dropna(subset=['horsepower'])

print(auto_mpg.head())

    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0  3504.0          12.0   
1  15.0          8         350.0       165.0  3693.0          11.5   
2  18.0          8         318.0       150.0  3436.0          11.0   
3  16.0          8         304.0       150.0  3433.0          12.0   
4  17.0          8         302.0       140.0  3449.0          10.5   

   model year  origin                       name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  


In [6]:
# Split the horsepower range into three equal-width intervals and report how
# many cars fall into each one, together with the interval edges.
horsepower = auto_mpg['horsepower']
count, bin_dividers = np.histogram(horsepower, bins=3)
print(count, bin_dividers)
# Expected: [257 103  32] [ 46.         107.33333333 168.66666667 230.        ]

# Turn the numeric column into a categorical one using those edges.
# include_lowest=True keeps the minimum value inside the first interval.
hp_labels = ['저출력', '중간출력', '고출력']
auto_mpg['hp_bin'] = pd.cut(
    x=horsepower,
    bins=bin_dividers,
    labels=hp_labels,
    include_lowest=True,
)

[257 103  32] [ 46.         107.33333333 168.66666667 230.        ]


In [8]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the horsepower category: one indicator column per class.
one_hot = LabelBinarizer()
hp_bin_encoded = one_hot.fit_transform(auto_mpg['hp_bin'])
print(hp_bin_encoded)

# Column order of the indicator matrix above.
print(one_hot.classes_)

[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [0 1 0]
 [0 1 0]
 [0 1 0]]
['고출력' '저출력' '중간출력']


In [9]:
# One-hot encoding for samples that carry two or more labels each.
# A single row can therefore contain more than one 1.

from sklearn.preprocessing import MultiLabelBinarizer

multi_feature = [
    ("Java", "C++"),
    ("C++", "Python"),
    ("Java", "C#"),
    ("Java", "Kotlin"),
    ("Python", "Go"),
    ("Python", "R"),
]

one_hot_classes = MultiLabelBinarizer()
multi_feature_encoded = one_hot_classes.fit_transform(multi_feature)
print(multi_feature_encoded)

# Column order of the indicator matrix above.
print(one_hot_classes.classes_)

[[0 1 0 1 0 0 0]
 [0 1 0 0 0 1 0]
 [1 0 0 1 0 0 0]
 [0 0 0 1 1 0 0]
 [0 0 1 0 0 1 0]
 [0 0 0 0 0 1 1]]
['C#' 'C++' 'Go' 'Java' 'Kotlin' 'Python' 'R']


In [10]:
# Ordinal encoding by hand: each grade is mapped to an integer rank.
df = pd.DataFrame({"Score": ["저조", '보통', '보통', '저조', '우수', '매우우수']})
scale_mapper = {'저조': 1, '보통': 2, '우수': 3, '매우우수': 4}

# Series.map performs the dictionary lookup and yields an integer column
# directly. Series.replace with a str -> int mapping depends on implicit
# object-to-int downcasting, which pandas has deprecated.
# Note: a grade that is absent from scale_mapper becomes NaN here rather than
# being passed through unchanged.
df['encoder'] = df['Score'].map(scale_mapper)

print(df)

  Score  encoder
0    저조        1
1    보통        2
2    보통        2
3    저조        1
4    우수        3
5  매우우수        4


In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Mixing str and int in one array makes NumPy store every value as a string,
# so the second column is encoded as the categories '10', '15', '20'.
features = np.array([
    ['Low', 10],
    ['Normal', 20],
    ['High', 15],
])

# With default settings the categories of each column are sorted, so the
# codes follow lexicographic order (High=0, Low=1, Normal=2), not the
# Low < Normal < High meaning of the labels.
ordinal_encoder = OrdinalEncoder()
features_encoded = ordinal_encoder.fit_transform(features)
print(features_encoded)

# Per-column category lists; a value's position in its list is its code.
print(ordinal_encoder.categories_)

[[1. 0.]
 [2. 2.]
 [0. 1.]]
[array(['High', 'Low', 'Normal'], dtype='<U21'), array(['10', '15', '20'], dtype='<U21')]


In [25]:
from sklearn.neighbors import KNeighborsClassifier

# Fully observed rows: column 0 is the class to learn, columns 1-2 are features.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.16, 1.22],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose first column is missing and has to be predicted.
X_with_nan = np.array([
    [np.nan, 0.87, 0.31],
    [np.nan, -0.67, -0.22],
])

known_features, known_labels = X[:, 1:], X[:, 0]
missing_features = X_with_nan[:, 1:]

# Train a distance-weighted 3-nearest-neighbour classifier with the first
# column as the label and the remaining columns as features.
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(known_features, known_labels)

# Predict the missing first-column values.
imputed_values = trained_model.predict(missing_features)
print(imputed_values)

# Put the predictions back in front of the features they were predicted from.
X_with_imputed = np.hstack((imputed_values[:, np.newaxis], missing_features))
print(X_with_imputed)

# Stack the imputed rows on top of the rows that were used for training.
result = np.vstack((X_with_imputed, X))
print(result)


[0. 1.]
[[ 0.    0.87  0.31]
 [ 1.   -0.67 -0.22]]
[[ 0.    0.87  0.31]
 [ 1.   -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.16  1.22]
 [ 0.    1.22  1.27]
 [ 1.   -0.21 -1.19]]


In [27]:
# Alternative: fill each missing entry with the most frequent value of its column.
from sklearn.impute import SimpleImputer

# Incomplete rows first, followed by the fully observed rows.
X_complete = np.vstack([X_with_nan, X])
print(X_complete)

imputer = SimpleImputer(strategy='most_frequent')
# Left as the last bare expression so the notebook displays the filled array.
imputer.fit_transform(X_complete)

[[  nan  0.87  0.31]
 [  nan -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.16  1.22]
 [ 0.    1.22  1.27]
 [ 1.   -0.21 -1.19]]


array([[ 0.  ,  0.87,  0.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.16,  1.22],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [36]:
import re

# re.match only tries the pattern at the very start of the string:
# '1234' starts with a digit and yields a Match object, 'abc' yields None.
for candidate in ('1234', 'abc'):
    match = re.match('[0-9]', candidate)
    print(match)

<re.Match object; span=(0, 1), match='1'>
None


In [41]:
string = 'ㄴㅁㄹㄴㅇㄹㄴㅇㄹㄹㄴㄹㅇㄹㄴ!@#$123123123 12312321!#!#!@4ㅁㄹㄴㅇㄹㅁㄴㅇ123ㄹㄴㅇㄹㅁㄴ3!!!1'

# Step 1: delete every run of the digits 0-9.
p = re.compile(r'[0-9]+')
result = p.sub('', string)
print(result)

# Step 2: \W matches any character that is not a letter, digit or underscore,
# so punctuation and whitespace are removed as well.
# The pattern is a raw string: in a normal string literal '\W' is an invalid
# escape sequence, which Python reports with a warning.
p = re.compile(r'\W+')
result = p.sub('', result)
print(result)

ㄴㅁㄹㄴㅇㄹㄴㅇㄹㄹㄴㄹㅇㄹㄴ!@#$ !#!#!@ㅁㄹㄴㅇㄹㅁㄴㅇㄹㄴㅇㄹㅁㄴ!!!
ㄴㅁㄹㄴㅇㄹㄴㅇㄹㄹㄴㄹㅇㄹㄴㅁㄹㄴㅇㄹㅁㄴㅇㄹㄴㅇㄹㅁㄴ


In [45]:
import unicodedata
import sys

text_data = ['안녕하세요.', 'C & C++ .', 'Java .']

# Translation table for str.translate: every code point whose Unicode general
# category starts with 'P' (punctuation) maps to None, which deletes it.
# The range ends at sys.maxunicode + 1 so that the largest code point is
# examined too; range(sys.maxunicode) stops one short of it.
# NOTE: the misspelled name `punctutaion` is kept on purpose, because cells
# outside this view may refer to it.
punctutaion = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

# Symbols such as '+' are not in a 'P' category, so 'C++' keeps its plus signs.
result = [text.translate(punctutaion) for text in text_data]
print(result)

['안녕하세요', 'C  C++ ', 'Java ']
