In [53]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
import platform
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# matplotlib 기반의 그래프를 출력 시 한글 사용을 위한 설정

# system이 Mac이면,
if platform.system() == 'Darwin':
    rc('font', family='AppleGothic')

# 음수를 사용하기 위함
plt.rcParams['axes.unicode_minus'] = False

In [54]:
feature = np.array([[1, 2], [2, 3], [3, 8], [4, 2], [7, 2]])

# 정규화 객체
# L1를 norm에 적용하면 맨하튼 거리
# L2를 norm에 적용하면 유클리드 거리 - 각 값을 전체 데이터를 제곱해서 더한 값의 제곱근으로 나눈 값
# feature[1]: 1 / root(1^2 + 2^2), 2 / root(1^2 + 2^2)
normalizer = preprocessing.Normalizer(norm='l2')
l2_norm = normalizer.transform(feature)

print(l2_norm)

[[0.4472136  0.89442719]
 [0.5547002  0.83205029]
 [0.35112344 0.93632918]
 [0.89442719 0.4472136 ]
 [0.96152395 0.27472113]]


In [55]:
feature = np.array([[1, 2], [2, 3], [3, 8], [4, 2], [7, 2]])

# 제곱항까지의 다항을 생성 - 열의 개수가 늘어남
# 회귀 분석할 때 시간의 흐름에 따라 변화가 급격하게 일어나는 경우
# 데이터가 부족할 때 샘플 데이터를 추가하기 우해 사용
# 제곱하거나 곱하기를 하면 데이터의 특성 자체는 크게 변화하지 않기 때문에 사용.

polynomial = preprocessing.PolynomialFeatures(degree=3, include_bias=False)
result = polynomial.fit_transform(feature)

print(result)   

[[  1.   2.   1.   2.   4.   1.   2.   4.   8.]
 [  2.   3.   4.   6.   9.   8.  12.  18.  27.]
 [  3.   8.   9.  24.  64.  27.  72. 192. 512.]
 [  4.   2.  16.   8.   4.  64.  32.  16.   8.]
 [  7.   2.  49.  14.   4. 343.  98.  28.   8.]]


In [56]:
feature = np.array([[1, 2], [2, 3], [3, 8], [4, 2], [7, 2]])

#위의 데이터에 함수 적용
result1 = preprocessing.FunctionTransformer(lambda x : x + 1).transform(feature)
print(result1)

df = pd.DataFrame(feature, columns=["feature1", "feature2"])
print(df.apply(lambda x : x + 1).values)

def add_one(x):
    return x + 1

def sub_one(x):
    return x - 1

result2 = ColumnTransformer([   ("add_one", preprocessing.FunctionTransformer(add_one, validate=True), ['feature1']),
                                ("sub_one", preprocessing.FunctionTransformer(sub_one, validate=True), ['feature2'])]).fit_transform(df)

[[2 3]
 [3 4]
 [4 9]
 [5 3]
 [8 3]]
[[2 3]
 [3 4]
 [4 9]
 [5 3]
 [8 3]]


In [57]:
auto_mpg = pd.read_csv('../data/auto-mpg.csv', header=None)
auto_mpg.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'name']

auto_mpg['horsepower'].replace('?', np.nan, inplace=True)
auto_mpg.dropna(subset=['horsepower'], axis=0, inplace=True)
auto_mpg['horsepower'] = auto_mpg['horsepower'].astype('float')

In [58]:
# auto_mpg의 horsepower를 3개의 구간으로 분할
auto_mpg['horsepower'].describe()

# 경계값 찾기
count, bin_dividers = np.histogram(auto_mpg['horsepower'], bins=3)

print(count, bin_dividers)

# 46.0 ~ 107.3 : 257개
# 107.3 ~ 168.6 : 103개
# 168.6 ~ 230.0 : 32개

bin_names = ['저출력', '보통 출력', '고출력']

auto_mpg['hp_bin'] = pd.cut(x = auto_mpg['horsepower'],
                            bins=bin_dividers,
                            labels=bin_names,
                            include_lowest=True)
print(auto_mpg[['horsepower', 'hp_bin']].head(20))

[257 103  32] [ 46.         107.33333333 168.66666667 230.        ]
    horsepower hp_bin
0        130.0  보통 출력
1        165.0  보통 출력
2        150.0  보통 출력
3        150.0  보통 출력
4        140.0  보통 출력
5        198.0    고출력
6        220.0    고출력
7        215.0    고출력
8        225.0    고출력
9        190.0    고출력
10       170.0    고출력
11       160.0  보통 출력
12       150.0  보통 출력
13       225.0    고출력
14        95.0    저출력
15        95.0    저출력
16        97.0    저출력
17        85.0    저출력
18        88.0    저출력
19        46.0    저출력


In [59]:
result = np.digitize(auto_mpg['horsepower'], bins=[107.33333333, 168.66666667, 230.0], right=True)

print(result)

[1 1 1 1 1 2 2 2 2 2 2 1 1 2 0 0 0 0 0 0 0 0 0 1 0 2 2 2 2 0 0 0 0 0 0 0 0
 1 2 1 1 2 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 1 2 1 1 2 0 1 1 1
 1 1 0 0 0 0 0 0 0 0 2 1 1 1 1 2 1 1 1 2 2 2 0 0 0 0 0 0 1 1 2 2 0 0 0 0 0
 0 0 0 1 2 0 0 0 1 1 1 1 2 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 2 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 2 1 1 1 0 0 0 0 0 1 1 1
 1 1 0 0 0 2 2 2 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1
 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]


In [60]:
# sklearn의 binning(구간 분할)

age = np.array([[13], [30], [67], [36], [20], [33], [27]])

binarizer = preprocessing.Binarizer(threshold=30.0)
result = binarizer.transform(age)
print(result)

# 여러 개의 그룹으로 분할
# 4개의 그룹으로 일련번호 형태로 일정한 비율 분할
# strategy에 uniform을 설정하면간격을 일정하게 분할
# encode 가 ordinal이면 일련변호로 그룹이 생성
# onehot 을 설정하면 onehot, encoding을 한 후 최소 행렬로
# onehot-dense 를 설정하면 onehot encoding을 한 후 밀집 행렬로.

kb = preprocessing.KBinsDiscretizer(4, encode='ordinal', strategy='uniform')
result = kb.fit_transform(age)
print(result)

[[0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]]
[[0.]
 [1.]
 [3.]
 [1.]
 [0.]
 [1.]
 [1.]]




In [61]:
from sklearn.cluster import KMeans

sample = np.array([[13, 30], [30, 40], [67, 44], [26, 24], [22, 11], [98, 28]])
df = pd.DataFrame(sample, columns= ['feature_1', 'feature_2'])
print(df)

cluster = KMeans(3, random_state = 42)
cluster.fit(sample)
df['group'] = cluster.predict(sample)
print(df)

   feature_1  feature_2
0         13         30
1         30         40
2         67         44
3         26         24
4         22         11
5         98         28
   feature_1  feature_2  group
0         13         30      1
1         30         40      1
2         67         44      2
3         26         24      1
4         22         11      1
5         98         28      0


  super()._check_params_vs_input(X, default_n_init=10)


In [62]:
def outliers_z_score(ys):
    threshold = 3

    mean_y = np.mean(ys)
    stdev_y = np.std(ys)
    z_scores = [(y - mean_y) / stdev_y for y in ys]
    print(mean_y, stdev_y, z_scores, sep='\n', end='\n')
    return np.where(np.abs(z_scores) > threshold)

# 데이터가 12개 이하면 이상치가 없다고 판단함.
feature = np.array([[10, 10, 7, 6, 3], [1000000, 3, 23, 12, 11]])
print(outliers_z_score(feature))

100008.5
299997.16671570414
[array([-0.33333148, -0.33333148, -0.33334148, -0.33334481, -0.33335481]), array([ 3.        , -0.33335481, -0.33328815, -0.33332481, -0.33332815])]
(array([], dtype=int64), array([], dtype=int64))


In [63]:
def outliers_z_score(ys):
    threshold = 3.5

    # 평균에서 중앙값으로 변경
    # mean_y = np.mean(ys)
    median_y = np.median(ys)
    stdev_y = np.std(ys)
    z_scores = [0.6745 * (y - median_y) / stdev_y for y in ys]
    print(median_y, stdev_y, z_scores, sep='\n', end='\n')
    return np.where(np.abs(z_scores) > threshold)

# 데이터가 12개 이하면 이상치가 없다고 판단함.
feature = np.array([[10, 10, 7, 6, 3], [1000000, 3, 23, 12, 11]])
print(outliers_z_score(feature))

10.0
299997.16671570414
[array([ 0.00000000e+00,  0.00000000e+00, -6.74506370e-06, -8.99341827e-06,
       -1.57384820e-05]), array([ 2.24833208e+00, -1.57384820e-05,  2.92286094e-05,  4.49670913e-06,
        2.24835457e-06])]
(array([], dtype=int64), array([], dtype=int64))


In [64]:
def outliers_iqr(ys):
    #1사분위수와 3사분위 수 구하기
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1

    lower_bound = quartile_1 - (iqr * 0.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))

feature = np.array([[10, 10, 7, 6, 3], [1000000, 3, 23, 12, 11]])
print(outliers_iqr(feature))

(array([0, 1, 1, 1]), array([4, 0, 1, 2]))


In [65]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

feature, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=42)
print(feature)

feature[0, 0] = 10000
feature[0, 1] = 10000
print(feature)
outlier_detector = EllipticEnvelope(contamination=0.1)
outlier_detector.fit(feature)

# 이상치로 판단되면 -1을 리턴하고 그렇지 않으면 1을 리턴
outlier_detector.predict(feature)

[[-2.743351    8.78014917]
 [-3.4172217   7.60198243]
 [-3.52202874  9.32853346]
 [-2.26723535  7.10100588]
 [-2.97261532  8.54855637]
 [-1.04354885  8.78850983]
 [-1.86150908 10.53731598]
 [-2.97867201  9.55684617]
 [-4.23411546  8.4519986 ]
 [-0.92998481  9.78172086]]
[[ 1.00000000e+04  1.00000000e+04]
 [-3.41722170e+00  7.60198243e+00]
 [-3.52202874e+00  9.32853346e+00]
 [-2.26723535e+00  7.10100588e+00]
 [-2.97261532e+00  8.54855637e+00]
 [-1.04354885e+00  8.78850983e+00]
 [-1.86150908e+00  1.05373160e+01]
 [-2.97867201e+00  9.55684617e+00]
 [-4.23411546e+00  8.45199860e+00]
 [-9.29984808e-01  9.78172086e+00]]


array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [66]:
houses = pd.DataFrame()
houses['Price'] = [5000000, 390000, 290000, 5000000]
houses['Rooms'] = [2, 3, 5, 116]
houses['Feet'] = [1500, 2000, 1300, 20000]

# rooms 값이 20보다 크면 이상치로 간주하고 특성을 추가
houses['Outlier'] = np.where(houses['Rooms'] > 20, 1, 0)
print(houses)

houses['Log_Feet'] = [np.log(x) for x in houses['Feet']]
print(houses)

# Outlier의 영향향을 최소화 - 특성 변환(Scaling)
imsi = pd.DataFrame(houses['Rooms'])
scaler = preprocessing.RobustScaler()
scaler.fit(imsi)
houses['Scale_Rooms'] = scaler.transform(imsi)
print(houses)

     Price  Rooms   Feet  Outlier
0  5000000      2   1500        0
1   390000      3   2000        0
2   290000      5   1300        0
3  5000000    116  20000        1
     Price  Rooms   Feet  Outlier  Log_Feet
0  5000000      2   1500        0  7.313220
1   390000      3   2000        0  7.600902
2   290000      5   1300        0  7.170120
3  5000000    116  20000        1  9.903488
     Price  Rooms   Feet  Outlier  Log_Feet  Scale_Rooms
0  5000000      2   1500        0  7.313220    -0.066667
1   390000      3   2000        0  7.600902    -0.033333
2   290000      5   1300        0  7.170120     0.033333
3  5000000    116  20000        1  9.903488     3.733333


In [67]:
import seaborn as sns

titainic = sns.load_dataset('titanic')
print(titainic.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [69]:
# 결측치 삭제
# 각 컬럼의 None의 개수를 파악함.
print(titainic.isnull().sum(axis=0))

# 결측치의 개수가 200 개 이상인 컬럼 삭제
titainic_thresh = titainic.dropna(axis=1, thresh=200)
print(titainic_thresh)

result = titainic_thresh[['survived', 'pclass', 'sex', 'age', 'sibsp']]

# 결측치인 행만 제거함
result_age = titainic.dropna(subset=['age'], how='any', axis=0)
result_age.info()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   

In [73]:
titanic = sns.load_dataset('titanic')

print(titanic['embark_town'][825:831])
titanic['embark_town'].fillna(method='ffill')
print(titanic['embark_town'][825:831])

# 결측치가 몇 개 되지 않을 때는 대표값으로 대체
# 대표값으로 사용될 수 있는 데이터는 평균, 중간값, 최빈값 등
# 대표값으로 변환하는 경우 많은 양의 데이터를 변경하면 분석할 때 결과가 왜곡될 수 있음.

mode = titanic['embark_town'].value_counts()
# 가장 많이 출현한 데이터
# pirnt(mode. idxmax())
titanic['embark_town'].fillna(mode.idxmax(), inplace=True)
print(titanic['embark_town'][825:831])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
830      Cherbourg
Name: embark_town, dtype: object
825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
830      Cherbourg
Name: embark_town, dtype: object
825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
830      Cherbourg
Name: embark_town, dtype: object


In [75]:
# sklearn 의 SimpleImputer 이용
# 객체를 만들 때 strategy 옵션에 mean, median, most_frequent, constant를 설정
# constant를 설정하면 fill_value 옵션에 채울 값을 추가해우어야 한다.

from sklearn.impute import SimpleImputer
features = np.array([[100], [200], [300], [400], [500], [np.nan]])

# NaN값을 중간값으로 대체함.
simple_imputer = SimpleImputer(strategy='median')

print(simple_imputer.fit_transform(features))


[[100.]
 [200.]
 [300.]
 [400.]
 [500.]
 [300.]]


In [79]:
from fancyimpute import KNN

features = np.array([[100, 200], [200, 400], [300, 600], [400, 800], [200, np.nan]])
print(KNN(k=5, verbose=0).fit_transform(feature))


[[ 1.00000000e+04  1.00000000e+04]
 [-3.41722170e+00  7.60198243e+00]
 [-3.52202874e+00  9.32853346e+00]
 [-2.26723535e+00  7.10100588e+00]
 [-2.97261532e+00  8.54855637e+00]
 [-1.04354885e+00  8.78850983e+00]
 [-1.86150908e+00  1.05373160e+01]
 [-2.97867201e+00  9.55684617e+00]
 [-4.23411546e+00  8.45199860e+00]
 [-9.29984808e-01  9.78172086e+00]]
[[ 1.00000000e+04  1.00000000e+04]
 [-3.41722170e+00  7.60198243e+00]
 [-3.52202874e+00  9.32853346e+00]
 [-2.26723535e+00  7.10100588e+00]
 [-2.97261532e+00  8.54855637e+00]
 [-1.04354885e+00  8.78850983e+00]
 [-1.86150908e+00  1.05373160e+01]
 [-2.97867201e+00  9.55684617e+00]
 [-4.23411546e+00  8.45199860e+00]
 [-9.29984808e-01  9.78172086e+00]]


