### 데이터 전처리 실습1
    - 가상의 당뇨병 환자 데이터 생성 및 처리

#### 환경구성

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
np.random.seed(3)

data = {
    'age': np.random.randint(20, 80, 100),
    'gender':np.random.choice(['M', 'F'], 100),
    'bmi':np.random.uniform(18.5, 35, 100),
    'blood_pressure': np.random.randint(80, 180, 100),
    'glucose_level': np.random.uniform(70, 200, 100),
    'insulin': np.random.uniform(15, 300, 100),
    'diabetes': np.random.choice([0, 1], 100, p=[0.7, 0.3])
}

In [4]:
df = pd.DataFrame(data)
df

Unnamed: 0,age,gender,bmi,blood_pressure,glucose_level,insulin,diabetes
0,62,M,26.244792,106,190.578806,113.311737,0
1,44,M,26.791749,158,98.643723,127.504283,1
2,77,M,21.343988,102,102.750939,21.336088,0
3,23,M,23.423713,156,187.521621,203.198910,0
4,76,M,22.935321,91,114.917254,63.447155,0
...,...,...,...,...,...,...,...
95,72,M,23.180876,91,189.661025,238.511383,0
96,69,F,22.012757,104,125.139142,244.173089,0
97,70,M,25.434397,170,150.512546,39.657672,0
98,22,M,20.576047,168,96.403995,139.085175,0


#### 결측치 생성

In [5]:
df.loc[np.random.choice(df.index, 10), 'bmi'] = np.nan
df.loc[np.random.choice(df.index, 10), 'insulin'] = np.nan

In [9]:
df.tail(15)

Unnamed: 0,age,gender,bmi,blood_pressure,glucose_level,insulin,diabetes
85,38,M,25.91512,103,164.401839,45.737563,0
86,31,F,26.369374,176,140.717134,238.638316,1
87,75,F,27.018715,164,139.540293,157.159487,0
88,52,F,24.579924,98,130.389807,34.983919,0
89,51,M,22.830051,154,70.870248,249.567886,0
90,33,F,31.746662,169,121.021349,126.636631,0
91,57,F,33.362718,124,199.861997,45.428007,0
92,50,M,24.457882,111,176.637857,64.532557,0
93,68,F,33.087067,171,144.471892,259.938296,1
94,53,M,23.5511,156,95.540185,64.231413,1


In [10]:
# 결측치 개수 확인
df.isnull().sum()

age                0
gender             0
bmi                8
blood_pressure     0
glucose_level      0
insulin           10
diabetes           0
dtype: int64

#### Numeric, Categorical data

In [12]:
df.describe()

Unnamed: 0,age,bmi,blood_pressure,glucose_level,insulin,diabetes
count,100.0,92.0,100.0,100.0,90.0,100.0
mean,51.43,26.884778,129.19,136.01846,165.077223,0.31
std,17.624166,4.710787,30.636248,37.279935,79.962184,0.464823
min,20.0,18.864581,80.0,70.276323,18.728601,0.0
25%,37.75,22.931775,102.75,102.694345,112.568564,0.0
50%,52.0,26.317681,128.0,139.706647,162.340935,0.0
75%,68.0,31.410877,156.5,165.770543,226.980967,1.0
max,79.0,34.999595,179.0,199.861997,298.309161,1.0


In [13]:
numeric_feature = ['age', 'bmi', 'blood_pressure', 'glucose_level', 'insulin']
categorical_feature = ['gender']

#### Pipeline 구성

In [14]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [16]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first'))
])

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feature),
        ('cat', categorical_transformer, categorical_feature)
    ]
)

#### Pipeline 실행

In [19]:
X = df.drop('diabetes', axis=1)
y = df['diabetes']
X_processed = preprocessor.fit_transform(X)

#### Result

In [20]:
feature_names = (numeric_feature + preprocessor.named_transformers_['cat']
                .named_steps['onehot']
                .get_feature_names_out(categorical_feature).tolist())

X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

In [21]:
X_processed_df

Unnamed: 0,age,bmi,blood_pressure,glucose_level,insulin,gender_M
0,0.602766,-0.132242,-0.760760,1.470904,-0.682548,1.0
1,-0.423704,-0.010600,0.945127,-1.007594,-0.494419,1.0
2,1.458158,-1.222173,-0.891982,-0.896866,-1.901727,1.0
3,-1.621253,-0.759645,0.879516,1.388485,0.508947,1.0
4,1.401132,-0.868263,-1.252843,-0.568872,-1.343526,1.0
...,...,...,...,...,...,...
95,1.173027,-0.116032,-1.252843,1.446161,0.977030,1.0
96,1.001949,-1.073440,-0.826371,-0.293298,1.052079,0.0
97,1.058975,-0.312473,1.338793,0.390749,-1.658866,1.0
98,-1.678279,-1.392961,1.273182,-1.067975,-0.340909,1.0


#### 상관관계 분석

In [22]:
correlation = X_processed_df.corr()
correlation

Unnamed: 0,age,bmi,blood_pressure,glucose_level,insulin,gender_M
age,1.0,-0.077216,-0.000433,0.031987,0.018829,-0.033992
bmi,-0.077216,1.0,0.020765,-0.016169,0.113527,-0.212271
blood_pressure,-0.000433,0.020765,1.0,-0.10852,0.098721,-0.101271
glucose_level,0.031987,-0.016169,-0.10852,1.0,0.002093,0.050307
insulin,0.018829,0.113527,0.098721,0.002093,1.0,0.034133
gender_M,-0.033992,-0.212271,-0.101271,0.050307,0.034133,1.0


#### 주요 특성(Important feature)

In [23]:
threshold = 0.5
important_feature = correlation[abs(correlation['glucose_level']) > threshold].index

important_feature.tolist()

['glucose_level']

### 데이터 전처리 실습2
    - 심장질환 데이터셋

#### 환경구성

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

#### UCI ML Repository
    - 심장질환 데이터셋을 링크로 불러오기

In [47]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
                'oldpeak', 'slope', 'ca', 'thal', 'num']

df = pd.read_csv(url, names=column_names, na_values='?')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


#### 결측치 확인

In [26]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [None]:
import missingno as msno

msno.matrix(df)

In [None]:
msno.bar(df)

#### 이상치 처리
    - RobustScaler: 이상치에 덜 민감한 스케일링 방식. 중앙값과 사분위값을 이용한 스케일링 방식
            StandardScaler와 다르게 극단적 값에 영향이 적다.
            데이터에 이상치가 많거나 정규분포를 따르지 않을 때 사용

$$
    X\_scaled = (X - median) / IQR
$$

In [48]:
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
robust_scaler = RobustScaler()
df[numeric_features] = robust_scaler.fit_transform(df[numeric_features])

#### 결측치 처리
    - SimpleImputer
    - KNNImputer: KNN 알고리즘을 활용하여 결측치 주변의 데이터를 사용하여 결측치를 대체
            변수간의 관계가 복잡하거나 SimpleImputer로 부족한 경우

In [49]:
imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

df_imputed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0.538462,1.0,1.0,0.75,-0.125000,1.0,2.0,-0.092308,0.0,0.9375,3.0,0.0,6.0,0.0
1,0.846154,1.0,4.0,1.50,0.703125,0.0,2.0,-1.384615,1.0,0.4375,2.0,3.0,3.0,2.0
2,0.846154,1.0,4.0,-0.50,-0.187500,0.0,2.0,-0.738462,1.0,1.1250,2.0,2.0,7.0,1.0
3,-1.461538,1.0,3.0,0.00,0.140625,0.0,0.0,1.046154,0.0,1.6875,3.0,0.0,3.0,0.0
4,-1.153846,0.0,2.0,0.00,-0.578125,0.0,2.0,0.584615,0.0,0.3750,1.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-0.846154,1.0,1.0,-1.00,0.359375,0.0,0.0,-0.646154,0.0,0.2500,2.0,0.0,7.0,1.0
299,0.923077,1.0,4.0,0.70,-0.750000,1.0,0.0,-0.369231,0.0,1.6250,2.0,2.0,7.0,2.0
300,0.076923,1.0,4.0,0.00,-1.718750,0.0,0.0,-1.169231,1.0,0.2500,2.0,1.0,7.0,3.0
301,0.076923,0.0,2.0,0.00,-0.078125,0.0,2.0,0.646154,0.0,-0.5000,2.0,1.0,3.0,1.0


In [None]:
msno.bar(df_imputed)

#### 범주형 데이터 처리
    - One-hot Encoding
    - Label Encoding

In [50]:
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
label_encoder = LabelEncoder()
for feature in categorical_features:
    df_imputed[feature] = label_encoder.fit_transform(df_imputed[feature].astype(str))

df_imputed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0.538462,1,0,0.75,-0.125000,1,2,-0.092308,0,0.9375,2,0.0,2,0.0
1,0.846154,1,3,1.50,0.703125,0,2,-1.384615,1,0.4375,1,3.0,0,2.0
2,0.846154,1,3,-0.50,-0.187500,0,2,-0.738462,1,1.1250,1,2.0,3,1.0
3,-1.461538,1,2,0.00,0.140625,0,0,1.046154,0,1.6875,2,0.0,0,0.0
4,-1.153846,0,1,0.00,-0.578125,0,2,0.584615,0,0.3750,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-0.846154,1,0,-1.00,0.359375,0,0,-0.646154,0,0.2500,1,0.0,3,1.0
299,0.923077,1,3,0.70,-0.750000,1,0,-0.369231,0,1.6250,1,2.0,3,2.0
300,0.076923,1,3,0.00,-1.718750,0,0,-1.169231,1,0.2500,1,1.0,3,3.0
301,0.076923,0,1,0.00,-0.078125,0,2,0.646154,0,-0.5000,1,1.0,0,1.0


#### 새로운 특성 생성

In [51]:
df_imputed['bmi'] = df_imputed['trestbps'] / ((df_imputed['age'] / 100) ** 2)
df_imputed['heart_rate_pressure_product'] = df_imputed['thalach'] * df_imputed['trestbps']

df_imputed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,bmi,heart_rate_pressure_product
0,0.538462,1,0,0.75,-0.125000,1,2,-0.092308,0,0.9375,2,0.0,2,0.0,25867.346939,-0.069231
1,0.846154,1,3,1.50,0.703125,0,2,-1.384615,1,0.4375,1,3.0,0,2.0,20950.413223,-2.076923
2,0.846154,1,3,-0.50,-0.187500,0,2,-0.738462,1,1.1250,1,2.0,3,1.0,-6983.471074,0.369231
3,-1.461538,1,2,0.00,0.140625,0,0,1.046154,0,1.6875,2,0.0,0,0.0,0.000000,0.000000
4,-1.153846,0,1,0.00,-0.578125,0,2,0.584615,0,0.3750,0,0.0,0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-0.846154,1,0,-1.00,0.359375,0,0,-0.646154,0,0.2500,1,0.0,3,1.0,-13966.942149,0.646154
299,0.923077,1,3,0.70,-0.750000,1,0,-0.369231,0,1.6250,1,2.0,3,2.0,8215.277778,-0.258462
300,0.076923,1,3,0.00,-1.718750,0,0,-1.169231,1,0.2500,1,1.0,3,3.0,0.000000,-0.000000
301,0.076923,0,1,0.00,-0.078125,0,2,0.646154,0,-0.5000,1,1.0,0,1.0,0.000000,0.000000


In [53]:
df_imputed = df_imputed.drop(['bmi', 'heart_rate_pressure_product'], axis=1)
df_imputed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0.538462,1,0,0.75,-0.125000,1,2,-0.092308,0,0.9375,2,0.0,2,0.0
1,0.846154,1,3,1.50,0.703125,0,2,-1.384615,1,0.4375,1,3.0,0,2.0
2,0.846154,1,3,-0.50,-0.187500,0,2,-0.738462,1,1.1250,1,2.0,3,1.0
3,-1.461538,1,2,0.00,0.140625,0,0,1.046154,0,1.6875,2,0.0,0,0.0
4,-1.153846,0,1,0.00,-0.578125,0,2,0.584615,0,0.3750,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-0.846154,1,0,-1.00,0.359375,0,0,-0.646154,0,0.2500,1,0.0,3,1.0
299,0.923077,1,3,0.70,-0.750000,1,0,-0.369231,0,1.6250,1,2.0,3,2.0
300,0.076923,1,3,0.00,-1.718750,0,0,-1.169231,1,0.2500,1,1.0,3,3.0
301,0.076923,0,1,0.00,-0.078125,0,2,0.646154,0,-0.5000,1,1.0,0,1.0


#### 낮은 분산 특성 제거
    - 변화가 없는 특성 제거
    - 복잡성을 낮춤
    - 다중공선성를 낮추는 방안

In [54]:
selector = VarianceThreshold(threshold=0.1)
selected_features = selector.fit_transform(df_imputed)
selected_feature_names = df_imputed.columns[selector.get_support()].tolist()
df_selected = pd.DataFrame(selected_features, columns=selected_feature_names)

df_selected

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0.538462,1.0,0.0,0.75,-0.125000,1.0,2.0,-0.092308,0.0,0.9375,2.0,0.0,2.0,0.0
1,0.846154,1.0,3.0,1.50,0.703125,0.0,2.0,-1.384615,1.0,0.4375,1.0,3.0,0.0,2.0
2,0.846154,1.0,3.0,-0.50,-0.187500,0.0,2.0,-0.738462,1.0,1.1250,1.0,2.0,3.0,1.0
3,-1.461538,1.0,2.0,0.00,0.140625,0.0,0.0,1.046154,0.0,1.6875,2.0,0.0,0.0,0.0
4,-1.153846,0.0,1.0,0.00,-0.578125,0.0,2.0,0.584615,0.0,0.3750,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-0.846154,1.0,0.0,-1.00,0.359375,0.0,0.0,-0.646154,0.0,0.2500,1.0,0.0,3.0,1.0
299,0.923077,1.0,3.0,0.70,-0.750000,1.0,0.0,-0.369231,0.0,1.6250,1.0,2.0,3.0,2.0
300,0.076923,1.0,3.0,0.00,-1.718750,0.0,0.0,-1.169231,1.0,0.2500,1.0,1.0,3.0,3.0
301,0.076923,0.0,1.0,0.00,-0.078125,0.0,2.0,0.646154,0.0,-0.5000,1.0,1.0,0.0,1.0


In [55]:
# PCA
pca = PCA(n_components=0.95)
pca_features = pca.fit_transform(df_selected)

In [56]:
X = df_selected.drop('num', axis=1)
y = df_selected['num']

In [57]:
pca_features.shape[1]

11