In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Build the example DataFrame: two normally distributed features, 100 rows each.
# The global NumPy seed is fixed first so the sampled values are reproducible.
np.random.seed(42)
df = pd.DataFrame(
    dict(
        # Keyword arguments are evaluated in order, so feature_1 is drawn before feature_2.
        feature_1=np.random.normal(loc=10, scale=2, size=100),
        feature_2=np.random.normal(loc=50, scale=10, size=100),
    )
)

In [7]:
# Inject a handful of missing values so the imputation step below has work to do.
# Maps each column to the row labels that are blanked out.
missing_rows = {
    'feature_1': [5, 15, 25],
    'feature_2': [3, 23],
}
for column, rows in missing_rows.items():
    df.loc[rows, column] = np.nan

df.head(10)  # preview only the first 10 rows

Unnamed: 0,feature_1,feature_2
0,10.993428,35.846293
1,9.723471,45.793547
2,11.295377,46.572855
3,13.04606,
4,9.531693,48.387143
5,,54.040509
6,13.158426,68.861859
7,11.534869,51.745778
8,9.061051,52.575504
9,11.08512,49.255541


In [8]:
# Mean imputation: each feature's NaNs are replaced by that feature's own mean
# (Series.mean skips NaN by default). `df` itself is left untouched; fillna
# returns a new frame.
column_means = {name: df[name].mean() for name in ('feature_1', 'feature_2')}
df_filled = df.fillna(value=column_means)

# Sanity check: print the remaining NaN count per column (expected to be 0).
print("남은 결측치 개수:", df_filled.isna().sum(), sep="\n")

남은 결측치 개수:
feature_1    0
feature_2    0
dtype: int64


In [9]:
# Outlier-removal helper based on the interquartile range (IQR).
def remove_outliers_iqr(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1/Q3
    are the 25th/75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the previous
        hard-coded behaviour; larger values keep more rows.

    Returns
    -------
    pd.DataFrame
        The subset of ``df`` inside the fences, with the original index labels
        preserved (the index is not reset).

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.

    Notes
    -----
    Rows where ``column`` is NaN are dropped, because NaN never compares as
    being inside the range. Impute or handle missing values first if those
    rows must be kept.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # Series.between is inclusive on both ends by default, matching `>=` / `<=`.
    return df[values.between(lower, upper)]

# Apply IQR outlier removal to each feature in turn. The second pass runs only
# on the rows that survived the first, and works on a copy of df_filled.
df_no_outliers = (
    df_filled.copy()
    .pipe(remove_outliers_iqr, 'feature_1')
    .pipe(remove_outliers_iqr, 'feature_2')
)

print("제거 후 데이터 크기:", df_no_outliers.shape)

제거 후 데이터 크기: (98, 2)


In [10]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: fit on the outlier-free data and rescale each column
# so that its minimum maps to 0 and its maximum maps to 1.
scaler = MinMaxScaler()
normalized = scaler.fit_transform(df_no_outliers)

# fit_transform returns a bare ndarray, so wrap it back into a DataFrame.
# Column labels and the row index are taken from the input frame instead of
# being hard-coded: this keeps labels correct if the input schema changes and
# keeps each row aligned with df_no_outliers (whose index has gaps after
# outlier removal).
df_normalized = pd.DataFrame(
    normalized,
    columns=df_no_outliers.columns,
    index=df_no_outliers.index,
)

# Summary statistics to verify the rescaling (min should be 0, max should be 1).
df_normalized.describe()

Unnamed: 0,feature_1,feature_2
count,98.0,98.0
mean,0.499653,0.441433
std,0.22486,0.207225
min,0.0,0.0
25%,0.372236,0.260816
50%,0.487855,0.45257
75%,0.636454,0.554733
max,1.0,1.0


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardization: fit on the outlier-free data and rescale each column to
# zero mean and unit variance.
# NOTE: this rebinds `scaler`, replacing the previously fitted scaler object.
scaler = StandardScaler()
standardized = scaler.fit_transform(df_no_outliers)

# fit_transform returns a bare ndarray, so wrap it back into a DataFrame.
# Column labels and the row index are taken from the input frame instead of
# being hard-coded: this keeps labels correct if the input schema changes and
# keeps each row aligned with df_no_outliers (whose index has gaps after
# outlier removal).
df_standardized = pd.DataFrame(
    standardized,
    columns=df_no_outliers.columns,
    index=df_no_outliers.index,
)

# Summary statistics to verify the rescaling (mean should be ~0).
# describe() reports the sample std (ddof=1) while StandardScaler divides by
# the population std (ddof=0), so the displayed std is slightly above 1.
df_standardized.describe()

Unnamed: 0,feature_1,feature_2
count,98.0,98.0
mean,7.168302e-16,2.46968e-16
std,1.005141,1.005141
min,-2.233489,-2.141161
25%,-0.569566,-0.8760809
50%,-0.0527373,0.05401969
75%,0.6115116,0.5495568
max,2.23659,2.709316
