<a href="https://colab.research.google.com/github/Edith0001/Hur-Se-eun_Portfolio/blob/main/%EC%B6%94%EB%A1%A0%ED%86%B5%EA%B3%84_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # statistics library (the machine-learning counterpart is scikit-learn)
import statsmodels.api as sm # statistics library
from statsmodels.formula.api import ols # statistics library
from statsmodels.stats.multicomp import pairwise_tukeyhsd # statistics library
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and numerical warnings raised by the
# libraries imported above.
warnings.filterwarnings('ignore')


In [None]:
# Plot styling.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Pick the first name this installation actually ships so the notebook also
# runs on older Matplotlib; if neither is present, fall through to the new
# name and let plt.style.use raise exactly as it did before.
_whitegrid_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if style_name in plt.style.available
    ),
    'seaborn-v0_8-whitegrid',
)
plt.style.use(_whitegrid_style)
sns.set_palette("pastel")
plt.rcParams['figure.figsize'] = (10, 6)  # default figure size in inches
plt.rcParams['font.size'] = 12

# Synthetic housing-price dataset.
# All draws use NumPy's global RNG, so the seed together with the exact order
# of the np.random calls below fixes the data; do not reorder them.
np.random.seed(42)

n = 500  # number of simulated houses

# --- Predictors ---
neighborhoods = np.random.choice(['Downtown', 'Suburbs', 'Rural'], size=n)
house_sizes = np.random.normal(loc=2000, scale=500, size=n)  # square feet
num_bedrooms = np.random.choice([2, 3, 4, 5], size=n, p=[0.1, 0.4, 0.4, 0.1])
# Bathrooms follow the bedroom count (0.6 per bedroom) plus Gaussian jitter,
# rounded to one decimal place.
# NOTE(review): nothing bounds this from below, so a low bedroom count with a
# large negative jitter can yield a zero or negative bathroom count.
num_bathrooms = (
    0.6 * num_bedrooms + np.random.normal(loc=0, scale=0.5, size=n)
).round(1)
has_garage = np.random.choice([0, 1], size=n, p=[0.3, 0.7])
house_age = np.random.gamma(shape=10, scale=2, size=n).round()  # whole years

# --- Additive price components ---
base_price = 200000
size_effect = house_sizes * 100
bedroom_effect = num_bedrooms * 25000
bathroom_effect = num_bathrooms * 30000
garage_effect = has_garage * 25000
age_effect = house_age * -1000  # older houses are cheaper
# Location premium: Downtown 50000, Suburbs 25000, Rural is the 0 baseline.
neighborhood_effect = np.where(
    neighborhoods == 'Downtown',
    50000.0,
    np.where(neighborhoods == 'Suburbs', 25000.0, 0.0),
)
noise = np.random.normal(loc=0, scale=20000, size=n)  # random noise

# Sum the components (same left-to-right order as the definitions above) and
# round to whole currency units.
house_prices = (
    base_price
    + size_effect
    + bedroom_effect
    + bathroom_effect
    + garage_effect
    + age_effect
    + neighborhood_effect
    + noise
).round()

# --- Assemble the DataFrame ---
# The 0/1 garage flag is recoded to 'No'/'Yes' labels after construction.
housing_df = pd.DataFrame(
    {
        'Neighborhood': neighborhoods,
        'Size_sqft': house_sizes,
        'Bedrooms': num_bedrooms,
        'Bathrooms': num_bathrooms,
        'Has_Garage': has_garage,
        'Age_years': house_age,
        'Price': house_prices,
    }
).assign(Has_Garage=lambda frame: frame['Has_Garage'].map({0: 'No', 1: 'Yes'}))

# Dataset preview: the first rows, the describe() summary statistics, and the
# per-column missing-value counts, each printed under its own heading.
for _heading, _table in (
    ("주택 가격 데이터셋 개요:", housing_df.head()),
    ("\n데이터셋 요약 통계:", housing_df.describe()),
    ("\n결측값:", housing_df.isnull().sum()),
):
    print(_heading)
    print(_table)

주택 가격 데이터셋 개요:
  Neighborhood    Size_sqft  Bedrooms  Bathrooms Has_Garage  Age_years  \
0        Rural  3593.516669         4        2.2        Yes       18.0   
1     Downtown  2042.081761         4        2.5         No       27.0   
2        Rural  2486.316433         5        2.7        Yes       22.0   
3        Rural  2772.237419         3        1.6        Yes       37.0   
4     Downtown  2172.034066         5        3.6        Yes       13.0   

      Price  
0  754489.0  
1  599927.0  
2  674424.0  
3  568847.0  
4  704530.0  

데이터셋 요약 통계:
         Size_sqft    Bedrooms  Bathrooms   Age_years          Price
count   500.000000  500.000000  500.00000  500.000000     500.000000
mean   2017.764859    3.506000    2.12120   20.328000  574215.678000
std     536.105677    0.829059    0.69371    6.410195   72959.255457
min     317.952237    2.000000   -0.00000    6.000000  333586.000000
25%    1649.135828    3.000000    1.70000   16.000000  522404.750000
50%    2028.032777    4.00000