In [3]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# 모델들
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# 1. Load the data and derive the car's age
# Reference year used to turn the model year into an age; kept as a named
# constant so it is easy to find and update when the analysis is rerun.
REFERENCE_YEAR = 2025
df = pd.read_csv("car_data.csv")
df["Car_Age"] = REFERENCE_YEAR - df["Year"]
# NOTE(review): "Year" remains in X next to "Car_Age", which is an exact
# linear function of it — confirm that keeping both columns is intended.
X = df.drop(columns=["Selling_Price"])
y = df["Selling_Price"]

# 2. Split the feature columns by dtype
# Every numeric column is scaled and every remaining column is one-hot encoded.
# Selecting the broad "number" kind and its exact complement guarantees that
# each column lands in one of the two lists. ColumnTransformer drops columns
# that are not listed (remainder="drop" is its default), so matching only
# specific dtypes such as int64/float64/object could silently remove a feature
# whose dtype is anything else (bool, category, string, other int/float widths).
num_features = X.select_dtypes(include="number").columns.tolist()
cat_features = X.select_dtypes(exclude="number").columns.tolist()

# 3. Preprocessing: standardize numeric columns, one-hot encode the rest.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present when it was fitted.
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

# 4. Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Candidate regressors, keyed by the label printed in the report.
#    The three tree-based models are created with random_state=42.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("DecisionTree", DecisionTreeRegressor(random_state=42)),
    ("RandomForest", RandomForestRegressor(random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
    ("KNeighbors", KNeighborsRegressor()),
    ("SVR", SVR()),  # support vector regression
])

# 6. Fit every candidate on the training split and score it on the test split
results = []

for name, model in models.items():
    print(f"\n=== {name} ===")

    # Chain the shared preprocessing step with the current regressor so that
    # scaling/encoding is fitted together with the model on the training data.
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", model)])
    pipe.fit(X_train, y_train)

    # Predict the held-out rows
    y_pred = pipe.predict(X_test)

    # RMSE is the square root of the mean squared error; R² comes from r2_score
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE: {rmse:.4f}", f"R² Score: {r2:.4f}", sep="\n")

    # One record per model; the records are turned into a table after the loop
    record = {"Model": name, "RMSE": rmse, "R2_Score": r2}
    results.append(record)

# 7. Build the comparison table, rank it from highest to lowest R², and print it
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("R2_Score", ascending=False)
print("\n===== 전체 모델 성능 비교 결과 =====", results_df, sep="\n")



=== LinearRegression ===
RMSE: 1.5126
R² Score: 0.9007

=== Ridge ===
RMSE: 1.5787
R² Score: 0.8918

=== DecisionTree ===
RMSE: 1.3550
R² Score: 0.9203

=== RandomForest ===
RMSE: 0.8880
R² Score: 0.9658

=== GradientBoosting ===
RMSE: 0.9323
R² Score: 0.9623

=== KNeighbors ===
RMSE: 1.0755
R² Score: 0.9498

=== SVR ===
RMSE: 2.0890
R² Score: 0.8106

===== 전체 모델 성능 비교 결과 =====
              Model      RMSE  R2_Score
3      RandomForest  0.888048  0.965765
4  GradientBoosting  0.932293  0.962268
5        KNeighbors  1.075487  0.949788
2      DecisionTree  1.354962  0.920301
0  LinearRegression  1.512553  0.900683
1             Ridge  1.578691  0.891808
6               SVR  2.089006  0.810556


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# 1. Load the data and derive the car's age
# Reference year used to turn the model year into an age; kept as a named
# constant so it is easy to find and update when the analysis is rerun.
REFERENCE_YEAR = 2025
df = pd.read_csv("car_data.csv")
df["Car_Age"] = REFERENCE_YEAR - df["Year"]
# NOTE(review): "Year" remains in X next to "Car_Age", which is an exact
# linear function of it — confirm that keeping both columns is intended.
X = df.drop(columns=["Selling_Price"])
y = df["Selling_Price"]

# 2. Split the feature columns into numeric and categorical
# Every numeric column is scaled and every remaining column is one-hot encoded.
# Selecting the broad "number" kind and its exact complement guarantees that
# each column lands in one of the two lists. ColumnTransformer drops columns
# that are not listed (remainder="drop" is its default), so matching only
# specific dtypes such as int64/float64/object could silently remove a feature
# whose dtype is anything else (bool, category, string, other int/float widths).
num_features = X.select_dtypes(include="number").columns.tolist()
cat_features = X.select_dtypes(exclude="number").columns.tolist()

# 3. Preprocessing: standardize numeric columns, one-hot encode the rest.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present when it was fitted.
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

# 4. Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Estimators to tune and the hyperparameter values searched for each.
#    Grid keys use the "regressor__" prefix, i.e. the name of the pipeline
#    step that holds the model, so the search can route each value to it.
models = dict([
    ("RandomForest", RandomForestRegressor(random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
])

# Random forest: 2 x 3 x 2 = 12 combinations
rf_grid = {
    "regressor__n_estimators": [100, 200],
    "regressor__max_depth": [None, 10, 20],
    "regressor__min_samples_split": [2, 5],
}

# Gradient boosting: 2 x 2 x 2 = 8 combinations
gb_grid = {
    "regressor__n_estimators": [100, 200],
    "regressor__learning_rate": [0.1, 0.05],
    "regressor__max_depth": [3, 5],
}

# Same labels as in `models`, so a grid can be looked up by model name
param_grids = {"RandomForest": rf_grid, "GradientBoosting": gb_grid}

# 6. Hyperparameter search with 5-fold cross-validation on the training split
# NOTE(review): X_test / y_test are not used anywhere in the visible part of
# this cell, so the tuned models are only scored by cross-validation here —
# confirm whether a final hold-out evaluation happens elsewhere.
tuning_results = []

for name, estimator in models.items():
    print(f"\n=== 튜닝 중: {name} ===")

    # Preprocessing lives inside the pipeline, so it is refitted on the
    # training part of every fold during the search.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", estimator)
    ])

    # Exhaustive search over the model's grid, scored by R²; n_jobs=-1 runs
    # the fits in parallel on all available cores.
    grid = GridSearchCV(pipe, param_grids[name], cv=5, scoring="r2", n_jobs=-1)
    grid.fit(X_train, y_train)

    # best_estimator_ is the winning pipeline refitted on the full training split
    best_model = grid.best_estimator_
    best_params = grid.best_params_
    print(f"최적 파라미터: {best_params}")

    # Per-fold R² of the winning configuration, read from the search results.
    # The search already scored this configuration with cv=5 and scoring="r2",
    # so calling cross_val_score on best_model with the same settings only
    # repeated those five fits; cv_results_ holds the same fold scores.
    cv_scores = np.array([
        grid.cv_results_[f"split{fold}_test_score"][grid.best_index_]
        for fold in range(grid.n_splits_)
    ])
    cv_mean = np.mean(cv_scores)
    cv_std = np.std(cv_scores)
    print(f"훈련셋 기준 평균 R²: {cv_mean:.4f} / 표준편차: {cv_std:.4f}")

    # One record per model; the records are turned into a table after the loop
    tuning_results.append({
        "Model": name,
        "Best_Params": best_params,
        "CV_R2_Mean": cv_mean,
        "CV_R2_Std": cv_std
    })

# 7. Collect the per-model tuning records into a table and print it under a header
results_df = pd.DataFrame(tuning_results)
print("\n===== 튜닝 및 교차검증 결과 요약 =====", results_df, sep="\n")


=== 튜닝 중: RandomForest ===
최적 파라미터: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
훈련셋 기준 평균 R²: 0.8834 / 표준편차: 0.0628

=== 튜닝 중: GradientBoosting ===
최적 파라미터: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 200}
훈련셋 기준 평균 R²: 0.8910 / 표준편차: 0.0396

===== 튜닝 및 교차검증 결과 요약 =====
              Model                                        Best_Params  \
0      RandomForest  {'regressor__max_depth': 10, 'regressor__min_s...   
1  GradientBoosting  {'regressor__learning_rate': 0.1, 'regressor__...   

   CV_R2_Mean  CV_R2_Std  
0    0.883424   0.062793  
1    0.891023   0.039586  
