In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load the data. The column names are passed explicitly through `names`,
# and the columns that are not used for modelling are dropped in the same step.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
column_names = ['Vendor name', 'Model name', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP']
unused_columns = ['Vendor name', 'Model name', 'ERP']
data = pd.read_csv(url, names=column_names).drop(columns=unused_columns)

# Visual exploration: pairwise scatter plots of every remaining column.
sns.pairplot(data)
plt.show()

In [None]:
# The target (y) is the PRP column; the features (X) are every other column.
target_column = 'PRP'
y = data[target_column]
X = data.drop(columns=target_column)

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Pipeline-based regression model: standardise the features, expand them
# into polynomial terms, then fit a linear regressor.
# The step names ('scaler', 'poly', 'regressor') are the prefixes used to
# address each step's parameters, e.g. 'poly__degree'.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=2)),
        ("regressor", LinearRegression()),
    ]
)

In [11]:
# Hyper-parameter search: try polynomial degrees 1-3 combined with either a
# plain linear regressor or a ridge regressor as the final pipeline step.
param_grid = dict(
    poly__degree=[1, 2, 3],
    regressor=[LinearRegression(), Ridge()],
)

# 5-fold cross-validation on the training split, scored by R^2.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the best-scoring pipeline found by the search.
best_model = grid_search.best_estimator_

In [12]:
def _regression_metrics(y_true, y_pred):
    """Return the tuple (MSE, MAE, R^2) for the given targets and predictions."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Performance on the training split.
y_train_pred = best_model.predict(X_train)
train_mse, train_mae, train_r2 = _regression_metrics(y_train, y_train_pred)

# Performance on the held-out test split.
y_test_pred = best_model.predict(X_test)
test_mse, test_mae, test_r2 = _regression_metrics(y_test, y_test_pred)

# 5-fold cross-validated R^2 of the selected model on the training split.
cv_scores = cross_val_score(best_model, X_train, y_train, scoring='r2', cv=5)
mean_cv_score = np.mean(cv_scores)

# Report every metric.
print(f"Train MSE: {train_mse}")
print(f"Train MAE: {train_mae}")
print(f"Train R^2: {train_r2}")
print(f"Test MSE: {test_mse}")
print(f"Test MAE: {test_mae}")
print(f"Test R^2: {test_r2}")
print(f"Mean Cross-validated R^2: {mean_cv_score}")

NameError: name 'cross_val_score' is not defined

In [None]:
# Try to improve on the single model with an ensemble: a voting regressor
# that combines a linear model, a ridge model and a random forest.
base_estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
]
# fit() returns the fitted estimator itself, so construction and fitting
# can be chained.
ensemble_model = VotingRegressor(estimators=base_estimators).fit(X_train, y_train)

# Score the ensemble on the held-out test split.
y_test_pred_ensemble = ensemble_model.predict(X_test)
ensemble_test_r2 = r2_score(y_test, y_test_pred_ensemble)

print(f"Ensemble Test R^2: {ensemble_test_r2}")

In [None]:
# 4. Collect the performance metrics in a DataFrame and visualise them.
# Metrics that were not computed for a model are stored as NaN; the
# 'Baseline Model' row is a placeholder with no metrics yet.
results = {
    'Model': ['Baseline Model', 'Pipeline Model', 'Ensemble Model'],
    'Train MSE': [np.nan, train_mse, np.nan],
    'Test MSE': [np.nan, test_mse, np.nan],
    'Train MAE': [np.nan, train_mae, np.nan],
    'Test MAE': [np.nan, test_mae, np.nan],
    'Train R^2': [np.nan, train_r2, np.nan],
    'Test R^2': [np.nan, test_r2, ensemble_test_r2],
    'Mean CV R^2': [np.nan, mean_cv_score, np.nan],
}

# Chained set_index instead of inplace=True, so re-running the cell always
# rebuilds the frame from `results`.
results_df = pd.DataFrame(results).set_index('Model')

# MSE/MAE and R^2 live on very different scales; on one shared axis the R^2
# bars (at most 1) are invisible next to the error metrics. Draw one subplot
# per metric, each with its own y-axis, and skip models with no metrics at all.
plotted_df = results_df.dropna(how='all')
n_plot_cols = 4
n_plot_rows = int(np.ceil(plotted_df.shape[1] / n_plot_cols))
axes = plotted_df.plot(
    kind='bar',
    subplots=True,
    layout=(n_plot_rows, n_plot_cols),
    sharex=False,
    sharey=False,
    legend=False,
    figsize=(16, 8),
)
flat_axes = np.ravel(axes)
for ax in flat_axes:
    ax.set_ylabel('Scores')
fig = flat_axes[0].get_figure()
fig.suptitle('Model Performance Comparison')
fig.tight_layout()
plt.show()

# Show the full results table (including the placeholder row) as the cell's
# rich output.
results_df