In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [27]:
# 데이터셋 로드

data = pd.read_csv("machine.data_update.csv", header=None)

In [28]:
# 열 이름 할당
columns = ['Vendor', 'Model', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP']
data.columns = columns

# 기본 통계량 출력
data.describe()

Unnamed: 0,Vendor,Model,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
count,210,210,210,210,210,210,210,210,210,210
unique,31,210,61,26,24,23,16,32,117,105
top,ibm,ModelName,50,2000,8000,0,1,6,32,28
freq,32,1,25,54,43,69,94,30,7,9


In [29]:
# 'Vendor'와 'Model' 열 삭제
data = data.drop(['Vendor', 'Model'], axis=1)

print(data.head())

# 모든 값을 숫자로 변환
for column in data.columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# 결측치 확인
print(data.isnull().sum())

# 결측치가 있는 행 삭제
data = data.dropna()

   MYCT  MMIN   MMAX  CACH  CHMIN  CHMAX  PRP  ERP
0  MYCT  MMIN   MMAX  CACH  CHMIN  CHMAX  PRP  ERP
1   125   256   6000   256     16    128  198  199
2    29  8000  32000    32      8     32  269  253
3    29  8000  32000    32      8     32  220  253
4    29  8000  32000    32      8     32  172  253
MYCT     1
MMIN     1
MMAX     1
CACH     1
CHMIN    1
CHMAX    1
PRP      1
ERP      1
dtype: int64


In [30]:
# 특성과 타깃 변수로 데이터 분리
X = data.drop('PRP', axis=1)
y = data['PRP']

# 데이터셋을 훈련 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
# 데이터셋을 훈련 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# 파이프라인 생성
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

In [36]:
# 하이퍼파라미터 튜닝을 위한 GridSearchCV 설정
param_grid = [
    {
        'regressor': [LinearRegression()]
    },
    {
        'regressor': [Ridge()],
        'regressor__alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]
    }
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)


In [37]:
# 최적 모델
best_model = grid_search.best_estimator_

# 훈련 데이터와 테스트 데이터에 대해 예측 수행
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

In [38]:
# 모델 평가
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# 교차 검증 수행
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')

# 결과 출력
print(f'훈련 MSE: {mse_train}, MAE: {mae_train}, R^2: {r2_train}')
print(f'테스트 MSE: {mse_test}, MAE: {mae_test}, R^2: {r2_test}')
print(f'교차 검증 R^2 점수: {cv_scores}')
print(f'평균 교차 검증 R^2: {cv_scores.mean()}')

훈련 MSE: 1529.020470551296, MAE: 23.232255784639353, R^2: 0.9204648044109383
테스트 MSE: 2370.0963747758187, MAE: 31.406218675535484, R^2: 0.9534424890368547
교차 검증 R^2 점수: [0.85060539 0.75851864 0.80002955 0.89072312 0.9400806 ]
평균 교차 검증 R^2: 0.8479914615575082
