In [23]:
import pandas as pd

# 데이터셋 로드
data = pd.read_csv("machine.data_update.csv", header=None)

# 데이터셋의 처음 몇 행을 출력
data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,VendorName,ModelName,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
1,adviser,32/60,125,256,6000,256,16,128,198,199
2,amdahl,470v/7,29,8000,32000,32,8,32,269,253
3,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
4,amdahl,470v/7b,29,8000,32000,32,8,32,172,253


In [24]:
# 열 이름 할당
columns = ['Vendor', 'Model', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP']
data.columns = columns

# 기본 통계량 출력
data.describe()


Unnamed: 0,Vendor,Model,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
count,210,210,210,210,210,210,210,210,210,210
unique,31,210,61,26,24,23,16,32,117,105
top,ibm,ModelName,50,2000,8000,0,1,6,32,28
freq,32,1,25,54,43,69,94,30,7,9


In [25]:
# 'Vendor'와 'Model' 열 삭제
data = data.drop(['Vendor', 'Model'], axis=1)

# 데이터 확인
print(data.head())

   MYCT  MMIN   MMAX  CACH  CHMIN  CHMAX  PRP  ERP
0  MYCT  MMIN   MMAX  CACH  CHMIN  CHMAX  PRP  ERP
1   125   256   6000   256     16    128  198  199
2    29  8000  32000    32      8     32  269  253
3    29  8000  32000    32      8     32  220  253
4    29  8000  32000    32      8     32  172  253


In [26]:
# 데이터 타입 확인
print(data.dtypes)

# 숫자로 변환할 수 없는 값 확인 (예: 'MYCT' 열에 문자열이 포함되어 있는지 확인)
print(data['MYCT'].unique())

MYCT     object
MMIN     object
MMAX     object
CACH     object
CHMIN    object
CHMAX    object
PRP      object
ERP      object
dtype: object
['MYCT' '125' '29' '26' '23' '400' '60' '50' '350' '200' '167' '143' '110'
 '320' '25' '56' '64' '133' '810' '700' '140' '220' '800' '75' '90' '105'
 '175' '300' '180' '330' '57' '480' '203' '115' '1100' '600' '900' '225'
 '185' '17' '1500' '100' '150' '92' '72' '40' '35' '38' '48' '30' '112'
 '84' '250' '160' '240' '52' '70' '59' '116' '124' '98']


In [27]:
# 모든 값을 숫자로 변환
for column in data.columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# 결측치 확인
print(data.isnull().sum())

# 결측치가 있는 행 삭제
data = data.dropna()

# 특성과 타깃 변수로 데이터 분리
X = data.drop('PRP', axis=1)
y = data['PRP']

MYCT     1
MMIN     1
MMAX     1
CACH     1
CHMIN    1
CHMAX    1
PRP      1
ERP      1
dtype: int64


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 데이터셋을 훈련 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 훈련
model = LinearRegression()
model.fit(X_train, y_train)

# 훈련 데이터와 테스트 데이터에 대해 예측 수행
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# 모델 평가
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f'훈련 MSE: {mse_train}, MAE: {mae_train}, R^2: {r2_train}')
print(f'테스트 MSE: {mse_test}, MAE: {mae_test}, R^2: {r2_test}')


훈련 MSE: 1529.0204705512954, MAE: 23.232255784639317, R^2: 0.9204648044109384
테스트 MSE: 2370.096374775835, MAE: 31.4062186755356, R^2: 0.9534424890368544
