### 최적 RPM 1202
- 데이터셋 새로 생성:
	* 피쳐:	현재온도(3개) 이전 중량				
	* 타겟:	현재 rpm

- 모델: 다중회귀, 랜덤포레스트, XGB, LGBM

In [1]:
# 데이터 로드 관련 모듈 로딩 -------------------------------
import pandas as pd
import numpy as np

# 시각화 관련 모듈 ----------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# 모델 관련 모듈 로딩 --------------------------------------
from lightgbm import LGBMRegressor

from sklearn.ensemble import RandomForestRegressor

from sklearn import datasets
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터셋 생성
PATH_csv = r'C:\Users\KDP-14\Desktop\VSCode\KDT6\기업 프로젝트\rpm 예측 모델\2nd_data_raw_only.csv'
data = pd.read_csv(PATH_csv)

selected_columns = ['c_temp_pv', 'n_temp_pv', 's_temp_pv', 'k_rpm_pv', 'scale_pv']

new_data = data[selected_columns].copy()
new_data['scale_pv_trg'] = data['scale_pv'].shift(1)

new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38609 entries, 0 to 38608
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   c_temp_pv     38609 non-null  float64
 1   n_temp_pv     38609 non-null  float64
 2   s_temp_pv     38609 non-null  float64
 3   k_rpm_pv      38609 non-null  int64  
 4   scale_pv      38609 non-null  float64
 5   scale_pv_trg  38608 non-null  float64
dtypes: float64(5), int64(1)
memory usage: 1.8 MB


In [3]:
new_data.dropna(inplace=True)
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38608 entries, 1 to 38608
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   c_temp_pv     38608 non-null  float64
 1   n_temp_pv     38608 non-null  float64
 2   s_temp_pv     38608 non-null  float64
 3   k_rpm_pv      38608 non-null  int64  
 4   scale_pv      38608 non-null  float64
 5   scale_pv_trg  38608 non-null  float64
dtypes: float64(5), int64(1)
memory usage: 2.1 MB


In [4]:
new_data.columns

Index(['c_temp_pv', 'n_temp_pv', 's_temp_pv', 'k_rpm_pv', 'scale_pv',
       'scale_pv_trg'],
      dtype='object')

In [5]:
# Train, Val, Test 분리

y_target = new_data['k_rpm_pv']
X_feature = new_data[['c_temp_pv', 'n_temp_pv', 's_temp_pv', 'scale_pv_trg']]

X_train, X_test, y_train, y_test = train_test_split(X_feature, y_target, 
                                                    test_size=0.2,
                                                    random_state=77)


# 크기 확인
print(f"X_train: {X_train.shape} {X_train.ndim}D")
print(f"y_train: {y_train.shape} {y_train.ndim}D")

print("="*30)
print(f"X_test: {X_test.shape} {X_test.ndim}D")
print(f"y_test: {y_test.shape} {y_test.ndim}D")

X_train: (30886, 4) 2D
y_train: (30886,) 1D
X_test: (7722, 4) 2D
y_test: (7722,) 1D


In [6]:
# 랜덤포레스트
# Randomforest 인스턴스 생성
model = RandomForestRegressor(random_state=77)

# 모델 학습
model.fit(X_train,y_train)

In [7]:
# 모델 예측
y_pred = model.predict(X_train)

# 성능
# mae = mean_absolute_error(y_pred=y_pred, y_true=y_train)
# mape = mean_absolute_percentage_error(y_pred=y_pred, y_true=y_train)
r2 = r2_score(y_pred=y_pred, y_true=y_train)

# print(f"mae = {mae}")
# print(f"mape = {mape*100}%")
print(f"r2 ={r2}")

r2 =0.8334165569355528


In [8]:
# LGBM + HyperOpt Ver.
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [9]:
# 목적 함수 정의
def objective(params):
    # LGBM 모델 생성
    lgbm = LGBMRegressor(
		
        n_estimators=int(params['n_estimators']),
        learning_rate=params['learning_rate'],
        max_depth=int(params['max_depth']),
        num_leaves=int(params['num_leaves']),
        min_child_samples=int(params['min_child_samples']),
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=77,
        force_col_wise=True
    )
    
    # 모델 학습 및 평가
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_test)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    
    return {'loss': r2, 'status': STATUS_OK}

In [10]:
param_space = { 
	
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),  
    'max_depth': hp.quniform('max_depth', 3, 25, 1),  
    'num_leaves': hp.quniform('num_leaves', 20, 127, 1),  
    'min_child_samples': hp.quniform('min_child_samples', 10, 30, 1),  
    'subsample': hp.uniform('subsample', 0.3, 1.0),  
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)  
}

In [11]:
# Trials 객체 생성 (탐색 결과 저장)
trials = Trials()

# 최적화 수행
best_params = fmin(fn=objective, space=param_space, algo=tpe.suggest, 
                   max_evals=30, trials=trials, rstate=np.random.default_rng(77))

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

[LightGBM] [Info] Total Bins 203                      
[LightGBM] [Info] Number of data points in the train set: 30886, number of used features: 4
[LightGBM] [Info] Start training from score 181.099592
[LightGBM] [Info] Total Bins 203                                                 
[LightGBM] [Info] Number of data points in the train set: 30886, number of used features: 4
[LightGBM] [Info] Start training from score 181.099592                           
[LightGBM] [Info] Total Bins 203                                                 
[LightGBM] [Info] Number of data points in the train set: 30886, number of used features: 4
[LightGBM] [Info] Start training from score 181.099592                          
[LightGBM] [Info] Total Bins 203                                                
[LightGBM] [Info] Number of data points in the train set: 30886, number of used features: 4
[LightGBM] [Info] Start training from score 181.099592                          
[LightGBM] [Info] Total Bins 203 

In [12]:
# 최적 하이퍼 파라미터로 LGBM 하이퍼 파라미터 설정
lgbm_reg = LGBMRegressor(random_state=77, 
                         learning_rate=best_params['learning_rate'],
                         max_depth=int(best_params['max_depth']),
                         min_child_samples=int(best_params['min_child_samples']),
                         n_estimators=int(best_params['n_estimators']),
                         num_leaves=int(best_params['num_leaves']),
                         subsample=best_params['subsample'],
                         colsample_bytree=best_params['colsample_bytree'],
                         )

In [13]:
# 최적 하이퍼 파라미터 학습
# lgbm_reg = LGBMRegressor()
lgbm_reg.fit(X_train, y_train)

# train 데이터 점수
# 테스트 
y_pred = lgbm_reg.predict(X_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 30886, number of used features: 4
[LightGBM] [Info] Start training from score 181.099592


In [14]:
print()
print("")
print("Best Parameter Setting")
print(f"MAE: {mean_absolute_error(y_pred=y_pred, y_true=y_train)}")
print(f"MAPE: {mean_absolute_percentage_error(y_pred=y_pred, y_true=y_train)*100}")
print(f"R2: {r2_score(y_pred=y_pred, y_true=y_train)}")
print()





Best Parameter Setting
MAE: 5.315946414439684
MAPE: 2.983484001197563
R2: 0.1786541079935925



#### 중량 예측 w/ RF + 최적 RPM w/ RF

In [15]:
import sklearn
sklearn.__version__

'1.3.2'

In [23]:
tmp_list

c_temp_pv    70.2
k_rpm_pv      173
n_temp_pv    67.6
s_temp_pv    66.4
Name: 8888, dtype: object

In [22]:
# 예측 -------------------------------------------------------------------------

# 모델 로드
import joblib
PATH_scale_model = r'C:\Users\KDP-14\Desktop\VSCode\LocalData\기업 프로젝트\model\scale_RF_1202_r2_0.806.pkl'
scale_model = joblib.load(PATH_scale_model)

scale_cols = ['c_temp_pv', 'k_rpm_pv', 'n_temp_pv', 's_temp_pv']

row_num = 8888

tmp_list = data.loc[row_num, ['c_temp_pv', 'k_rpm_pv', 'n_temp_pv', 's_temp_pv']]
tmp_scale = data.loc[row_num,'scale_pv']

print(tmp_list)
# print(tmp_scale)

tmp_test = pd.DataFrame(data = [tmp_list], columns=scale_cols)
# print(tmp_test)

y_pred = scale_model.predict(tmp_test)
print("="*50)
print("실제 중량(g):", tmp_scale)
print("예측 중량(g): ",y_pred[0])
print("현재 rpm: ", tmp_list[1])

c_temp_pv    70.2
k_rpm_pv      173
n_temp_pv    67.6
s_temp_pv    66.4
Name: 8888, dtype: object
실제 중량(g): 3.08
예측 중량(g):  3.076778278521001
현재 rpm:  173


In [25]:
from math import ceil, floor

# ['c_temp_pv', 'n_temp_pv', 's_temp_pv', 'scale_pv_trg']

tmp2_list = [tmp_list[0],tmp_list[2], tmp_list[3], y_pred[0]]
tmp2_test = pd.DataFrame(data=[tmp2_list], columns=X_test.columns.to_list())

rpm_pred = model.predict(tmp2_test)
print(rpm_pred)
print("최적 RPM:",floor(rpm_pred[0]))

[175.5025]
최적 RPM: 175


In [27]:
tmp3_list = [tmp_list[0],floor(rpm_pred[0]),tmp_list[2], tmp_list[3]]
tmp3_test = pd.DataFrame(data = [tmp3_list], columns=scale_cols)

y_pred2 = scale_model.predict(tmp3_test)

print("예측 중량(g): ",y_pred[0])
print("현재 rpm: ", tmp_list[1])
print("최적 RPM:",floor(rpm_pred[0]))
print("최적 RPM 적용 예측 중량(g): ",y_pred2[0])

예측 중량(g):  3.076778278521001
현재 rpm:  173
최적 RPM: 175
최적 RPM 적용 예측 중량(g):  3.1093741945391673
