# 07. Model Workflow

In [35]:
library(tidyverse)
library(tidymodels)
ggplot2::theme_set(theme_bw())
tidymodels_prefer()

In [36]:
ames <- ames %>% mutate(Sale_Price = log10(Sale_Price))
set.seed(20221010)
ames_split <- initial_split(ames, prop = 0.8, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

# 
# 
### workflow :  모델링 프로세스의 주요 부분을 객체화 하는 것
- 예)
![proper-workflow.svg](attachment:proper-workflow.svg)

# 
# 
### Workflow 기본

- 모델 생성

In [37]:
lm_model <-
    linear_reg() %>%
    set_engine("lm")

- 모델 추가

In [38]:
lm_wflow <-
    workflow() %>%
    add_model(lm_model)

- 모델 공식 추가

In [39]:
lm_wflow <-
    lm_wflow %>%
    add_formula(Sale_Price ~ Longitude + Latitude)

- 데이터 적합

In [40]:
lm_fit <- fit(lm_wflow, ames_train)
lm_fit

[3mPreprocessor:[23m Formula
[3mModel:[23m linear_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Sale_Price ~ Longitude + Latitude

-- Model ---------------------------------------------------------------------------------------------------------------

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
(Intercept)    Longitude     Latitude  
   -25.3446      -0.1706       0.2400  


- 예측

In [41]:
predict(lm_fit, ames_test %>% slice(1:3))

.pred
<dbl>
0.7227429
0.7194929
0.7189566


- 워크플로 제거 / 업데이트

In [42]:
lm_fit %>% update_formula(Sale_Price ~ Longitude)

[3mPreprocessor:[23m Formula
[3mModel:[23m linear_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Sale_Price ~ Longitude

-- Model ---------------------------------------------------------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 


# 
# 
# 
### 변수 추가
### ```add_variables(outcome, predictors)```
- ```outcome``` : 반응변수
- ```predictors``` : 설명변수
    - ```ends_with()```를 사용하여 지정 가능
    - ```everything()``` : 모든 변수

In [43]:
lm_wflow <-
    lm_wflow %>%
    remove_formula() %>%
    add_variables(outcome = Sale_Price, predictors = everything())

In [44]:
lm_wflow

[3mPreprocessor:[23m Variables
[3mModel:[23m linear_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Outcomes: Sale_Price
Predictors: everything()

-- Model ---------------------------------------------------------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 


In [45]:
# fit(lm_wflow, ames_train) %>% extract_fit_engine() %>% tidy

# 
# 
# 

### Workflow 수식

In [46]:
library(nlme)
data(Orthodont)

In [47]:
library(multilevelmod)

In [48]:
multilevel_spec <- linear_reg() %>% set_engine("lmer")

In [49]:
multilevel_workflow <-
    workflow() %>%
    add_variables(outcome = distance, predictors = c(Sex, age, Subject)) %>%
    add_model(multilevel_spec,
              formula = distance ~ Sex + (age | Subject))

In [50]:
multilevel_fit <- fit(multilevel_workflow, data = Orthodont)
multilevel_fit

[3mPreprocessor:[23m Variables
[3mModel:[23m linear_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Outcomes: distance
Predictors: c(Sex, age, Subject)

-- Model ---------------------------------------------------------------------------------------------------------------
Linear mixed model fit by REML ['lmerMod']
Formula: distance ~ Sex + (age | Subject)
   Data: data
REML criterion at convergence: 471.1635
Random effects:
 Groups   Name        Std.Dev. Corr 
 Subject  (Intercept) 7.3912        
          age         0.6943   -0.97
 Residual             1.3100        
Number of obs: 108, groups:  Subject, 27
Fixed Effects:
(Intercept)    SexFemale  
     24.517       -2.145  

# 
# 
### 생존함수

In [51]:
library(censored)

In [52]:
parametic_spec <- survival_reg()

In [53]:
parametic_workflow <-
    workflow() %>%
    add_variables(outcome = c(fustat, futime), predictors = c(age, rx)) %>%
    add_model(parametic_spec,
              formula = Surv(futime, fustat) ~ age + strata(rx))

In [54]:
parametic_fit <- fit(parametic_workflow, data = ovarian)
parametic_fit

[3mPreprocessor:[23m Variables
[3mModel:[23m survival_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Outcomes: c(fustat, futime)
Predictors: c(age, rx)

-- Model ---------------------------------------------------------------------------------------------------------------
Call:
survival::survreg(formula = Surv(futime, fustat) ~ age + strata(rx), 
    data = data, model = TRUE)

Coefficients:
(Intercept)         age 
 12.8734120  -0.1033569 

Scale:
     rx=1      rx=2 
0.7695509 0.4703602 

Loglik(model)= -89.4   Loglik(intercept only)= -97.1
	Chisq= 15.36 on 1 degrees of freedom, p= 8.88e-05 
n= 26 

# 
# 
# 
### 복수 Workflow 생성

### ```workflow_set(preproc, models)```
- ```preproc``` : 수식 List
- ```models``` : 적합할 모형식

In [55]:
location <- list(
    Longitude = Sale_Price ~ Longitude,
    Latitude = Sale_Price ~ Latitude,
    coords = Sale_Price ~ Longitude + Latitude,
    neighborhood = Sale_Price ~ Neighborhood
)

In [56]:
library(workflowsets)

In [57]:
location_models <- workflow_set(preproc = location, 
                                models = list(lm = lm_model))
location_models

wflow_id,info,option,result
<chr>,<list>,<list>,<list>
Longitude_lm,"Sale_Price ~ Longitude, ~NULL, ~NULL, regression, FALSE, lm, TRUE, FALSE, formula, linear_reg,",,
Latitude_lm,"Sale_Price ~ Latitude, ~NULL, ~NULL, regression, FALSE, lm, TRUE, FALSE, formula, linear_reg,",,
coords_lm,"Sale_Price ~ Longitude + Latitude, ~NULL, ~NULL, regression, FALSE, lm, TRUE, FALSE, formula, linear_reg,",,
neighborhood_lm,"Sale_Price ~ Neighborhood, ~NULL, ~NULL, regression, FALSE, lm, TRUE, FALSE, formula, linear_reg,",,


# 
# 
### ```extract_workflow()``` : workflow_set에서 모델 추출

In [58]:
extract_workflow(location_models, id = "coords_lm")

[3mPreprocessor:[23m Formula
[3mModel:[23m linear_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Sale_Price ~ Longitude + Latitude

-- Model ---------------------------------------------------------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 


In [59]:
location_models <-
    location_models %>%
    mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train)))

In [60]:
location_models$fit[[1]]

[3mPreprocessor:[23m Formula
[3mModel:[23m linear_reg()

-- Preprocessor --------------------------------------------------------------------------------------------------------
Sale_Price ~ Longitude

-- Model ---------------------------------------------------------------------------------------------------------------

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
(Intercept)    Longitude  
   -14.5985      -0.1636  


# 
# 
### 테스트 세트 평가

### ```last_fit()``` : 최종 모델을 훈련 세트에 적합, 테스트 세트로 평가

In [33]:
ames_split

<Training/Testing/Total>
<2342/588/2930>

In [None]:
final_lm_res <- last_fit(lm_wflow, ames_split)
final_lm_res

```
#> # Resampling results
#> # Manual resampling 
#> # A tibble: 1 × 6
#>   splits             id               .metrics         .notes   .predic…¹ .workflow 
#>   <list>             <chr>            <list>           <list>   <list>    <list>    
#> 1 <split [2342/588]> train/test split <tibble [2 × 4]> <tibble> <tibble>  <workflow>
#> # … with abbreviated variable name ¹​.predictions```

In [None]:
fitted_lm_wflow <- extract_workflow(final_lm_res)

# 
# 
### ```collect_metrics()``` : 성능 메트릭에 대한 정보 제공
### ```collect_predictions()``` : 예측 메트릭에 대한 정보 제공

In [None]:
collect_metrics(final_lm_res)
collect_predictions(final_lm_res) %>% slice(1:5)