In [18]:
# libraries
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

In [19]:
# importing in data and adding column names
data <- read_csv("https://raw.githubusercontent.com/BeesKneezz/dsci_100_2023_group_7/main/data/processed.cleveland.data", col_names = FALSE)
colnames(data) <- c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num")
data

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1
38,1,3,138,175,0,0,173,0,0.0,1,?,3.0,0


In [22]:
# filtering / wrangling data
data_wrangled <- data |>  
    #   chol: serum cholestoral in mg/dl
    #   trestbps: resting blood pressure (in mm Hg on admission to the hospital)
    #   fbs: fasting blood sugar > 120 mg/dl  (1 = true; 0 = false)
    #   thalach: maximum heart rate achieved
    #   num: diagnosis of heart disease (angiographic disease status)
    #    -- Value 0: < 50% diameter narrowing
    #    -- Value 1: > 50% diameter narrowing
    select(chol, trestbps, fbs, thalach, num) |>
    mutate(num = as_factor(num))
levels(data_wrangled$num) <- c(levels(data_wrangled$num), 'Diagnosed', 'Not diagnosed')
data_wrangled$num[data_wrangled$num != '0'] <- 'Diagnosed'
data_wrangled$num[data_wrangled$num == '0'] <- 'Not diagnosed'
data_wrangled

chol,trestbps,fbs,thalach,num
<dbl>,<dbl>,<dbl>,<dbl>,<fct>
233,145,1,150,Not diagnosed
286,160,0,108,Diagnosed
229,120,0,129,Diagnosed
⋮,⋮,⋮,⋮,⋮
131,130,0,115,Diagnosed
236,130,0,174,Diagnosed
175,138,0,173,Not diagnosed


In [23]:
# splitting data sets
data_split <- initial_split(data_wrangled, prop = 0.75, strata = num)
data_train <- training(data_split)
data_test <- testing(data_split)