In [1]:
from ucimlrepo import fetch_ucirepo 
import polars as pl

| Variable     | Descripción                                                                                           | Tipo                         | Valores                                                                                  |
|--------------|--------------------------------------------------------------------------------------------------------|------------------------------|--------------------------------------------------------------------------------------------------|
| age          | Edad del paciente en años al momento del estudio.                                                      | Numérica (entera)            | 29 – 77                                                                                  |
| sex          | Sexo del paciente.                                                                                     | Binaria                      | `male (1)`, `female (0)`                                                                 |
| cp           | Tipo de dolor torácico experimentado.                                                                 | Categórica nominal           | `typical angina (1)`, `atypical angina (2)`, `non-anginal pain (3)`, `asymptomatic (4)`    |
| trestbps     | Presión arterial en reposo (mm Hg) medida al ingreso hospitalario.                                     | Numérica (entera)            | 94 – 200                                                                                  |
| chol         | Colesterol sérico total (mg/dl).                                                                      | Numérica (entera)            | 126 – 564                                                                                 |
| fbs          | Azúcar en sangre en ayunas mayor a 120 mg/dl.                                                        | Binaria                      | `true (1)`, `false (0)`                                                                   |
| restecg      | Resultados del electrocardiograma (ECG) en reposo.                                                    | Categórica nominal           | `normal (0)`, `ST-T wave abnormality (1)`, `left ventricular hypertrophy (2)`              |
| thalach      | Frecuencia cardíaca máxima alcanzada.                                                                 | Numérica (entera)            | 71 – 202                                                                                  |
| exang        | Angina inducida por ejercicio.                                                                        | Binaria                      | `yes (1)`, `no (0)`                                                                       |
| oldpeak      | Depresión del segmento ST inducida por ejercicio relativa al reposo.                                   | Numérica (continua)          | 0 – 6.2                                                                                    |
| slope        | Pendiente del segmento ST al ejercicio máximo.                                                        | Categórica ordinal           | `upsloping (1)`, `flat (2)`, `downsloping (3)`                                             |
| ca           | Número de vasos principales (0–3) coloreados por fluoroscopia.                                        | Numérica (entera)            | 0 – 3                                                                                      |
| thal         | Resultado del test de talio.                                                                          | Categórica nominal           | `normal (3)`, `fixed defect (6)`, `reversible defect (7)`                                   |
| num (target) | Diagnóstico de enfermedad cardíaca.                                                                  | Binaria (redefinida)         | `ausencia (0)`, `presencia (1)` (originalmente 0 = ausencia, 1–4 = presencia leve a grave) |


In [2]:
# Download dataset 45 from the UCI ML repository.
heart_disease = fetch_ucirepo(id=45)

# The repository object exposes features and targets as pandas DataFrames;
# convert both to polars, which the rest of the notebook works with.
X, y = (
    pl.from_pandas(frame)
    for frame in (heart_disease.data.features, heart_disease.data.targets)
)

In [3]:
# Preview the feature frame (the rendered output shows only the first and last rows).
X

age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,f64,f64
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
…,…,…,…,…,…,…,…,…,…,…,…,…
45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0
57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [4]:
# Preview the target frame (single column `num`).
y

num
i64
0
2
1
0
0
…
1
2
3
1


In [5]:
# Summary statistics per feature; the null_count row shows missing values in `ca` and `thal`.
X.describe()

statistic,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0
"""mean""",54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219
"""std""",9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706
"""min""",29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
"""25%""",48.0,0.0,3.0,120.0,211.0,0.0,0.0,134.0,0.0,0.0,1.0,0.0,3.0
"""50%""",56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0
"""75%""",61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
"""max""",77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [6]:
# Summary statistics of the target; max is 4, so `num` is still on its original 0-4 scale at this point.
y.describe()

statistic,num
str,f64
"""count""",303.0
"""null_count""",0.0
"""mean""",0.937294
"""std""",1.228536
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",2.0
"""max""",4.0


In [7]:
## Drop missing values
# Remove incomplete rows from X *and* the matching rows from y. Dropping them
# from X alone leaves y with more rows than X, so every label after the first
# removed row would no longer line up with its features.
complete_rows = X.select(pl.all_horizontal(pl.all().is_not_null())).to_series()
X = X.filter(complete_rows)
y = y.filter(complete_rows)

# Guard the invariant the rest of the notebook relies on.
assert X.height == y.height, "X and y must stay row-aligned"

In [8]:
print('Unique values:\n')

for col in X.columns:
    # Series.unique() gives no ordering guarantee by default, so sort to keep
    # the printed listing stable between runs.
    distinct = X[col].unique().sort()
    try:
        # Print as a NumPy array, the form shown in the saved output.
        unique_values = distinct.to_numpy()
    except Exception:
        # Best-effort fallback: show the polars Series itself if the NumPy
        # conversion fails. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        unique_values = distinct

    print(f'- {col}: {unique_values}\n')

Unique values:

- age: [29 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
 58 59 60 61 62 63 64 65 66 67 68 69 70 71 74 76 77]

- sex: [0 1]

- cp: [1 2 3 4]

- trestbps: [ 94 100 101 102 104 105 106 108 110 112 114 115 117 118 120 122 123 124
 125 126 128 129 130 132 134 135 136 138 140 142 144 145 146 148 150 152
 154 155 156 158 160 164 165 170 172 174 178 180 192 200]

- chol: [126 131 141 149 157 160 164 166 167 168 169 172 174 175 176 177 178 180
 182 183 184 185 186 187 188 192 193 195 196 197 198 199 200 201 203 204
 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 239 240 241
 242 243 244 245 246 247 248 249 250 252 253 254 255 256 257 258 259 260
 261 262 263 264 265 266 267 268 269 270 271 273 274 275 276 277 278 281
 282 283 284 286 288 289 290 293 294 295 298 299 300 302 303 304 305 306
 307 308 309 311 313 315 318 319 321 322 325 326 327 330 335 340 341 342
 353 354 3

# Encoding the target (`num`: 0 = absence, 1–4 → 1 = presence)

In [None]:
# Collapse the values 2, 3 and 4 of `num` into 1; 0 and 1 are left as they are,
# so the column ends up binary.
y = y.with_columns(num=pl.col('num').replace(dict.fromkeys((2, 3, 4), 1)))

# Sort variables according to data type


In [9]:
# Number of distinct values per predictor, keyed by column name.
len_unique_values = {col: len(X[col].unique()) for col in X.columns}

In [23]:
# Partition the predictors by cardinality: at least 4 distinct values counts as
# quantitative; of the remaining (categorical) ones, exactly 2 distinct values
# is binary and everything else is multiclass.
# NOTE(review): `cp` has exactly 4 distinct values, so this threshold places it
# among the quantitative predictors although the variable table describes it
# as nominal - confirm this is intended.
quant_predictors = [name for name, n_distinct in len_unique_values.items() if n_distinct >= 4]
cat_predictors = [name for name in X.columns if name not in quant_predictors]
binary_predictors = [name for name in cat_predictors if len_unique_values[name] == 2]
multiclass_predictors = [name for name in cat_predictors if name not in binary_predictors]

In [20]:
# Reorder the columns: quantitative first, then binary, then multiclass.
X = X[[*quant_predictors, *binary_predictors, *multiclass_predictors]]

age,cp,trestbps,chol,thalach,oldpeak,ca,sex,fbs,exang,restecg,slope,thal
i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,f64
63,1,145,233,150,2.3,0.0,1,1,0,2,3,6.0
67,4,160,286,108,1.5,3.0,1,0,1,2,2,3.0
67,4,120,229,129,2.6,2.0,1,0,1,2,2,7.0
37,3,130,250,187,3.5,0.0,1,0,0,0,3,3.0
41,2,130,204,172,1.4,0.0,0,0,0,2,1,3.0
…,…,…,…,…,…,…,…,…,…,…,…,…
57,4,140,241,123,0.2,0.0,0,0,1,0,2,7.0
45,1,110,264,132,1.2,0.0,1,0,0,0,2,7.0
68,4,144,193,141,3.4,2.0,1,1,0,0,2,7.0
57,4,130,131,115,1.2,1.0,1,0,1,0,2,7.0


In [15]:
# Show which predictors ended up in the categorical group.
cat_predictors

['sex', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Compute p1, p2, p3 (number of quantitative, binary and multiclass predictors)


In [24]:
# p1, p2, p3: sizes of the quantitative, binary and multiclass predictor groups.
p1, p2, p3 = (
    len(group)
    for group in (quant_predictors, binary_predictors, multiclass_predictors)
)