In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

I will be using the 'SUPPORT 2' dataset from the UCI machine learning repository. It's description:
'This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994. Each row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. It is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier decisions and planning to reduce the frequency of a mechanical, painful, and prolonged dying process.'

This can be found here: https://archive.ics.uci.edu/dataset/880/support2 

My aim is to create a model to predict length of stay of the patients using several variable factors.

In [None]:
# First step is to load in the data

#supp = pd.read_csv


In [8]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Collecting certifi>=2020.12.5 (from ucimlrepo)
  Downloading certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Downloading certifi-2024.8.30-py3-none-any.whl (167 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.3/167.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: certifi, ucimlrepo
Successfully installed certifi-2024.8.30 ucimlrepo-0.0.7

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
support2 = fetch_ucirepo(id=880) 
  
# data (as pandas dataframes) 
X = support2.data.features 
y = support2.data.targets 
  
# metadata 
print(support2.metadata) 
  
# variable information 
print(support2.variables) 


{'uci_id': 880, 'name': 'SUPPORT2', 'repository_url': 'https://archive.ics.uci.edu/dataset/880/support2', 'data_url': 'https://archive.ics.uci.edu/static/public/880/data.csv', 'abstract': "This dataset comprises 9105 individual critically ill patients across 5 United States medical centers, accessioned throughout 1989-1991 and 1992-1994.\nEach row concerns hospitalized patient records who met the inclusion and exclusion criteria for nine disease categories: acute respiratory failure, chronic obstructive pulmonary disease, congestive heart failure, liver disease, coma, colon cancer, lung cancer, multiple organ system failure with malignancy, and multiple organ system failure with sepsis. The goal is to determine these patients' 2- and 6-month survival rates based on several physiologic, demographics, and disease severity information. \nIt is an important problem because it addresses the growing national concern over patients' loss of control near the end of life. It enables earlier deci

In [13]:
print(X.columns) # checking feature column names

Index(['age', 'sex', 'dzgroup', 'dzclass', 'num.co', 'edu', 'income', 'scoma',
       'charges', 'totcst', 'totmcst', 'avtisst', 'race', 'sps', 'aps',
       'surv2m', 'surv6m', 'hday', 'diabetes', 'dementia', 'ca', 'prg2m',
       'prg6m', 'dnr', 'dnrday', 'meanbp', 'wblc', 'hrt', 'resp', 'temp',
       'pafi', 'alb', 'bili', 'crea', 'sod', 'ph', 'glucose', 'bun', 'urine',
       'adlp', 'adls', 'adlsc'],
      dtype='object')


In [14]:
print(y.columns) # checking target column names

Index(['death', 'hospdead', 'sfdm2'], dtype='object')


In [15]:
print(y) # overview of targets

      death  hospdead                sfdm2
0         0         0                  NaN
1         1         1     <2 mo. follow-up
2         1         0     <2 mo. follow-up
3         1         0  no(M2 and SIP pres)
4         0         0  no(M2 and SIP pres)
...     ...       ...                  ...
9100      0         0                  NaN
9101      0         0                  NaN
9102      0         0                  NaN
9103      1         1     <2 mo. follow-up
9104      1         0  no(M2 and SIP pres)

[9105 rows x 3 columns]


In [16]:
print(X) # overview of features

           age     sex            dzgroup             dzclass  num.co   edu  \
0     62.84998    male        Lung Cancer              Cancer       0  11.0   
1     60.33899  female          Cirrhosis  COPD/CHF/Cirrhosis       2  12.0   
2     52.74698  female          Cirrhosis  COPD/CHF/Cirrhosis       2  12.0   
3     42.38498  female        Lung Cancer              Cancer       2  11.0   
4     79.88495  female  ARF/MOSF w/Sepsis            ARF/MOSF       1   NaN   
...        ...     ...                ...                 ...     ...   ...   
9100  66.07300    male  ARF/MOSF w/Sepsis            ARF/MOSF       1   8.0   
9101  55.15399  female               Coma                Coma       1  11.0   
9102  70.38196    male  ARF/MOSF w/Sepsis            ARF/MOSF       1   NaN   
9103  47.01999    male       MOSF w/Malig            ARF/MOSF       1  13.0   
9104  81.53894  female  ARF/MOSF w/Sepsis            ARF/MOSF       1   8.0   

          income  scoma  charges      totcst  ...  

In [40]:
testing = support2.variables
print(testing[['name','description']]) # decoding the variable names to figure out which to keep and which aren't relevant. For the ones to keep, a mapping dictionary would be made to make column names more understandable.

        name                                        description
0         id                                               None
1        age                       Age of the patients in years
2      death  Death at any time up to National Death Index (...
3        sex  Gender of the patient. Listed values are {male...
4   hospdead                                  Death in hospital
5       slos                 Days from Study Entry to Discharge
6     d.time                                  Days of follow-up
7    dzgroup  The patient's disease sub category amogst ARF/...
8    dzclass  The patient's disease category amongst "ARF/MO...
9     num.co  The number of simultaneous diseases (or comorb...
10       edu                                 Years of education
11    income  Income of the patient. Listed values are {"$11...
12     scoma  SUPPORT day 3 Coma Score based on Glasgow scal...
13   charges                                   Hospital charges
14    totcst         Total ratio of cost