In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import pandas as pd

In [2]:
# Location of the Excel workbook; change this if the file is stored elsewhere.
file_path = 'sepsis_diab_pt_all.xlsx'

# Open the workbook once so later cells can parse individual sheets from it,
# then list the sheet names that are available.
sheets = pd.ExcelFile(file_path)
sheet_names = sheets.sheet_names
print("Available Sheets:", sheet_names)

Available Sheets: ['sepsis_pt_all_admission details', 'sepsis_lab_events', 'microbiology events', 'prescriptoin', 'poe', 'poe_detail']


In [3]:
# Parse the sheets used by this notebook from the already-open workbook.
# Sheet names are copied verbatim from the workbook listing, including the
# misspelled 'prescriptoin' sheet; "correcting" the spelling would make the
# lookup fail. The 'poe' and 'poe_detail' sheets are not loaded here.
admission_data, lab_events, microbiology_events, prescription_data = (
    sheets.parse(sheet_name)
    for sheet_name in (
        'sepsis_pt_all_admission details',
        'sepsis_lab_events',
        'microbiology events',
        'prescriptoin',
    )
)

In [4]:
# Print the full per-column summary (column name, non-null count, dtype) of the admissions sheet.
admission_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2791 entries, 0 to 2790
Data columns (total 59 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   subject_id            2791 non-null   int64         
 1   hadm_id               2791 non-null   int64         
 2   admittime             2791 non-null   datetime64[ns]
 3   dischtime             2791 non-null   datetime64[ns]
 4   deathtime             158 non-null    datetime64[ns]
 5   admission_type        2791 non-null   object        
 6   admit_provider_id     2791 non-null   object        
 7   admission_location    2791 non-null   object        
 8   discharge_location    2777 non-null   object        
 9   insurance             2791 non-null   object        
 10  language              2791 non-null   object        
 11  marital_status        2736 non-null   object        
 12  race                  2791 non-null   object        
 13  edregtime         

In [5]:
# Build the feature table: one row per (admission, drug) pair, with one-hot
# encoded admission attributes and a standardised `edhours` column.

# Columns kept from each sheet; every one of them is required to be non-null.
admission_columns = ['hadm_id', 'admission_type', 'drg_code', 'dx_1_code', 'edhours']
prescription_columns = ['hadm_id', 'drug']
categorical_columns = ['admission_type', 'drg_code', 'dx_1_code']

# Select relevant columns, de-duplicate, and drop rows with missing values.
# Chained (no `inplace=True`) so re-running the cell is idempotent.
admission_data = (
    admission_data[admission_columns]
    .drop_duplicates()
    .dropna(subset=admission_columns)
)
# `drop=True` keeps the old row labels from leaking into the data as a stray
# 'index' column that would otherwise be carried through the merge.
prescription_data = (
    prescription_data[prescription_columns]
    .drop_duplicates()
    .dropna(subset=prescription_columns)
    .reset_index(drop=True)
)

# Merge admissions and prescriptions (inner join: only admissions that have
# at least one prescription row survive).
# NOTE(review): if an admission has several distinct rows in `admission_data`
# for the same hadm_id, each of its drugs is repeated once per row — confirm
# that this fan-out is intended.
admission_drug_data = pd.merge(prescription_data, admission_data, on='hadm_id', how='inner')

# Encode categorical variables. `columns=` is passed explicitly because
# `get_dummies` by default only encodes object/string/category columns; a code
# column that was read as a number would otherwise pass through as a raw
# numeric value instead of being one-hot encoded.
# NOTE(review): dtype of `drg_code` / `dx_1_code` is not visible here — if they
# were numeric, this increases the number of feature columns; confirm.
categorical_features = pd.get_dummies(
    admission_drug_data[categorical_columns],
    columns=categorical_columns,
    drop_first=True,
)

# Scale numerical features (EDHOURS) to zero mean / unit variance.
scaler = StandardScaler()
numerical_features = pd.DataFrame(
    scaler.fit_transform(admission_drug_data[['edhours']]),
    columns=['scaled_edhours'],
)

# TF-IDF for drug names.
# NOTE: this sparse matrix is computed but is NOT part of `combined_features`
# below; it is kept as a separate variable.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
drug_tfidf_matrix = tfidf_vectorizer.fit_transform(admission_drug_data['drug'])

# Combine the one-hot and scaled columns into a single DataFrame. Both frames
# are aligned positionally: `numerical_features` was just built with a fresh
# 0..n-1 index, so only the categorical frame needs its index reset.
combined_features = pd.concat(
    [categorical_features.reset_index(drop=True), numerical_features],
    axis=1,
)

# Sanity check: concatenation must not add or lose rows.
assert len(combined_features) == len(admission_drug_data)

print("Combined Features Shape:", combined_features.shape)

Combined Features Shape: (50196, 210)


In [8]:
import pandas as pd

from ydata_profiling import ProfileReport

In [7]:
# One-time setup (uncomment and run before importing ydata_profiling):
# %pip install ydata-profiling==4.12.2

Collecting ydata-profiling
  Downloading ydata_profiling-4.12.2-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.8.0,>=0.7.5 (from visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp39-cp39-win_amd64.whl.metadata (5.6 kB)
Collecting seaborn<0.14,>=0.10.1 (from ydata-profiling)
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting statsmodels<1,>=0.13.2 (from ydata-profiling)
  Downloading statsmodels-0.14.4-cp39-cp39-win_amd64.whl.metadata (9.5 kB)
Collecting typeguard<5,>=3 (from ydata-profiling)
 



In [9]:
# Build a profiling report for the combined feature table.
# NOTE: despite its name, `df` is a ProfileReport, not a DataFrame; the name is
# kept because the next cell renders the report through it.
df = ProfileReport(combined_features, title="Sepsis Admission Combined Features")

In [10]:
# Display the profiling report as an iframe in the notebook output.
# `df` here is the ProfileReport built in the previous cell, not a DataFrame.
df.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#END