In [7]:
# Cell 1: importy i wczytanie pliku
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display

# Path to the file with the preprocessed data (adjust if needed)
processed_path = Path("../data/processed/train_preprocessed.csv")
df = pd.read_csv(processed_path)

# Sanity report: source file, dimensions, deep memory footprint and NaN count
bytes_per_mb = 1024**2
report = [
    ("Loaded:", processed_path),
    ("Shape:", df.shape),
    ("Memory MB:", round(df.memory_usage(deep=True).sum() / bytes_per_mb, 2)),
    ("Total missing values:", int(df.isna().sum().sum())),
]
for label, value in report:
    print(label, value)
print()



Loaded: ../data/processed/train_preprocessed.csv
Shape: (79541, 72)
Memory MB: 33.63
Total missing values: 0



In [9]:
# Cell 2: funkcja minimalnego podsumowania
def minimal_summary(df, rare_thresh=0.001, id_cols=None, show_top_cats=10, top_n=5):
    """Build a compact diagnostic summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.
    rare_thresh : float, default 0.001
        Bool-like columns whose share of ones is strictly below this value
        are reported in ``rare_bool_cols``.
    id_cols : list or None
        Column names dropped before every per-column statistic (sparsity,
        numeric stats, bool-like detection, ...). Names absent from ``df``
        are ignored. ``shape``, ``memory_mb`` and ``total_missing`` are still
        computed on the full frame.
    show_top_cats : int, default 10
        Number of leading non-numeric columns for which top values are listed.
    top_n : int, default 5
        How many of the most frequent values are kept per listed column.

    Returns
    -------
    dict
        Keys: ``shape``, ``memory_mb``, ``total_missing``,
        ``overall_sparsity`` (fraction of cells equal to 0, NaN when the
        frame has no cells), ``num_stats`` (transposed ``describe()`` of the
        numeric columns, empty when there are none), ``cat_cols``,
        ``cat_top``, ``bool_like``, ``prevalence`` (share of ones per
        bool-like column, ascending), ``rare_bool_cols``, ``constant_cols``
        and ``rows_with_missing_sample``.
    """
    if id_cols is None:
        id_cols = []

    out = {
        'shape': df.shape,
        'memory_mb': df.memory_usage(deep=True).sum() / 1024**2,
        'total_missing': int(df.isna().sum().sum()),
    }

    # Identifier columns are excluded from all per-column statistics below.
    X = df.drop(columns=[c for c in id_cols if c in df.columns], errors='ignore')

    # Sparsity is undefined for a frame without cells; report NaN rather than
    # dividing by zero.
    n_cells = X.shape[0] * X.shape[1]
    if n_cells:
        out['overall_sparsity'] = float((X == 0).sum().sum() / n_cells)
    else:
        out['overall_sparsity'] = float('nan')

    # Numeric columns. describe() raises ValueError on a frame with no
    # columns, so fall back to an empty table with the usual statistics.
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    if num_cols:
        out['num_stats'] = X[num_cols].describe().T
    else:
        out['num_stats'] = pd.DataFrame(
            columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'],
            dtype=float,
        )

    # Everything that is not numeric is treated as categorical.
    numeric = set(num_cols)
    cat_cols = [c for c in X.columns if c not in numeric]
    out['cat_cols'] = cat_cols
    out['cat_top'] = {
        c: X[c].value_counts(dropna=True).head(top_n).to_dict()
        for c in cat_cols[:show_top_cats]
    }

    # Bool-like columns: every non-null value is in {0, 1} or in {'0', '1'}.
    # True/False are covered by the first set because True == 1 and
    # False == 0 (with equal hashes). The number of "ones" is counted here
    # directly, so no dtype conversion of the column is needed afterwards.
    bool_like = []
    ones_count = {}
    for c in X.columns:
        ser = X[c].dropna()
        if ser.shape[0] == 0:
            continue
        uniques = set(ser.unique())
        if uniques.issubset({0, 1}):
            positive = 1
        elif uniques.issubset({'0', '1'}):
            positive = '1'
        else:
            continue
        bool_like.append(c)
        ones_count[c] = int((ser == positive).sum())
    out['bool_like'] = bool_like

    # Prevalence = share of ones over ALL rows (missing values count as zero).
    if bool_like:
        prevalence = (pd.Series(ones_count, dtype=float) / float(len(X))).sort_values()
        out['prevalence'] = prevalence
        out['rare_bool_cols'] = prevalence[prevalence < rare_thresh].index.tolist()
    else:
        out['prevalence'] = pd.Series(dtype=float)
        out['rare_bool_cols'] = []

    # Columns with at most one distinct non-null value.
    out['constant_cols'] = [c for c in X.columns if X[c].nunique(dropna=True) <= 1]

    # Up to five example rows containing at least one missing value.
    if out['total_missing'] > 0:
        out['rows_with_missing_sample'] = df[df.isna().any(axis=1)].head(5)
    else:
        out['rows_with_missing_sample'] = pd.DataFrame()

    return out

# Run the summary (adjust id_cols if your identifier columns are named differently)
id_cols = ['patient_nbr', 'encounter_id']
summary = minimal_summary(
    df,
    id_cols=id_cols,
    show_top_cats=15,
    rare_thresh=0.001,
)


In [10]:
# Cell 3: print the results in a readable form
headline = (
    ("Shape:", summary['shape']),
    ("Memory MB:", round(summary['memory_mb'], 2)),
    ("Total missing values:", summary['total_missing']),
    ("Overall sparsity (fraction of zeros):", round(summary['overall_sparsity'], 4)),
)
for label, value in headline:
    print(label, value)
print()

# Numeric columns: how many there are, plus the first 20 rows of their statistics
stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
num_stats = summary['num_stats']
print("Number of numeric columns:", len(num_stats))
display(num_stats[stat_names].head(20))

# Non-numeric columns and their most frequent values
print("\nNumber of categorical columns:", len(summary['cat_cols']))
print("\nTop values for first categorical columns:")
for col, top in summary['cat_top'].items():
    print(f"{col}: {top}")

# Bool-like columns, rarest first (prevalence is already sorted ascending)
print("\nDetected bool-like columns count:", len(summary['bool_like']))
print("\n20 najrzadszych bool-like kolumn (prevalence):")
display(summary['prevalence'].head(20))

print("\nConstant columns (n_unique <= 1):", summary['constant_cols'])
print("\nVery rare boolean/one-hot columns (prevalence < 0.1%):", summary['rare_bool_cols'][:100])

# NOTE(review): this counts every column whose name starts with 'diag_',
# whatever its dtype.
diag_total = sum(1 for c in df.columns if c.startswith('diag_'))
print("\nNumber of diag dummy columns (prefix diag_):", diag_total)

# Only shown when the summary captured rows with missing values
missing_sample = summary['rows_with_missing_sample']
if not missing_sample.empty:
    print("\nSample rows with missing values:")
    display(missing_sample)


Shape: (79541, 72)
Memory MB: 33.63
Total missing values: 0
Overall sparsity (fraction of zeros): 0.4798

Number of numeric columns: 16


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
admission_type_id,79541.0,2.03216,1.446836,1.0,1.0,1.0,3.0,8.0
discharge_disposition_id,79541.0,3.518336,5.186156,1.0,1.0,1.0,3.0,28.0
admission_source_id,79541.0,5.725374,4.060024,1.0,1.0,7.0,7.0,25.0
time_in_hospital,79541.0,0.094747,0.741449,-0.75,-0.5,0.0,0.5,2.5
num_lab_procedures,79541.0,-0.041384,0.752914,-1.653846,-0.5,0.0,0.5,3.384615
num_procedures,79541.0,0.165286,0.850113,-0.5,-0.5,0.0,0.5,2.5
num_medications,79541.0,0.098366,0.811352,-1.4,-0.5,0.0,0.5,6.6
number_outpatient,79541.0,0.369533,1.244375,0.0,0.0,0.0,0.0,40.0
number_emergency,79541.0,0.198677,0.946064,0.0,0.0,0.0,0.0,76.0
number_inpatient,79541.0,0.631473,1.248577,0.0,0.0,0.0,1.0,21.0



Number of categorical columns: 55

Top values for first categorical columns:
diag_1: {'circulatory': 23739, 'other': 20938, 'respiratory': 11225, 'digestive': 7487, 'injury': 5475}
diag_2: {'other': 26456, 'circulatory': 24860, 'respiratory': 8386, 'urogenital': 6493, 'diabetes': 4900}
diag_3: {'other': 28461, 'circulatory': 23741, 'diabetes': 9170, 'respiratory': 5632, 'urogenital': 5128}
readmitted: {'NO': 42031, '>30': 28412, '<30': 9098}
race_Asian: {False: 79035, True: 506}
race_Caucasian: {True: 59334, False: 20207}
race_Hispanic: {False: 77895, True: 1646}
race_Other: {False: 76551, True: 2990}
gender_Male: {False: 42812, True: 36729}
metformin_Down: {False: 79088, True: 453}
metformin_No: {True: 63555, False: 15986}
metformin_Steady: {False: 64849, True: 14692}
metformin_Up: {False: 78700, True: 841}
repaglinide_No: {True: 78301, False: 1240}
repaglinide_Steady: {False: 78431, True: 1110}

Detected bool-like columns count: 54

20 najrzadszych bool-like kolumn (prevalence):


repaglinide_Up                0.001207
pioglitazone_Down             0.001245
rosiglitazone_Up              0.001710
glimepiride_Down              0.002049
pioglitazone_Up               0.002301
glimepiride_Up                0.003093
acarbose_Steady               0.003105
glipizide_Down                0.005205
glyburide_Down                0.005645
metformin_Down                0.005695
race_Asian                    0.006361
nateglinide_Steady            0.006512
glyburide-metformin_Steady    0.006676
glipizide_Up                  0.007895
glyburide_Up                  0.008147
metformin_Up                  0.010573
repaglinide_Steady            0.013955
race_Hispanic                 0.020694
race_Other                    0.037591
glimepiride_Steady            0.046894
dtype: float64


Constant columns (n_unique <= 1): ['glimepiride-pioglitazone_No']

Very rare boolean/one-hot columns (prevalence < 0.1%): []

Number of diag dummy columns (prefix diag_): 3


In [11]:
# Cell 4: optionally save the list of rare columns to a file for later use
import json

rare_list = summary['rare_bool_cols']
out_path = Path("./artifacts/dropped_onehots_from_notebook.json")

# Make sure the target directory exists, then write the JSON document in one go
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps({"dropped": rare_list}, indent=2))
print("Saved rare list to", out_path, "count:", len(rare_list))


Saved rare list to artifacts/dropped_onehots_from_notebook.json count: 0


ModuleNotFoundError: No module named 'statsmodels'

In [13]:
# VIF bez statsmodels — notebook cell
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# df is already loaded
exclude = {'target', 'readmitted', 'patient_nbr', 'encounter_id'}
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude]

if num_cols:
    # Build the numeric matrix (any NaN is filled with 0)
    X_num = df[num_cols].fillna(0).astype(float)

    # Scaling stabilises the R^2 computation
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_num)

    n_features = X_scaled.shape[1]
    lr = LinearRegression()
    vif_list = []

    # Regress each feature on all remaining ones: VIF = 1 / (1 - R^2)
    for i, feature in enumerate(num_cols):
        y = X_scaled[:, i]
        X_others = np.delete(X_scaled, i, axis=1)
        r2 = lr.fit(X_others, y).score(X_others, y)
        # Guard against r2 == 1.0 (exact collinearity)
        vif = np.inf if r2 >= 0.9999999 else 1.0 / (1.0 - r2)
        vif_list.append((feature, vif, r2))

    vif_df = pd.DataFrame(vif_list, columns=['feature', 'VIF', 'R2']).sort_values('VIF', ascending=False)
    display(vif_df)
    print("\nHigh VIF (VIF > 10):")
    display(vif_df.loc[vif_df['VIF'] > 10])
else:
    # Nothing to compute when there are no numeric columns
    print("No numeric columns to compute VIF.")


Unnamed: 0,feature,VIF,R2
7,number_outpatient,inf,1.0
8,number_emergency,inf,1.0
9,number_inpatient,inf,1.0
13,total_utilization,inf,1.0
6,num_medications,1.661438,0.398112
3,time_in_hospital,1.401204,0.286328
11,change,1.394988,0.283148
12,diabetesMed,1.354754,0.261858
5,num_procedures,1.246622,0.197832
10,number_diagnoses,1.206759,0.171334



High VIF (VIF > 10):


Unnamed: 0,feature,VIF,R2
7,number_outpatient,inf,1.0
8,number_emergency,inf,1.0
9,number_inpatient,inf,1.0
13,total_utilization,inf,1.0
