In [2]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import re
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')

# 1. Load and select the first ICU admission record for each patient
chunksize = 50000
sample_fraction = 1
selected_columns = [
    'subject_id_x', 'age_years', 'admission_type', 'admission_location', 
    'discharge_location', 'insurance', 'marital_status', 'ethnicity', 'diagnosis', 
    'positiveculture', 'ab_name', 'antibioticresistance', 
    'gcs', 'gcseyes', 'gcsmotor', 'gcsverbal', 'endotrachflag', 'neutrophil', 
    'creactiveprotein', 'whitebloodcell', 'partialpressureo2', 'bicarbonate', 
    'lactate', 'troponin', 'bloodureanitrogen', 'creatinine', 'alaninetransaminase', 
    'aspartatetransaminase', 'hemoglobin', 'intnormalisedratio', 'platelets', 
    'albumin', 'chloride', 'glucose', 'sodium', 'bilirubin', 'hematocrit', 
    'first_careunit', 'last_careunit', 'los_x', 'intime_x', 'outtime_x', 
    'expire_flag'
]

# Load and filter the first ICU admission record for each patient
filtered_chunks = []
for chunk in pd.read_csv('/root/DATA/filtered_merged_data.csv', usecols=selected_columns, chunksize=chunksize, low_memory=False):
    chunk_sample = chunk.sample(frac=sample_fraction, random_state=42)
    chunk_sorted = chunk_sample.sort_values('intime_x').drop_duplicates(subset='subject_id_x', keep='first')
    filtered_chunks.append(chunk_sorted)

filtered_data = pd.concat(filtered_chunks, ignore_index=True)

# Add a new column for is_weekend_admission based on intime_x
filtered_data['intime_x'] = pd.to_datetime(filtered_data['intime_x'], errors='coerce')
filtered_data['is_weekend_admission'] = filtered_data['intime_x'].dt.weekday >= 5  # 周六和周日为 True

# Remove datetime columns before scaling
filtered_data = filtered_data.drop(columns=['intime_x', 'outtime_x'])

# 2. Separate features and target (using long-term mortality with 'expire_flag')
X = filtered_data.drop(columns=['expire_flag', 'subject_id_x'], errors='ignore')
y = filtered_data['expire_flag']

# 3. Fill missing values
for col in X.columns:
    if X[col].dtype == 'float64':
        X[col] = X[col].fillna(X[col].median())
    elif X[col].dtype == 'object':
        X[col] = X[col].fillna('Unknown')

# 4. Encode categorical variables and standardize
X_encoded = pd.get_dummies(X, drop_first=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# 5. Use Lasso for feature selection
lasso = Lasso(alpha=0.01, random_state=42)
lasso.fit(X_scaled, y)

# 6. Filter Lasso-selected important features
pattern = re.compile(r'(_\d{4}-\d{2}-\d{2})|(_[A-Z]{2,}\d*)')
lasso_selected_features = [feat for feat in X_encoded.columns[(lasso.coef_ != 0)] if not pattern.search(feat)]
print("Filtered Lasso-selected important features:", lasso_selected_features)

# Manually ensure these important features are included, including dummy variables
mandatory_features = [
    'age_years', 'gcsverbal', 'gcsmotor', 'bloodureanitrogen',
    'admission_location_EMERGENCY ROOM ADMIT', 'admission_location_PHYS REFERRAL/NORMAL DELI',
    'admission_location_CLINIC REFERRAL/PREMATURE', 'admission_location_TRANSFER FROM HOSP/EXTRAM',
    'admission_type_URGENT', 'is_weekend_admission'
]

# Add each mandatory feature if it’s missing
for feature in mandatory_features:
    if feature not in lasso_selected_features:
        lasso_selected_features.append(feature)

print("Final feature list after manually adding mandatory features:", lasso_selected_features)

# 7. Generate the final encoded dataset with selected features, ensuring column consistency
final_data_encoded = pd.get_dummies(filtered_data, drop_first=True)
for col in lasso_selected_features:
    if col not in final_data_encoded.columns:
        final_data_encoded[col] = 0  # Add missing columns and fill with 0

# Include 'expire_flag' as the target variable in the final dataset
final_data_selected = final_data_encoded[lasso_selected_features + ['expire_flag']]

# 8. Fill NaN values for specific columns
for col in ['bloodureanitrogen', 'intnormalisedratio', 'chloride', 'hematocrit']:
    if col in final_data_selected.columns:
        final_data_selected[col].fillna(final_data_selected[col].median(), inplace=True)

# 9. Check for NaN values
nan_summary = final_data_selected.isna().sum()
print("NaN Value Statistics After Filling:")
print(nan_summary[nan_summary > 0])

# Check the final shape of the data
print("Final dataset shape:", final_data_selected.shape)

# Save the final data
output_path = '/root/DATA/cleaned_cox_dataset.csv'
final_data_selected.to_csv(output_path, index=False)
print(f"The cleaned data with long-term mortality flag as 'expire_flag' has been exported to {output_path}")


Filtered Lasso-selected important features: ['los_x', 'positiveculture', 'gcs', 'gcsmotor', 'lactate', 'bloodureanitrogen', 'hemoglobin', 'intnormalisedratio', 'albumin', 'chloride', 'hematocrit', 'age_years', 'insurance_Medicare']
Final feature list after manually adding mandatory features: ['los_x', 'positiveculture', 'gcs', 'gcsmotor', 'lactate', 'bloodureanitrogen', 'hemoglobin', 'intnormalisedratio', 'albumin', 'chloride', 'hematocrit', 'age_years', 'insurance_Medicare', 'gcsverbal', 'admission_location_EMERGENCY ROOM ADMIT', 'admission_location_PHYS REFERRAL/NORMAL DELI', 'admission_location_CLINIC REFERRAL/PREMATURE', 'admission_location_TRANSFER FROM HOSP/EXTRAM', 'admission_type_URGENT', 'is_weekend_admission']
NaN Value Statistics After Filling:
gcsmotor         75
lactate       15812
hemoglobin     9493
albumin       18060
gcsverbal        67
dtype: int64
Final dataset shape: (20101, 21)
The cleaned data with long-term mortality flag as 'expire_flag' has been exported to /ro