In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# # Load your dataset (adjust the path as needed)
data = pd.read_csv("final_ml_data.csv")
data[['N170', 'N171', 'N172', 'N178', 'N179']] = data[['N170', 'N171', 'N172', 'N178', 'N179']].fillna(0)
data['aki_occurred'] = data[['N170', 'N171', 'N172', 'N178', 'N179']].max(axis=1)

feature_cols = [
    'drug_encoded', 'creatinine_level', 'previous_creatinine_level',
    'creatinine_change', 'gender_encoded', 'anchor_age',
    'race_encoded', 'avg_bmi'
]
data[feature_cols] = data[feature_cols].fillna(0)
data = data.drop_duplicates(subset=['subject_id', 'hadm_id'])

# Split data
X = data[feature_cols]
y = data['aki_occurred']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check initial class distribution
print("Class distribution in target variable 'aki_occurred':")
print(y_train.value_counts())
print("Name: count, dtype: int64")

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check distribution after SMOTE
print("\nClass distribution after applying SMOTE:")
print(pd.Series(y_train_resampled).value_counts())
print("Name: count, dtype: int64")

# Random Forest Model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Random Forest predictions and evaluation
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"\nRandom Forest Accuracy after handling class imbalance: {accuracy_rf:.4f}")
print("Random Forest Classification Report after handling class imbalance:")
print(classification_report(y_test, y_pred_rf))

Class distribution in target variable 'aki_occurred':
aki_occurred
0.0    72930
1.0    10237
Name: count, dtype: int64
Name: count, dtype: int64

Class distribution after applying SMOTE:
aki_occurred
1.0    72930
0.0    72930
Name: count, dtype: int64
Name: count, dtype: int64

Random Forest Accuracy after handling class imbalance: 0.8471
Random Forest Classification Report after handling class imbalance:
              precision    recall  f1-score   support

         0.0       0.90      0.92      0.91     31292
         1.0       0.35      0.29      0.32      4351

    accuracy                           0.85     35643
   macro avg       0.63      0.61      0.62     35643
weighted avg       0.84      0.85      0.84     35643

