In [2]:
!pip install pandas numpy scikit-learn



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import warnings
import gc
import sys

warnings.filterwarnings('ignore')

# MEMORY OPTIMIZATION SETTINGS
print("=" * 60)
print("MEMORY OPTIMIZATION SETTINGS")
print("=" * 60)

# Reduce sample size for faster processing (adjust as needed)
SAMPLE_SIZE = 150000  # Use subset of data
USE_SAMPLING = True  # Set to False to use full dataset

# Reduce TF-IDF features
MAX_FEATURES = 20000
MAX_DF = 0.90  # Filter very common terms
MIN_DF = 10  # Filter very rare terms

print(f"Sample size: {SAMPLE_SIZE if USE_SAMPLING else 'Full dataset'}")
print(f"Max TF-IDF features: {MAX_FEATURES}")
print()

# 1. LOAD DATA WITH MEMORY OPTIMIZATION
print("=" * 60)
print("STEP 1: Loading Data")
print("=" * 60)

data_path = "/content/drive/MyDrive/mimic_readmission_data"

# Load admissions with only needed columns
df_admissions = pd.read_csv(
    f"{data_path}/admissions_with_readmission_labels.csv",
    usecols=['hadm_id', 'readmitted_30day']
)

print(f"Admissions data shape: {df_admissions.shape}")

# Load notes in chunks and process efficiently
print("Loading discharge notes...")
df_discharge = pd.read_csv(
    f"{data_path}/discharge_notes.csv",
    usecols=['hadm_id', 'text']
)

print("Loading radiology notes...")
df_radiology = pd.read_csv(
    f"{data_path}/radiology_notes.csv",
    usecols=['hadm_id', 'text']
)

print(f"Discharge notes shape: {df_discharge.shape}")
print(f"Radiology notes shape: {df_radiology.shape}")
print()

# 2. PREPARE TEXT FEATURES - MEMORY EFFICIENT
print("=" * 60)
print("STEP 2: Preparing Text Features (Memory Efficient)")
print("=" * 60)

# Combine notes
df_notes = pd.concat([df_discharge, df_radiology], ignore_index=True)
print(f"Combined notes shape: {df_notes.shape}")

# Clear memory
del df_discharge, df_radiology
gc.collect()

# Group and aggregate with string limits to reduce memory
print("Aggregating notes by admission...")
df_notes_agg = df_notes.groupby('hadm_id')['text'].apply(
    lambda texts: ' '.join(str(text)[:5000] for text in texts if pd.notna(text))[:20000]  # Limit text length
).reset_index()

df_notes_agg.columns = ['hadm_id', 'all_notes_text']
print(f"Aggregated notes shape: {df_notes_agg.shape}")

# Clear memory
del df_notes
gc.collect()
print()

# 3. PREPARE FINAL DATASET
print("=" * 60)
print("STEP 3: Preparing Final Dataset")
print("=" * 60)

# Merge with aggregated notes
df_final = df_admissions.merge(df_notes_agg, on='hadm_id', how='inner')

# Clear memory
del df_admissions, df_notes_agg
gc.collect()

# Handle missing text data
df_final['all_notes_text'] = df_final['all_notes_text'].fillna('')

# Remove rows with empty text or missing target
df_final = df_final[df_final['all_notes_text'].str.len() > 50]  # At least 50 chars
df_final = df_final.dropna(subset=['readmitted_30day'])

print(f"Final dataset shape: {df_final.shape}")

# SAMPLING FOR MEMORY EFFICIENCY
if USE_SAMPLING and len(df_final) > SAMPLE_SIZE:
    print(f"\n‚ö†Ô∏è  Using stratified sample of {SAMPLE_SIZE} records for memory efficiency")
    df_final = df_final.groupby('readmitted_30day', group_keys=False).apply(
        lambda x: x.sample(n=min(len(x), int(SAMPLE_SIZE * (len(x) / len(df_final)))), random_state=42)
    ).reset_index(drop=True)
    print(f"Sampled dataset shape: {df_final.shape}")

print(f"Readmission distribution:\n{df_final['readmitted_30day'].value_counts()}")
print(f"Readmission rate: {df_final['readmitted_30day'].mean():.2%}")
print()

# 4. DEFINE X AND y
print("=" * 60)
print("STEP 4: Defining Features and Target")
print("=" * 60)

X = df_final['all_notes_text']
y = df_final['readmitted_30day'].astype(int)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Clear memory
del df_final
gc.collect()
print()

# 5. TRAIN-TEST SPLIT
print("=" * 60)
print("STEP 5: Train-Test Split")
print("=" * 60)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training readmission rate: {y_train.mean():.2%}")
print(f"Test readmission rate: {y_test.mean():.2%}")
print()

# 6. TF-IDF VECTORIZATION - OPTIMIZED
print("=" * 60)
print("STEP 6: TF-IDF Vectorization (Optimized)")
print("=" * 60)

# Initialize TF-IDF Vectorizer with memory-efficient settings
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=MAX_FEATURES,
    ngram_range=(1, 1),  # Only unigrams to save memory
    min_df=MIN_DF,
    max_df=MAX_DF,
    strip_accents='unicode',
    lowercase=True,
    dtype=np.float32  # Use float32 instead of float64
)

# Fit and transform training data
print("Fitting TF-IDF vectorizer on training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF training matrix shape: {X_train_tfidf.shape}")
print(f"TF-IDF test matrix shape: {X_test_tfidf.shape}")

# Clear memory
del X_train, X_test
gc.collect()
print()

# 7. TRAIN AND EVALUATE ALL MODELS
print("=" * 60)
print("STEP 7: Training and Evaluating All Models")
print("=" * 60)

# Calculate scale_pos_weight for XGBoost
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# Get feature names for interpretability
feature_names = tfidf_vectorizer.get_feature_names_out()


def evaluate_and_write(model, model_name, X_test, y_test, file_handle, feature_names=None):
    """
    Evaluate a trained model and write results to file AND print to console.

    Parameters:
    - model: trained sklearn/xgboost model
    - model_name: string name of the model
    - X_test: test features
    - y_test: test labels
    - file_handle: open file handle to write to
    - feature_names: array of feature names (optional, for feature importance)
    """
    # Make predictions
    y_pred = model.predict(X_test)

    # Get probability predictions
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        # For LinearSVC which doesn't have predict_proba by default
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = y_pred

    # Prepare output content
    output = []
    output.append("=" * 60)
    output.append(f"MODEL: {model_name}")
    output.append("=" * 60)
    output.append("")

    # Classification report
    output.append("üìä CLASSIFICATION REPORT:")
    output.append("-" * 60)
    report = classification_report(y_test, y_pred, target_names=['Not Readmitted', 'Readmitted'])
    output.append(report)

    # Confusion matrix
    output.append("\nüìà CONFUSION MATRIX:")
    output.append("-" * 60)
    cm = confusion_matrix(y_test, y_pred)
    output.append(str(cm))
    output.append(f"True Negatives:  {cm[0, 0]}")
    output.append(f"False Positives: {cm[0, 1]}")
    output.append(f"False Negatives: {cm[1, 0]}")
    output.append(f"True Positives:  {cm[1, 1]}")

    # ROC-AUC
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    output.append(f"\nüéØ ROC-AUC SCORE: {roc_auc:.4f}")

    # Summary metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    output.append("\nüìå SUMMARY METRICS:")
    output.append("-" * 60)
    output.append(f"Accuracy:  {accuracy:.4f}")
    output.append(f"Precision: {precision:.4f}")
    output.append(f"Recall:    {recall:.4f}")
    output.append(f"F1-Score:  {f1:.4f}")
    output.append(f"ROC-AUC:   {roc_auc:.4f}")

    # Feature importance
    if feature_names is not None:
        output.append("\nüîù TOP 15 MOST PREDICTIVE FEATURES:")
        output.append("-" * 60)

        # Check if model has coefficients (Linear models)
        if hasattr(model, 'coef_'):
            coefficients = model.coef_[0] if len(model.coef_.shape) > 1 else model.coef_

            top_positive_idx = np.argsort(coefficients)[-15:][::-1]
            output.append("\nMost predictive of READMISSION:")
            for idx in top_positive_idx:
                output.append(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

            top_negative_idx = np.argsort(coefficients)[:15]
            output.append("\nMost predictive of NO READMISSION:")
            for idx in top_negative_idx:
                output.append(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

        # Check if model has feature importances (Tree-based models)
        elif hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_

            top_idx = np.argsort(importances)[-15:][::-1]
            output.append("\nTop 15 Most Important Features:")
            for idx in top_idx:
                output.append(f"  {feature_names[idx]}: {importances[idx]:.4f}")

    output.append("")
    output.append("=" * 60)
    output.append("")

    # Join all output lines
    full_output = "\n".join(output)

    # Write to file
    file_handle.write(full_output)
    file_handle.flush()  # Ensure it's written immediately

    # Print to console
    print(full_output)

    return accuracy, precision, recall, f1, roc_auc


# Initialize output file (create empty file at start)
with open('output.txt', 'w') as f:
    f.write("MODEL EVALUATION RESULTS\n")
    f.write("=" * 60 + "\n\n")

print("\nüíæ Output file initialized: output.txt\n")

# Train and evaluate models one by one, writing after each
try:
    # MODEL 1: Logistic Regression
    print("üîÑ Training Logistic Regression...")
    logreg = LogisticRegression(
        class_weight='balanced',
        max_iter=500,
        random_state=42,
        solver='saga',
        penalty='l2',
        C=1.0,
        verbose=0
    )
    logreg.fit(X_train_tfidf, y_train)

    with open('output.txt', 'a') as f:
        evaluate_and_write(logreg, "Logistic Regression", X_test_tfidf, y_test, f, feature_names)

    # Clear model from memory
    del logreg
    gc.collect()
    print("‚úÖ Logistic Regression complete and saved!\n")

except Exception as e:
    print(f"‚ùå Error with Logistic Regression: {e}\n")

try:
    # MODEL 2: Linear SVM
    print("üîÑ Training Linear SVM...")
    linear_svm = LinearSVC(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        dual=False,
        verbose=0
    )
    linear_svm.fit(X_train_tfidf, y_train)

    with open('output.txt', 'a') as f:
        evaluate_and_write(linear_svm, "Linear SVM (LinearSVC)", X_test_tfidf, y_test, f, feature_names)

    # Clear model from memory
    del linear_svm
    gc.collect()
    print("‚úÖ Linear SVM complete and saved!\n")

except Exception as e:
    print(f"‚ùå Error with Linear SVM: {e}\n")

try:
    # MODEL 3: Random Forest (REDUCED for memory)
    print("üîÑ Training Random Forest...")
    rf = RandomForestClassifier(
        n_estimators=50,  # REDUCED from 100
        max_depth=20,  # REDUCED from 25
        min_samples_leaf=20,  # INCREASED from 10
        max_features='sqrt',  # Add this to reduce memory
        class_weight='balanced',
        n_jobs=2,  # REDUCED from -1 to limit parallel memory
        random_state=42,
        verbose=0
    )
    rf.fit(X_train_tfidf, y_train)

    with open('output.txt', 'a') as f:
        evaluate_and_write(rf, "Random Forest", X_test_tfidf, y_test, f, feature_names)

    # Clear model from memory
    del rf
    gc.collect()
    print("‚úÖ Random Forest complete and saved!\n")

except Exception as e:
    print(f"‚ùå Error with Random Forest: {e}\n")

try:
    # MODEL 4: XGBoost (REDUCED for memory)
    print("üîÑ Training XGBoost...")
    xgb = XGBClassifier(
        n_estimators=100,  # REDUCED from 200
        max_depth=6,  # REDUCED from 8
        tree_method='hist',
        scale_pos_weight=scale_pos_weight,
        learning_rate=0.1,
        subsample=0.8,  # Add subsampling to reduce memory
        colsample_bytree=0.8,  # Add feature sampling
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbosity=0,
        nthread=2  # Limit threads to reduce memory
    )
    xgb.fit(
        X_train_tfidf,
        y_train,
        eval_set=[(X_test_tfidf, y_test)],
        verbose=False
    )

    with open('output.txt', 'a') as f:
        evaluate_and_write(xgb, "XGBoost", X_test_tfidf, y_test, f, feature_names)

    # Clear model from memory
    del xgb
    gc.collect()
    print("‚úÖ XGBoost complete and saved!\n")

except Exception as e:
    print(f"‚ùå Error with XGBoost: {e}\n")

print("\n" + "=" * 60)
print("‚úÖ ALL MODEL EVALUATIONS COMPLETE!")
print("üìÑ Results saved to output.txt")
print("=" * 60)

MEMORY OPTIMIZATION SETTINGS
Sample size: 150000
Max TF-IDF features: 20000

STEP 1: Loading Data
Admissions data shape: (374139, 2)
Loading discharge notes...
Loading radiology notes...
Discharge notes shape: (331731, 2)
Radiology notes shape: (1144023, 2)

STEP 2: Preparing Text Features (Memory Efficient)
Combined notes shape: (1475754, 2)
Aggregating notes by admission...
Aggregated notes shape: (374139, 2)

STEP 3: Preparing Final Dataset
Final dataset shape: (374139, 3)

‚ö†Ô∏è  Using stratified sample of 150000 records for memory efficiency
Sampled dataset shape: (149999, 3)
Readmission distribution:
readmitted_30day
0    119641
1     30358
Name: count, dtype: int64
Readmission rate: 20.24%

STEP 4: Defining Features and Target
X shape: (149999,)
y shape: (149999,)

STEP 5: Train-Test Split
Training set size: 119999
Test set size: 30000
Training readmission rate: 20.24%
Test readmission rate: 20.24%

STEP 6: TF-IDF Vectorization (Optimized)
Fitting TF-IDF vectorizer on training 