In [3]:
!pip install pandas numpy scikit-learn



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
import gc
warnings.filterwarnings('ignore')

# MEMORY OPTIMIZATION SETTINGS
print("=" * 60)
print("MEMORY OPTIMIZATION SETTINGS")
print("=" * 60)

# Reduce sample size for faster processing (adjust as needed)
SAMPLE_SIZE = 150000  # Use subset of data
USE_SAMPLING = True  # Set to False to use full dataset

# Reduce TF-IDF features
MAX_FEATURES = 50000  # Reduced from 5000
MAX_DF = 0.90  # Filter very common terms
MIN_DF = 10  # Filter very rare terms

print(f"Sample size: {SAMPLE_SIZE if USE_SAMPLING else 'Full dataset'}")
print(f"Max TF-IDF features: {MAX_FEATURES}")
print()

# 1. LOAD DATA WITH MEMORY OPTIMIZATION
print("=" * 60)
print("STEP 1: Loading Data")
print("=" * 60)

data_path = "/content/drive/MyDrive/mimic_readmission_data"

# Load admissions with only needed columns
df_admissions = pd.read_csv(
    f"{data_path}/admissions_with_readmission_labels.csv",
    usecols=['hadm_id', 'readmitted_30day']
)

print(f"Admissions data shape: {df_admissions.shape}")

# Load notes in chunks and process efficiently
print("Loading discharge notes...")
df_discharge = pd.read_csv(
    f"{data_path}/discharge_notes.csv",
    usecols=['hadm_id', 'text']
)

print("Loading radiology notes...")
df_radiology = pd.read_csv(
    f"{data_path}/radiology_notes.csv",
    usecols=['hadm_id', 'text']
)

print(f"Discharge notes shape: {df_discharge.shape}")
print(f"Radiology notes shape: {df_radiology.shape}")
print()

# 2. PREPARE TEXT FEATURES - MEMORY EFFICIENT
print("=" * 60)
print("STEP 2: Preparing Text Features (Memory Efficient)")
print("=" * 60)

# Combine notes
df_notes = pd.concat([df_discharge, df_radiology], ignore_index=True)
print(f"Combined notes shape: {df_notes.shape}")

# Clear memory
del df_discharge, df_radiology
gc.collect()

# Group and aggregate with string limits to reduce memory
print("Aggregating notes by admission...")
df_notes_agg = df_notes.groupby('hadm_id')['text'].apply(
    lambda texts: ' '.join(str(text)[:5000] for text in texts if pd.notna(text))[:20000]  # Limit text length
).reset_index()

df_notes_agg.columns = ['hadm_id', 'all_notes_text']
print(f"Aggregated notes shape: {df_notes_agg.shape}")

# Clear memory
del df_notes
gc.collect()
print()

# 3. PREPARE FINAL DATASET
print("=" * 60)
print("STEP 3: Preparing Final Dataset")
print("=" * 60)

# Merge with aggregated notes
df_final = df_admissions.merge(df_notes_agg, on='hadm_id', how='inner')

# Clear memory
del df_admissions, df_notes_agg
gc.collect()

# Handle missing text data
df_final['all_notes_text'] = df_final['all_notes_text'].fillna('')

# Remove rows with empty text or missing target
df_final = df_final[df_final['all_notes_text'].str.len() > 50]  # At least 50 chars
df_final = df_final.dropna(subset=['readmitted_30day'])

print(f"Final dataset shape: {df_final.shape}")

# SAMPLING FOR MEMORY EFFICIENCY
if USE_SAMPLING and len(df_final) > SAMPLE_SIZE:
    print(f"\n‚ö†Ô∏è  Using stratified sample of {SAMPLE_SIZE} records for memory efficiency")
    df_final = df_final.groupby('readmitted_30day', group_keys=False).apply(
        lambda x: x.sample(n=min(len(x), SAMPLE_SIZE // 2), random_state=42)
    ).reset_index(drop=True)
    print(f"Sampled dataset shape: {df_final.shape}")

print(f"Readmission distribution:\n{df_final['readmitted_30day'].value_counts()}")
print(f"Readmission rate: {df_final['readmitted_30day'].mean():.2%}")
print()

# 4. DEFINE X AND y
print("=" * 60)
print("STEP 4: Defining Features and Target")
print("=" * 60)

X = df_final['all_notes_text']
y = df_final['readmitted_30day'].astype(int)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Clear memory
del df_final
gc.collect()
print()

# 5. TRAIN-TEST SPLIT
print("=" * 60)
print("STEP 5: Train-Test Split")
print("=" * 60)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Clear memory
del X, y
gc.collect()

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training readmission rate: {y_train.mean():.2%}")
print(f"Test readmission rate: {y_test.mean():.2%}")
print()

# 6. TF-IDF VECTORIZATION - OPTIMIZED
print("=" * 60)
print("STEP 6: TF-IDF Vectorization (Optimized)")
print("=" * 60)

# Initialize TF-IDF Vectorizer with memory-efficient settings
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=MAX_FEATURES,
    ngram_range=(1, 1),  # Only unigrams to save memory
    min_df=MIN_DF,
    max_df=MAX_DF,
    strip_accents='unicode',
    lowercase=True,
    dtype=np.float32  # Use float32 instead of float64
)

# Fit and transform training data
print("Fitting TF-IDF vectorizer on training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF training matrix shape: {X_train_tfidf.shape}")
print(f"TF-IDF test matrix shape: {X_test_tfidf.shape}")
print(f"Number of features: {len(tfidf_vectorizer.get_feature_names_out())}")
print(f"Matrix density: {X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1]):.4%}")

# Clear memory
del X_train, X_test
gc.collect()
print()

# 7. TRAIN LOGISTIC REGRESSION - OPTIMIZED
print("=" * 60)
print("STEP 7: Training Logistic Regression Model (Optimized)")
print("=" * 60)

# Initialize Logistic Regression with optimized settings
logreg = LogisticRegression(
    class_weight='balanced',
    max_iter=500,  # Reduced iterations
    random_state=42,
    solver='saga',  # Better for large datasets
    penalty='l2',
    C=1.0,
    verbose=1  # Show progress
)

# Train the model
print("Training model...")
logreg.fit(X_train_tfidf, y_train)
print("Model training complete!")
print()

# 8. EVALUATE
print("=" * 60)
print("STEP 8: Model Evaluation")
print("=" * 60)

# Make predictions
print("Making predictions...")
y_pred = logreg.predict(X_test_tfidf)
y_pred_proba = logreg.predict_proba(X_test_tfidf)[:, 1]

# Classification Report
print("\nüìä CLASSIFICATION REPORT:")
print("-" * 60)
print(classification_report(y_test, y_pred, target_names=['Not Readmitted', 'Readmitted']))

# Confusion Matrix
print("\nüìà CONFUSION MATRIX:")
print("-" * 60)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()
print(f"True Negatives:  {cm[0, 0]}")
print(f"False Positives: {cm[0, 1]}")
print(f"False Negatives: {cm[1, 0]}")
print(f"True Positives:  {cm[1, 1]}")

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nüéØ ROC-AUC SCORE: {roc_auc:.4f}")

# Additional Metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nüìå SUMMARY METRICS:")
print("-" * 60)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")

# Top predictive features
print("\nüîù TOP 15 MOST PREDICTIVE FEATURES FOR READMISSION:")
print("-" * 60)
feature_names = tfidf_vectorizer.get_feature_names_out()
coefficients = logreg.coef_[0]

# Get indices of top positive coefficients
top_positive_idx = np.argsort(coefficients)[-15:][::-1]
print("\nMost predictive of READMISSION:")
for idx in top_positive_idx:
    print(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

# Get indices of top negative coefficients
top_negative_idx = np.argsort(coefficients)[:15]
print("\nMost predictive of NO READMISSION:")
for idx in top_negative_idx:
    print(f"  {feature_names[idx]}: {coefficients[idx]:.4f}")

print("\n" + "=" * 60)
print("‚úÖ MODEL TRAINING AND EVALUATION COMPLETE!")
print("=" * 60)

# Memory cleanup
gc.collect()

print("\nüí° TIPS TO USE FULL DATASET:")
print("-" * 60)
print("1. Set USE_SAMPLING = False at the top")
print("2. Increase MAX_FEATURES gradually (2000 ‚Üí 3000 ‚Üí 5000)")
print("3. Use Colab Pro with more RAM")
print("4. Consider using SGDClassifier for even larger datasets")

MEMORY OPTIMIZATION SETTINGS
Sample size: 150000
Max TF-IDF features: 50000

STEP 1: Loading Data
Admissions data shape: (374139, 2)
Loading discharge notes...
Loading radiology notes...
Discharge notes shape: (331731, 2)
Radiology notes shape: (1144023, 2)

STEP 2: Preparing Text Features (Memory Efficient)
Combined notes shape: (1475754, 2)
Aggregating notes by admission...
Aggregated notes shape: (374139, 2)

STEP 3: Preparing Final Dataset
Final dataset shape: (374139, 3)

‚ö†Ô∏è  Using stratified sample of 150000 records for memory efficiency
Sampled dataset shape: (150000, 3)
Readmission distribution:
readmitted_30day
0    75000
1    75000
Name: count, dtype: int64
Readmission rate: 50.00%

STEP 4: Defining Features and Target
X shape: (150000,)
y shape: (150000,)

STEP 5: Train-Test Split
Training set size: 120000
Test set size: 30000
Training readmission rate: 50.00%
Test readmission rate: 50.00%

STEP 6: TF-IDF Vectorization (Optimized)
Fitting TF-IDF vectorizer on training da

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.5s finished



Most predictive of READMISSION:
  admissions: 4.2712
  gastroparesis: 3.2191
  discharged: 3.0377
  ama: 2.9396
  lymphoma: 2.8950
  recently: 2.5270
  scheduled: 2.4818
  overdose: 2.4647
  fix: 2.3801
  aml: 2.3478
  sarcoma: 2.3309
  desensitization: 2.3307
  dilaudid: 2.2801
  recurrent: 2.2784
  osteosarcoma: 2.2314

Most predictive of NO READMISSION:
  expired: -6.4279
  hospice: -3.4069
  cmo: -2.8911
  comfort: -2.8721
  adrs: -2.6578
  uncomplicated: -2.3155
  osh: -2.2041
  file: -2.1036
  detox: -1.9689
  chronicity: -1.9287
  adverse: -1.8991
  hypoxemia: -1.8454
  school: -1.8349
  family: -1.8276
  dni: -1.8263

‚úÖ MODEL TRAINING AND EVALUATION COMPLETE!

üí° TIPS TO USE FULL DATASET:
------------------------------------------------------------
1. Set USE_SAMPLING = False at the top
2. Increase MAX_FEATURES gradually (2000 ‚Üí 3000 ‚Üí 5000)
3. Use Colab Pro with more RAM
4. Consider using SGDClassifier for even larger datasets
