In [None]:
!pip install azure-storage-blob python-dotenv pandas scikit-learn lightgbm



In [None]:
import getpass
import io
import os
import pickle

import lightgbm as lgb
import pandas as pd
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
load_dotenv('../.env')

# --- CONFIGURATION ---
# Last-resort override if the .env file isn't loading: paste the SAS token
# between the quotes below. A SAS token is a credential, so clear this value
# again before saving or committing the notebook.
MANUAL_SAS_TOKEN = ""

STORAGE_ACCOUNT_NAME = os.getenv("STORAGE_ACCOUNT_NAME", "clinicaldatalake25")

# Token precedence: manual override first, then the SAS_TOKEN environment
# variable. Surrounding whitespace is stripped so these sources are normalized
# the same way as the interactive prompt below.
SAS_TOKEN = (MANUAL_SAS_TOKEN or os.getenv("SAS_TOKEN", "")).strip()

if not SAS_TOKEN:
    print("⚠️ SAS_TOKEN not found in environment variables or manual override.")
    print("Please paste your SAS Token in the input box that appears at the TOP of the screen (VS Code) or below (Colab).")
    try:
        # getpass prompts without echoing the typed value, unlike input().
        SAS_TOKEN = getpass.getpass("Enter SAS Token: ").strip()
    except Exception as prompt_error:
        # Best-effort prompt: if no interactive input is available, SAS_TOKEN
        # stays empty and the check below raises with instructions.
        print(f"⚠️ Could not prompt for the SAS token: {prompt_error}")

if not SAS_TOKEN:
    raise ValueError("SAS_TOKEN is required to proceed. Please paste it in the variable 'MANUAL_SAS_TOKEN' above and run this cell again.")

print(f"✅ Using Storage Account: {STORAGE_ACCOUNT_NAME}")

⚠️ SAS_TOKEN not found in environment variables or manual override.
Please paste your SAS Token in the input box that appears at the TOP of the screen (VS Code) or below (Colab).
✅ Using Storage Account: clinicaldatalake25


In [None]:
# All input data is read from Azure Blob Storage; nothing is loaded from local disk.
# Prerequisite: run 'scripts/upload_csv_to_azure.py' locally before this cell.

try:
    print("☁️ Connecting to Azure to download dataset...")
    account_url = "https://" + STORAGE_ACCOUNT_NAME + ".blob.core.windows.net"
    blob_service_client = BlobServiceClient(account_url=account_url, credential=SAS_TOKEN)

    DATA_CONTAINER = "data-source"
    DATA_BLOB = "PatientNoShowKaggleMay2016.csv"

    blob_client = blob_service_client.get_blob_client(container=DATA_CONTAINER, blob=DATA_BLOB)
    downloader = blob_client.download_blob()

    # Read the whole blob into an in-memory buffer and parse it as CSV.
    with io.BytesIO(downloader.readall()) as csv_buffer:
        df = pd.read_csv(csv_buffer)

    print(f"✅ Successfully downloaded and loaded '{DATA_BLOB}'.")

    # Normalize headers: lower-case them and drop hyphens, which is the
    # spelling the later cells use when selecting columns.
    df.columns = df.columns.str.lower().str.replace('-', '', regex=False)
    print("Columns sanitized for processing.")

except Exception as download_error:
    # Report the failure in the cell output, then re-raise so execution stops here.
    print(f"❌ ERROR: Failed to download data from Azure: {download_error}")
    raise

☁️ Connecting to Azure to download dataset...
✅ Successfully downloaded and loaded 'PatientNoShowKaggleMay2016.csv'.
Columns sanitized for processing.


In [None]:
# --- FEATURE ENGINEERING ---
# Every step in this cell is idempotent: re-running it on an already
# processed `df` leaves the data unchanged instead of zeroing out the
# label/gender columns or refitting encoders on already-encoded values.


def _to_binary_flag(series, positive_label):
    """Return `series` as 0/1 integers, with 1 where a value equals `positive_label`.

    A column that is already numeric is treated as previously encoded and is
    only cast to int, which is what makes re-running the cell safe.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(int)
    return series.eq(positive_label).astype(int)


df['noshow'] = _to_binary_flag(df['noshow'], 'Yes')
df['gender'] = _to_binary_flag(df['gender'], 'M')

df['scheduledday'] = pd.to_datetime(df['scheduledday'])
df['appointmentday'] = pd.to_datetime(df['appointmentday'])

df['scheduled_year'] = df['scheduledday'].dt.year
df['scheduled_month'] = df['scheduledday'].dt.month
df['scheduled_day'] = df['scheduledday'].dt.day
df['scheduled_weekday'] = df['scheduledday'].dt.dayofweek

# Lead days: whole calendar days between scheduling and the appointment.
# Both timestamps are truncated to midnight first so time-of-day is ignored.
lead_days = (df['appointmentday'].dt.normalize() - df['scheduledday'].dt.normalize()).dt.days
# Negative values (appointment dated before it was scheduled) are floored at 0.
# clip leaves a missing lead time as NaN; it is not coerced to 0.
df['lead_days'] = lead_days.clip(lower=0)

categorical_features = ['neighbourhood']
# Keep the fitted encoders: their label -> integer mapping is required to
# encode new records the same way the training data was encoded.
if 'label_encoders' not in globals():
    label_encoders = {}
for col in categorical_features:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue  # already encoded by an earlier run of this cell
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [None]:
# --- SPLIT STRATEGY ---
# Two separate models are trained, one per lead-time regime:
#   * Same-Day Model -> rows with lead_days == 0
#   * Future Model   -> rows with lead_days > 0
# This prevents the 'SMS Received' feature from acting as a proxy for
# 'Future Appointment' in the Future Model.

df_same_day = df.loc[df['lead_days'].eq(0)]
df_future = df.loc[df['lead_days'].gt(0)]

print(f"Same-Day Records: {len(df_same_day)}")
print(f"Future Records: {len(df_future)}")

# Model inputs (column order is the order the models are trained with).
features = [
    'gender',
    'age',
    'neighbourhood',
    'scholarship',
    'hipertension',
    'diabetes',
    'alcoholism',
    'handcap',
    'sms_received',
    'scheduled_year',
    'scheduled_month',
    'scheduled_day',
    'scheduled_weekday',
    'lead_days',
]
# Binary label column: 1 = patient did not show up.
target = 'noshow'

def train_model(dataframe, model_name, num_boost_round=50, stopping_rounds=5,
                test_size=0.2, random_state=42, params=None):
    """Train a LightGBM binary classifier on one slice of the appointment data.

    Columns are selected with the module-level `features` list and `target`
    name. The frame is split into train and validation parts, and training
    stops early when the validation log-loss stops improving.

    Args:
        dataframe: Frame containing every column in `features` plus `target`.
        model_name: Label used in progress and error messages.
        num_boost_round: Maximum number of boosting rounds.
        stopping_rounds: Rounds without validation improvement before stopping.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for the train/validation split.
        params: Optional LightGBM parameters merged over the defaults below.

    Returns:
        The trained booster returned by `lgb.train`.

    Raises:
        ValueError: If `dataframe` has no rows.
    """
    print(f"\n--- Training {model_name} ---")
    if len(dataframe) == 0:
        raise ValueError(f"Cannot train {model_name}: the input frame has no rows.")

    X = dataframe[features]
    y = dataframe[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    lgb_train = lgb.Dataset(X_train, y_train)
    # reference=lgb_train makes the validation set reuse the training set's binning.
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    default_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9
    }
    # Caller-supplied values override the defaults key by key.
    train_params = {**default_params, **(params or {})}

    gbm = lgb.train(train_params,
                    lgb_train,
                    num_boost_round=num_boost_round,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds)])
    return gbm

# Fit one model per lead-time regime; the same-day model is trained first.
model_same_day = train_model(dataframe=df_same_day, model_name="Same-Day Model")
model_future = train_model(dataframe=df_future, model_name="Future Model")

Same-Day Records: 38568
Future Records: 71959

--- Training Same-Day Model ---
[LightGBM] [Info] Number of positive: 1468, number of negative: 29386
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002632 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 229
[LightGBM] [Info] Number of data points in the train set: 30854, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047579 -> initscore=-2.996617
[LightGBM] [Info] Start training from score -2.996617
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's binary_logloss: 0.157822

--- Training Future Model ---
[LightGBM] [Info] Number of positive: 16431, number of negative: 41136
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008317 seconds.
You can set

In [None]:
# Save both models in a single dictionary, keyed by the lead-time regime
# each one was trained on.
model_artifacts = {
    "same_day_model": model_same_day,
    "future_model": model_future
}

# NOTE: unpickling executes arbitrary code, so consumers must only load this
# artifact from a trusted location.
with open('no_show_model.pkl', 'wb') as f:
    pickle.dump(model_artifacts, f)
print("Models saved locally as 'no_show_model.pkl'")

# --- Upload to Azure Blob Storage ---
print("\nUploading model artifacts to Azure Blob Storage...")

if not SAS_TOKEN:
    print("⚠️  WARNING: SAS_TOKEN not found. Skipping upload.")
else:
    try:
        account_url = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
        blob_service_client = BlobServiceClient(account_url=account_url, credential=SAS_TOKEN)

        CONTAINER_NAME = "ml-models"
        BLOB_NAME = "ops/no_show_model.pkl"

        # Create the container only when it is missing. This stays best-effort
        # (the token may not be allowed to inspect or create containers), but
        # the reason is reported instead of being silently discarded; the
        # upload below still runs and reports its own failure if any.
        try:
            container_client = blob_service_client.get_container_client(CONTAINER_NAME)
            if not container_client.exists():
                container_client.create_container()
        except Exception as container_error:
            print(f"⚠️  Could not verify/create container '{CONTAINER_NAME}' (continuing): {container_error}")

        blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=BLOB_NAME)

        # Upload the file that was just written; overwrite replaces any previous artifact.
        with open("no_show_model.pkl", "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        print(f"✅ SUCCESS: Dual-Model artifacts uploaded to container '{CONTAINER_NAME}' as '{BLOB_NAME}'")

    except Exception as e:
        # The local pickle has already been written, so an upload failure is
        # reported rather than raised.
        print(f"❌ ERROR: Failed to upload to Azure: {e}")

Models saved locally as 'no_show_model.pkl'

Uploading model artifacts to Azure Blob Storage...
✅ SUCCESS: Dual-Model artifacts uploaded to container 'ml-models' as 'ops/no_show_model.pkl'
