# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import scale
from datetime import datetime
from IPython.display import display

# ------------------------------
# Load the simulated medication events from med_events.csv
# ------------------------------
file_path = '/mnt/data/med_events.csv'
# Parse DATE leniently (values that cannot be parsed become NaT) and keep
# only the calendar-date part of each timestamp.
med_events = pd.read_csv(file_path).assign(
    DATE=lambda frame: pd.to_datetime(frame['DATE'], errors='coerce').dt.date
)
# `tidy` is an independent copy of the loaded events; it is the frame handed
# to the clustering functions below.
tidy = med_events.copy()

# ------------------------------
# Helper: _interval_ecdf (preprocessing shared by SeeDBSCAN and SeeKMeans)
# ------------------------------
def _interval_ecdf(arg1, tidy):
    """Build the truncated ECDF of between-event intervals for one medication.

    Args:
        arg1: Value of the ``CATEGORY`` column selecting the medication.
        tidy: DataFrame with at least ``PATIENT_ID``, ``CATEGORY`` and
            ``DATE`` columns.

    Returns:
        A ``(final_df, dfper)`` tuple. ``final_df`` holds the rows of every
        patient with at least two usable events, sorted by patient and date
        with a fresh 0..n-1 index. ``dfper`` has columns ``x`` (interval in
        whole days, ascending) and ``y`` (ECDF value), limited to ``y <= 0.8``.

    Raises:
        ValueError: If the medication has no rows, no patient has two or more
            usable events, or nothing survives the ECDF cut.
    """
    data_med = tidy[tidy['CATEGORY'] == arg1].copy()
    if data_med.empty:
        raise ValueError(f"No data found for medication {arg1}")

    # Rows without a valid date (NaT) cannot produce an interval; left in,
    # they would turn into huge negative integers after the cast to int.
    # Rows without a patient id cannot be attributed to anyone either.
    data_med['_event_ts'] = pd.to_datetime(data_med['DATE'], errors='coerce')
    data_med = data_med.dropna(subset=['PATIENT_ID', '_event_ts'])

    # keep=False flags *every* row of a patient that appears more than once,
    # i.e. exactly the patients that contribute at least one interval.
    has_repeat = data_med['PATIENT_ID'].duplicated(keep=False)
    repeat_rows = data_med[has_repeat].sort_values(by=['PATIENT_ID', '_event_ts'])
    if repeat_rows.empty:
        raise ValueError("No intervals found. Each patient must have at least two prescriptions.")

    # Consecutive differences within each patient; the first row of each
    # patient has no predecessor and yields NaT, which is discarded.
    gaps = repeat_rows.groupby('PATIENT_ID')['_event_ts'].diff().dropna()
    intervals = gaps.dt.days.to_numpy()

    ecdf_x = np.sort(intervals)
    ecdf_y = np.arange(1, ecdf_x.size + 1) / ecdf_x.size
    dfper = pd.DataFrame({'x': ecdf_x, 'y': ecdf_y})
    # Keep the lower 80% of the ECDF. The explicit copy makes the result an
    # independent frame so callers can add a column without pandas'
    # SettingWithCopyWarning.
    dfper = dfper[dfper['y'] <= 0.8].copy()
    if dfper.empty:
        raise ValueError("No data points remain after filtering for 80% ECDF")

    final_df = repeat_rows.drop(columns='_event_ts').reset_index(drop=True)
    return final_df, dfper

# ------------------------------
# Function: SeeDBSCAN (using DBSCAN Clustering)
# ------------------------------
def SeeDBSCAN(arg1, tidy, eps=1.5, min_samples=5):
    """Cluster a medication's event intervals with DBSCAN.

    Args:
        arg1: Value of the ``CATEGORY`` column selecting the medication.
        tidy: DataFrame with ``PATIENT_ID``, ``CATEGORY`` and ``DATE`` columns.
        eps: DBSCAN neighbourhood radius, applied to the standardized
            intervals (the intervals are passed through ``scale`` first).
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        dict with ``'data'`` (rows of patients having at least two usable
        events) and ``'ecdf_data'`` (columns ``x``, ``y`` and ``cluster``,
        the label returned by ``DBSCAN.fit_predict``).

    Raises:
        ValueError: If no usable intervals exist for the medication.
    """
    final_df, dfper = _interval_ecdf(arg1, tidy)

    # DBSCAN Clustering on the standardized interval lengths
    scaled = scale(dfper[['x']])
    dfper['cluster'] = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(scaled)

    return {'data': final_df, 'ecdf_data': dfper}

# ------------------------------
# Function: SeeKMeans (using K-Means Clustering)
# ------------------------------
def SeeKMeans(arg1, tidy, n_clusters=3):
    """Cluster a medication's event intervals with K-Means.

    Args:
        arg1: Value of the ``CATEGORY`` column selecting the medication.
        tidy: DataFrame with ``PATIENT_ID``, ``CATEGORY`` and ``DATE`` columns.
        n_clusters: Number of K-Means clusters to fit.

    Returns:
        dict with ``'data'`` (rows of patients having at least two usable
        events) and ``'ecdf_data'`` (columns ``x``, ``y`` and ``cluster``,
        the label returned by ``KMeans.fit_predict``).

    Raises:
        ValueError: If no usable intervals exist for the medication, or if
            fewer intervals than ``n_clusters`` remain after the ECDF cut.
    """
    final_df, dfper = _interval_ecdf(arg1, tidy)

    # Fail early with an actionable message instead of letting the estimator
    # reject the input further down.
    if len(dfper) < n_clusters:
        raise ValueError(
            f"Only {len(dfper)} interval(s) remain after ECDF filtering; "
            f"need at least n_clusters={n_clusters}"
        )

    # K-Means Clustering on the standardized interval lengths
    scaled = scale(dfper[['x']])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    dfper['cluster'] = kmeans.fit_predict(scaled)

    return {'data': final_df, 'ecdf_data': dfper}

# ------------------------------
# Function: compare_clustering_methods
# ------------------------------
def compare_clustering_methods(arg1, tidy):
    """Show DBSCAN and K-Means interval clusterings side by side.

    Runs ``SeeDBSCAN`` and ``SeeKMeans`` for the medication ``arg1`` on
    ``tidy`` and draws each result's ``'ecdf_data'`` as a scatter plot
    (interval on the x axis, ECDF value on the y axis, coloured by cluster)
    in a two-panel figure, then displays the figure. Returns nothing.
    """
    # (subplot position, clustering result, panel title) for each method.
    panels = (
        (1, SeeDBSCAN(arg1, tidy), f"DBSCAN Clustering ({arg1})"),
        (2, SeeKMeans(arg1, tidy), f"K-Means Clustering ({arg1})"),
    )

    plt.figure(figsize=(12,5))

    for position, result, title in panels:
        plt.subplot(1, 2, position)
        sns.scatterplot(
            data=result['ecdf_data'],
            x='x',
            y='y',
            hue='cluster',
            palette='deep',
            s=50,
        )
        plt.xlabel("Event Interval (days)")
        plt.ylabel("ECDF Value")
        plt.title(title)
        plt.grid(True, linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

# ------------------------------
# Example usage: compare both clustering methods for each medication
# ------------------------------
for banner, medication in (
    ("=== Clustering Comparison for medA ===", "medA"),
    ("\n=== Clustering Comparison for medB ===", "medB"),
):
    print(banner)
    compare_clustering_methods(medication, tidy)
