# Generate simulated refill histories


In [27]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from datetime import timedelta, datetime
from IPython.display import display  # Correct import
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Logistic function to generate values for adherence patterns (Groups 3 and 4)
def logistic(x, L=0, S=1, D=1, h=1, B=None):
    if B is None:
        B = x - D
    return (h - (5 * L)) / (1 + np.exp(S * B)) + L

# Function to create offsets for different groups
def generate_offset(group, n):
    if group == 1:  # High adherence
        L, U, m, s = -0.1, 0.2, 0.05, 0.1
    elif group == 2:  # Erratic adherence
        L, U, m, s = -0.2, 1.2, 0, 1
    elif group == 3:  # Gradual decline
        offsets = norm.rvs(loc=1, scale=1, size=n)
        time_factor = 2 / n * np.arange(1, n + 1)
        return time_factor * offsets
    elif group == 4:  # Intermittent adherence
        offsets = norm.rvs(loc=1, scale=0.1, size=n)
        return logistic(np.arange(1, n + 1), L=0.05, S=10, D=n, B=np.sin(2 * (np.arange(1, n + 1)) - n)) * offsets
    elif group == 5:  # Partial drop-off
        offsets = norm.rvs(loc=1, scale=1, size=n)
        return logistic(np.arange(1, n + 1), L=0.05, S=-15, D=n/3) * offsets
    elif group == 6:  # Non-persistence
        n = np.random.choice([2, 3])
        L, U, m, s = -0.2, 0.8, 0.3, 1
    else:
        raise ValueError("Invalid group number.")

    pL, pU = norm.cdf([L, U], loc=m, scale=s)
    return norm.ppf(np.random.uniform(pL, pU, n), loc=m, scale=s)

# Generate simulated refill patterns
def generate_refills(ntot, start_date='2022-01-01', total_duration=2*365, disp_durations=[30, 60, 90], dist_durations=[0.3, 0.5, 0.2], group_dist=[0.1, 0.2, 0.2, 0.2, 0.2, 0.1]):
    sample = []
    mean_duration = np.dot(disp_durations, dist_durations)
    n = int(np.ceil((total_duration / mean_duration) * 1.5))
    
    patient_id = 1
    for group_id, group_prob in enumerate(group_dist, 1):
        num_patients = int(round(group_prob * ntot))
        for _ in range(num_patients):
            initial_fill = 30
            offsets = generate_offset(group_id, n)
            refill_durations = np.random.choice(disp_durations, size=len(offsets)-1, p=dist_durations)
            refill_durations = np.insert(refill_durations, 0, initial_fill)
            refill_dates = [datetime.strptime(start_date, '%Y-%m-%d')]
            medicine_type = np.random.choice(['Med_A', 'Med_B', 'Med_C'])

            for i in range(1, len(refill_durations)):
                last_date = refill_dates[-1]
                delay = round(offsets[i-1] * refill_durations[i-1])
                refill_dates.append(last_date + timedelta(days=int(refill_durations[i-1] + delay)))
            
            for date, duration in zip(refill_dates[:-1], refill_durations):
                sample.append({
                    'GROUP': group_id,
                    'PATIENT_ID': patient_id,
                    'DATE': date,
                    'DURATION': duration,
                    'ATC' : medicine_type
                })
            patient_id += 1
    
    return pd.DataFrame(sample)

# Dichotomization function
def dichotomize_adherence(duration, event_interval, threshold=0.8):
    return [1 if (duration / event_interval) >= threshold else 0 for _ in range(event_interval)]

# Clustering function
def perform_clustering(data, n_clusters=6):
    pivot_table = data.pivot_table(index='PATIENT_ID', columns='DATE', values='DURATION', fill_value=0)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(pivot_table)
    patient_clusters = pd.DataFrame({'PATIENT_ID': pivot_table.index, 'CLUSTER': clusters})
    return patient_clusters

# Generate sample data
data = generate_refills(100)  # Simulate data for 100 individuals
display(data.head())

# Save to CSV
data.to_csv('simulated_refill_data.csv', index=False)
print("Data saved to 'simulated_refill_data.csv'")

# Apply clustering to validate groups
patient_clusters = perform_clustering(data)

# Merge cluster assignments back to the original data
data = data.merge(patient_clusters, on='PATIENT_ID')

# Evaluate clustering performance using Adjusted Rand Index (ARI)
ari_score = adjusted_rand_score(data['GROUP'], data['CLUSTER'])
print(f'Adjusted Rand Index (ARI) for clustering: {ari_score:.2f}')


Unnamed: 0,GROUP,PATIENT_ID,DATE,DURATION,ATC
0,1,1,2022-01-01,30,Med_B
1,1,1,2022-02-02,60,Med_B
2,1,1,2022-04-03,60,Med_B
3,1,1,2022-05-29,60,Med_B
4,1,1,2022-07-30,90,Med_B


Data saved to 'simulated_refill_data.csv'
Adjusted Rand Index (ARI) for clustering: -0.00
