**Load the required libraries and data**

In [1]:
# Libraries
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns 

In [2]:
# Data  
appointment_df = pd.read_csv("../data/Appointments List.csv", 
                             encoding='UTF-16', delimiter="\t")

  appointment_df = pd.read_csv("../data/Appointments List.csv",


# Explanatory Data Analysis
## Data Preparation

In [3]:
appointment_df.head()

Unnamed: 0,Appointment UID,Appt Status Description,Date Of Service,Chartnumber,NurseName,FirstTreatmentApptInd,Appt Type Group,Appt Type,Created By,Provider,Facility City,Room,Column Heading,#,Count of tblAppointments,Patient FID
0,Grand Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,55975,*
1,1,Made,01/01/00,,,N,Default,,system,",",,Other,,1,1,1
2,109,Seen,01/22/18,1,,N,Intake,TMS INTAKE DR. BRENNER,EMILY,"BRENNER,DANIEL A",CAMBRIDGE,Other,DR. BRENNER,2,1,5283
3,239,Seen,01/22/18,2,,N,MH Infusion,KETAMINE MH -INFUSION ONLY,EMILY,"BRENNER,DANIEL A",CAMBRIDGE,Other,2C,3,1,5286
4,240,Made,01/22/18,9,,N,Intake,TMS INTAKE DR. BRENNER,EMILY,"BRENNER,DANIEL A",CAMBRIDGE,Other,DR. BRENNER,4,1,5351


In [4]:
appointment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55976 entries, 0 to 55975
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Appointment UID           55976 non-null  object
 1   Appt Status Description   55976 non-null  object
 2   Date Of Service           37250 non-null  object
 3   Chartnumber               55975 non-null  object
 4   NurseName                 19321 non-null  object
 5   FirstTreatmentApptInd     54678 non-null  object
 6   Appt Type Group           54678 non-null  object
 7   Appt Type                 54677 non-null  object
 8   Created By                55976 non-null  object
 9   Provider                  55976 non-null  object
 10  Facility City             51738 non-null  object
 11  Room                      55976 non-null  object
 12  Column Heading            54697 non-null  object
 13  #                         55976 non-null  object
 14  Count of tblAppointmen

In [5]:
# Drop the first two rows 
appointment_df.drop(index=[0, 1], inplace=True)

In [6]:
appointment_df["Chartnumber"] = pd.to_numeric(appointment_df["Chartnumber"], errors="coerce")
appointment_df["Date"] = pd.to_datetime(appointment_df["Date Of Service"], errors="coerce")

  appointment_df["Date"] = pd.to_datetime(appointment_df["Date Of Service"], errors="coerce")


In [7]:
# Replace "Made" with "Seen"
appointment_df["Appt Status Description"].replace(to_replace="Made", value="Seen", inplace=True)

In [8]:
appointment_df["Facility City"].isna().sum()

4237

In [9]:
appointment_df["Facility City"] = appointment_df["Facility City"].ffill().backfill()

  appointment_df["Facility City"] = appointment_df["Facility City"].ffill().backfill()


In [10]:
# Confirm if there are still null values
appointment_df["Facility City"].isna().sum()

0

In [11]:
# Select the successfull appointments(visits)
seen_df = appointment_df[appointment_df["Appt Status Description"]=="Seen"]

# subset the data set 
subset_columns = ["Date", "Chartnumber", "Appt Type Group", "Facility City"]
seen_df = seen_df[subset_columns]

# Drop any record with null values 
seen_df.dropna(how="any", inplace=True)

In [12]:
# Calculate the percentage of patients who start with intake 
count = 0
chart_numbers = seen_df["Chartnumber"].unique()

for chart_no in chart_numbers:
    # Filter the data set 
    df_by_chart_no = seen_df[seen_df["Chartnumber"]==chart_no].sort_values(by="Date")
    df_by_chart_no.reset_index(drop=True, inplace=True)
    
    if df_by_chart_no["Appt Type Group"][0]=="Intake":
        count+=1
    else:
        pass
    
# Patients enrolled with an intake    
count/seen_df["Chartnumber"].nunique()

0.8556767158434894

approximately 86% of the patients enrolled with an intake instead of treatment or follow up

Features/Variables to include in the post intake dataset 

* Chart number 
* First Service after enrollment
* First Service after intake
* Days after intake
* Days between MH Infusion and Intake 
* Number of visits
* Number of MH infusions

* Number of MH infusions outside cambridge 
* Number of visits outside cambridge

In [13]:
first_service_after_enrollment = []
first_service_after_intake = []
days_after_intake = []
days_btn_mhinfusion_intake = []
visit_count = []
mhInfusion_count = []

# Additional Modifications
infusion_count_outside_cambridge = []
visit_count_outside_cambridge = []
intake_facility_city = []



for chart_no in chart_numbers:
    # Filter the data set by chart number
    df_by_chart_no = seen_df[seen_df["Chartnumber"]==chart_no].sort_values(by="Date")
    df_by_chart_no.reset_index(drop=True, inplace=True)
    
    # Get the required data 
    first_service_after_enrollment.append(df_by_chart_no["Appt Type Group"][0])
    visit_count.append(df_by_chart_no.shape[0])
    mhInfusion_count.append(df_by_chart_no["Appt Type Group"].to_list().count('MH Infusion'))
    
    # Get the additional data 
    ## Filter out Cambridge visits from the data
    outside_cambridge_df = df_by_chart_no[df_by_chart_no["Facility City"]!="CAMBRIDGE"]
    visit_count_outside_cambridge.append(outside_cambridge_df.shape[0])
    
    ## Count the number of infusions outside cambridge
    infusion_count_outside_cambridge.append(outside_cambridge_df["Appt Type Group"].tolist().count('MH Infusion'))
    
    if df_by_chart_no["Appt Type Group"].tolist()[0] == "Intake":
        # Check if there exists multiple services
        if df_by_chart_no.shape[0] > 1:
            first_service_after_intake.append(df_by_chart_no["Appt Type Group"][1])

            # Calculate days after intake
            days_1 = (df_by_chart_no["Date"][1] - df_by_chart_no["Date"][0]).days
            days_after_intake.append(days_1)
            
            # Get the intake facility city 
            intake_facility_city.append(df_by_chart_no["Facility City"][0])

            # Calculate days between intake and MH infusion
            try:
                mhInfusion_index = df_by_chart_no["Appt Type Group"].tolist().index('MH Infusion')
                days_2 = (df_by_chart_no["Date"][mhInfusion_index] - df_by_chart_no["Date"][0]).days
                days_btn_mhinfusion_intake.append(days_2)
            except ValueError: 
                days_btn_mhinfusion_intake.append(np.nan)        
        
        # The service only includes intake
        else:
            print(chart_no)
            intake_facility_city.append(df_by_chart_no["Facility City"][0])
            
            first_service_after_intake.append(np.nan)
            days_after_intake.append(np.nan)
            days_btn_mhinfusion_intake.append(np.nan)
        
    else: 
        # Intake exists  
        if "Intake" in df_by_chart_no["Appt Type Group"].tolist():
            # Note the intake index
            intake_index = df_by_chart_no["Appt Type Group"].tolist().index("Intake")
                
            try:
                first_service_after_intake.append(df_by_chart_no["Appt Type Group"].tolist()[intake_index + 1])
                
                # Get the intake facility city
                intake_facility_city.append(df_by_chart_no["Facility City"][intake_index])
    
                # Calculate days after intake
                days_1 = (df_by_chart_no["Date"][intake_index + 1] - df_by_chart_no["Date"][intake_index]).days
                days_after_intake.append(days_1)
                
                # Calculate days between intake and MH infusion
                try:
                    mhInfusion_index = df_by_chart_no["Appt Type Group"].tolist().index('MH Infusion')
                    days_2 = (df_by_chart_no["Date"][mhInfusion_index] - df_by_chart_no["Date"][0]).days
                    days_btn_mhinfusion_intake.append(days_2)
                except ValueError: 
                    days_btn_mhinfusion_intake.append(np.nan) 
                    
            # if intake is the last service just insert a null value
            except IndexError:
                # Get the intake facility city
                intake_facility_city.append(df_by_chart_no["Facility City"][intake_index])
                
                first_service_after_intake.append(np.nan)
                days_after_intake.append(np.nan)
                days_btn_mhinfusion_intake.append(np.nan)
            
        else: 
            # Get the intake facility city
            intake_facility_city.append(np.nan)
        
            first_service_after_intake.append(np.nan)
            days_after_intake.append(np.nan)
            days_btn_mhinfusion_intake.append(np.nan)

12
116
127
132
162
189
216
217
235
248
300
320
388
414
446
453
485
502
513
525
542
540
547
564
565
567
574
583
586
634
660
676
696
719
726
730
755
767
775
783
787
797
810
799
832
843
906
918
905
928
932
942
944
953
961
970
986
989
1017
1030
1031
1038
1051
1036
1046
1021
1068
1070
1087
1089
977
1085
1095
1100
1105
1090
1121
1135
1137
1149
1156
1130
1115
1177
1178
1153
1227
1235
1249
1258
1259
1272
1298
1331
1334
1341
1226
1353
1373
1374
1357
1396
1436
1377
1433
1447
1454
1474
1469
1497
1251
1516
1530
1559
1563
1569
1576
1591
1582
1607
1601
1603
1615
1587
1621
1242
1646
1579
1655
779
1675
1705
1706
1714
1720
1722
1728
1736
1739
1753
1758
1761
1648
1787
1814
1839
1857
1862
1877
1894
1907
1918
1867
1957
1797
1874
1996
1987
2004
1871
2009
2035
1768
2024
2025
1975
2058
2045
2068
2094
2119
2120
2110
2165
2159
2139
2192
2214
2199
2233
2242
2252
2253
2052
2295
2301
2304
2201
2322
2209
2284
2347
2352
2369
2368
2377
2455
2544
2565
2548
2582
2657
2669
2674


In [14]:
# Create the dataframe 
post_intake_df = pd.DataFrame({
    "Chartnumber":chart_numbers,
    "First post-enrollment service": first_service_after_enrollment,
    "First post-intake service": first_service_after_intake,
    "Post-intake days": days_after_intake,
    "Days btn MHinfusion and Intake": days_btn_mhinfusion_intake,
    "Visits": visit_count,
    "MH infusions": mhInfusion_count, 
    "infusions outside Cambridge":infusion_count_outside_cambridge,
    "Visits outside Cambridge": visit_count_outside_cambridge,
    "Intake Facility City": intake_facility_city
})

In [15]:
post_intake_df.head()

Unnamed: 0,Chartnumber,First post-enrollment service,First post-intake service,Post-intake days,Days btn MHinfusion and Intake,Visits,MH infusions,infusions outside Cambridge,Visits outside Cambridge,Intake Facility City
0,1,Intake,TMS,14.0,,49,0,0,0,CAMBRIDGE
1,2,MH Infusion,,,,8,8,0,0,
2,9,Intake,TMS,9.0,66.0,68,30,0,0,CAMBRIDGE
3,6,Intake,MH Infusion,0.0,0.0,16,13,0,0,CAMBRIDGE
4,12,Intake,,,,1,0,0,0,CAMBRIDGE


In [16]:
post_intake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1559 entries, 0 to 1558
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Chartnumber                     1559 non-null   int64  
 1   First post-enrollment service   1559 non-null   object 
 2   First post-intake service       1137 non-null   object 
 3   Post-intake days                1137 non-null   float64
 4   Days btn MHinfusion and Intake  945 non-null    float64
 5   Visits                          1559 non-null   int64  
 6   MH infusions                    1559 non-null   int64  
 7   infusions outside Cambridge     1559 non-null   int64  
 8   Visits outside Cambridge        1559 non-null   int64  
 9   Intake Facility City            1343 non-null   object 
dtypes: float64(2), int64(5), object(3)
memory usage: 121.9+ KB


## Patient Segmentation 
Will segment the patients into four segments from the PLP data set; 

- before mid-2023 patients 
- late 2023 patients 
- 2024 patients 
- all-time patients 

## Data Analysis and Visualization

# Conclusion

**Challenges**

- Not all patients in the records started with an intake. Some had no intakes in the appointment records. Also, theere are some patients that had other services before intake for instance chart#s 24 and 118
- It is cumbersome to focus this analysis on cambridge since some patients visited cambridge and other facilities for instance chart# 2194. - Fix this by counting the number of visits and infusions outside the Cambridge facility.
-