**Load the required data and libraries**

In [1]:
# Load the required libraries 
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns
import warnings 

# NOTE(review): with no category or module filter this silences every warning
# for the whole session, including any pandas warnings raised while the CSV
# exports are parsed below.
warnings.filterwarnings("ignore")

In [2]:
# Load the required data
# All four exports are read with the same parsing options (UTF-16 text,
# tab-separated), so the options are declared once and reused for each file.
csv_options = {"encoding": "UTF-16", "delimiter": "\t"}

appointment_df = pd.read_csv("../data/Appointments List.csv", **csv_options)
demographics_df = pd.read_csv("../data/Demographic Data.csv", **csv_options)
care_team_df = pd.read_csv("../data/EHRFV Care Team.csv", **csv_options)
patients_profile_df = pd.read_csv(
    "../data/PLP Patient Lifetime Profile.csv", **csv_options
)

In [3]:
# Optional dependency: only needed by the commented-out encoding probe in the
# next cell, so it is left disabled.
# import chardet

In [4]:
# Disabled encoding probe: chardet guesses the file encoding from the first
# 100 bytes of the appointments export and prints the detected name.
# NOTE(review): the path here is "data/..." while the load cell reads from
# "../data/..." — adjust it (and enable the chardet import) before re-running.
# with open("data/Appointments List.csv", "rb") as f:
#     result = chardet.detect(f.read(100)) # read a portion of the file 
#     print(result["encoding"])

# Exploratory Data Analysis 
## Data Assessment 

**Appointments data**

In [5]:
# Preview the first five rows of the appointments export.
# NOTE(review): in the saved output row 0 is a "Grand Total" summary line rather
# than an appointment, and row 1 looks like a system placeholder (01/01/00,
# created by "system") — presumably both should be dropped during wrangling; confirm.
appointment_df.head()

Unnamed: 0,Appointment UID,Appt Status Description,Date Of Service,Chartnumber,NurseName,FirstTreatmentApptInd,Appt Type Group,Appt Type,Created By,Provider,Facility City,Room,Column Heading,#,Count of tblAppointments,Patient FID
0,Grand Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,55975,*
1,1,Made,01/01/00,,,N,Default,,system,",",,Other,,1,1,1
2,109,Seen,01/22/18,1,,N,Intake,TMS INTAKE DR. BRENNER,EMILY,"BRENNER,DANIEL A",CAMBRIDGE,Other,DR. BRENNER,2,1,5283
3,239,Seen,01/22/18,2,,N,MH Infusion,KETAMINE MH -INFUSION ONLY,EMILY,"BRENNER,DANIEL A",CAMBRIDGE,Other,2C,3,1,5286
4,240,Made,01/22/18,9,,N,Intake,TMS INTAKE DR. BRENNER,EMILY,"BRENNER,DANIEL A",CAMBRIDGE,Other,DR. BRENNER,4,1,5351


In [6]:
# Column names, non-null counts and dtypes for the appointments data.
# NOTE(review): every column visible in the saved output is `object`, including
# "Date Of Service" and "#", so dates and counts are not yet parsed at this point.
appointment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55976 entries, 0 to 55975
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Appointment UID           55976 non-null  object
 1   Appt Status Description   55976 non-null  object
 2   Date Of Service           37250 non-null  object
 3   Chartnumber               55975 non-null  object
 4   NurseName                 19321 non-null  object
 5   FirstTreatmentApptInd     54678 non-null  object
 6   Appt Type Group           54678 non-null  object
 7   Appt Type                 54677 non-null  object
 8   Created By                55976 non-null  object
 9   Provider                  55976 non-null  object
 10  Facility City             51738 non-null  object
 11  Room                      55976 non-null  object
 12  Column Heading            54697 non-null  object
 13  #                         55976 non-null  object
 14  Count of tblAppointmen

In [7]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
appointment_df.duplicated().sum()

0

In [8]:
# Inspect the "#" column. The saved output shows why it is `object`: the first
# value is the literal "Total" and larger values carry thousands separators
# (e.g. "55,971").
appointment_df["#"]

0         Total
1             1
2             2
3             3
4             4
          ...  
55971    55,971
55972    55,972
55973    55,973
55974    55,974
55975    55,975
Name: #, Length: 55976, dtype: object

In [9]:
# Appointment counts per facility city. value_counts() omits missing values by
# default, so NaN is not listed; the saved output also shows a single "Total"
# entry, which matches the "Grand Total" summary row seen in the preview.
appointment_df["Facility City"].value_counts()

Facility City
CAMBRIDGE    48349
AMHERST       2457
BEVERLY        931
Total            1
Name: count, dtype: int64

In [10]:
# Distinct values of "Facility City"; unlike value_counts(), this includes NaN.
appointment_df["Facility City"].unique()

array(['Total', nan, 'CAMBRIDGE', 'AMHERST', 'BEVERLY'], dtype=object)

**Demographics**

In [11]:
# Preview the first five rows of the demographics export.
# NOTE(review): the saved output shows a blank-looking first row, zip codes that
# appear to have lost their leading zero (e.g. 2139.0), and patient street
# addresses — clear or redact this output before sharing the notebook.
demographics_df.head()

Unnamed: 0,Chart#,Ethnicity,Race,Gender,Address,City,State,Zipcode,Intake Education Degree,Intake Employment Current Employer,...,Intake Employment Retired,Intake Employment Student,Intake Employment Unemployed,Intake Employment Working,Intake Family Divorced,Intake Family Married,Intake Family Partnered,Intake Family Single,Intake Family Widowed,Unnamed: 21
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,,,F,253 NORFOLK STREET,CAMBRIDGE,MA,2139.0,,,...,,,,,,,,,,
2,2.0,,,F,48 UNION PARK,BOSTON,MA,2118.0,,,...,,,,,,,,,,
3,3.0,,,M,164 PALMER STREET,ARLINGTON,MA,2474.0,,,...,,,,,,,,,,
4,4.0,,,F,110 HARTWELL ROAD,BEDFORD,MA,1730.0,,,...,,,,,,,,,,


In [12]:
# Column names, non-null counts and dtypes for the demographics data.
# NOTE(review): "Chart#" is float64 here, while the care-team data has "Chart#"
# as int64 and the appointments data stores "Chartnumber" as object — these
# columns presumably need a common dtype before any merge; confirm.
demographics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2686 entries, 0 to 2685
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Chart#                              2685 non-null   float64
 1   Ethnicity                           2686 non-null   object 
 2   Race                                2686 non-null   object 
 3   Gender                              2685 non-null   object 
 4   Address                             2666 non-null   object 
 5   City                                2663 non-null   object 
 6   State                               2661 non-null   object 
 7   Zipcode                             2665 non-null   object 
 8   Intake Education Degree             2684 non-null   object 
 9   Intake Employment Current Employer  2580 non-null   object 
 10  Intake Employment Disabled          2412 non-null   object 
 11  Intake Employment Occupation        2632 no

In [13]:
# Count rows in the demographics data that exactly duplicate an earlier row.
demographics_df.duplicated().sum()

0

In [14]:
# Frequency of each "Intake Education Degree" entry. In the saved output the most
# common value (2198 rows) renders as blank and there are 274 distinct entries
# with inconsistent spellings, so the column appears to be free text that will
# need normalising before use.
demographics_df["Intake Education Degree"].value_counts()

Intake Education Degree
                              2198
Masters                         15
BA                              13
PhD                             11
Some college                    10
                              ... 
Some undergraduate               1
4 years college                  1
High School (so far)             1
Doctorare (PhD, chemistry)       1
College - BA                     1
Name: count, Length: 274, dtype: int64

**Patients Profile**

In [15]:
# Preview the first five rows of the patient lifetime profile export.
# NOTE(review): in the saved output row 0 is a "Grand Total" summary line, and the
# money columns appear as formatted strings (e.g. "$6,147.36") rather than numbers.
patients_profile_df.head()

Unnamed: 0,Patient FID,ChartNumber,ZipCode (tmtPLPPatientLifeTimeProfile),CreatedDate,First Date Of Service,Last Date Of Service,Last Primary Appt Type Group,Last Facility City,Days Since Last DoS dim,Next Scheduled Appt Date,...,Total Insurance Payments,Total Patient Payments,Total Writeoff Amount,Charge Code Count,Diagnosis Code Count,Vists / Months Span,OutstandingAR,Count of tmtPLPPatientLifeTimeProfile,SeenProvidersCount,PatientsWithVisit%
0,Grand Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,...,"$1,240,855.95","$14,861,653.28","$995,613.48",8024,3473,6599.64,"$1,388,296.81",1602,0,99.94%
1,5283,1,02139,01/22/18,01/22/18,05/29/18,TMS,CAMBRIDGE,2366,,...,"$6,147.36","$3,464.68","$6,187.96",4,1,12.75,"$6,147.36",1,0,100.00%
2,5286,2,02118,01/23/18,01/16/18,06/27/18,MH Infusion,CAMBRIDGE,2337,,...,,"$4,450.00",,2,1,2.6,$0.00,1,0,100.00%
3,5287,3,02474,01/23/18,01/29/18,02/13/18,MH Infusion,CAMBRIDGE,2471,,...,,"$3,400.00",,2,1,7.0,$0.00,1,0,100.00%
4,5347,5,02127,01/23/18,01/25/18,03/27/18,MH Infusion,CAMBRIDGE,2429,,...,,"$3,900.00",,2,1,4.5,$0.00,1,0,100.00%


In [16]:
# Column names, non-null counts and dtypes for the patient lifetime profile data.
# In the saved output "Next Scheduled Appt Date" is populated for only 144 of the
# 1603 rows.
patients_profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1603 entries, 0 to 1602
Data columns (total 29 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   Patient FID                             1603 non-null   object
 1   ChartNumber                             1603 non-null   object
 2   ZipCode (tmtPLPPatientLifeTimeProfile)  1603 non-null   object
 3   CreatedDate                             1603 non-null   object
 4   First Date Of Service                   1603 non-null   object
 5   Last Date Of Service                    1603 non-null   object
 6   Last Primary Appt Type Group            1501 non-null   object
 7   Last Facility City                      1469 non-null   object
 8   Days Since Last DoS dim                 1603 non-null   object
 9   Next Scheduled Appt Date                144 non-null    object
 10  Financial Class                         1603 non-null   object
 11  Seen

**Care team**

In [17]:
# Preview the first five rows of the care-team export.
# NOTE(review): the saved output contains contact names and phone numbers — clear
# or redact it before sharing the notebook.
care_team_df.head()

Unnamed: 0,Day of Service Date,Chart#,Contact Type,Contact Name,Contact Phone,Contact Email,Unnamed: 6
0,11/15/24,1770,Intake_PCPName,"Nicole Green, PA-C",,,
1,11/12/24,1470,Intake_PCPName,Karen Tenner 617-629-6260,,,
2,11/12/24,2654,Intake_PCPName,,0.0,,
3,11/12/24,2654,Intake_PsychiatristName,,0.0,,
4,11/12/24,2654,Intake_TherapistName,Margot Shinnick,4134931995.0,,


In [18]:
# Column names, non-null counts and dtypes for the care-team data.
# In the saved output "Contact Email" and "Unnamed: 6" have 0 non-null values,
# i.e. both columns are entirely empty.
care_team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Day of Service Date  1503 non-null   object 
 1   Chart#               1503 non-null   int64  
 2   Contact Type         1503 non-null   object 
 3   Contact Name         1247 non-null   object 
 4   Contact Phone        1326 non-null   object 
 5   Contact Email        0 non-null      float64
 6   Unnamed: 6           0 non-null      float64
dtypes: float64(2), int64(1), object(4)
memory usage: 82.3+ KB


In [19]:
# TODO: "to be deleted" presumably refers to dropping the empty "Unnamed: 6"
# column during wrangling — confirm, then remove this commented-out check.
# care_team_df["Unnamed: 6"].unique() # to be deleted

## Data Wrangling 

## Charts and Graphs 