# Import Packages

In [None]:
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
# Read the 'Data' worksheet of the raw workbook into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# it must exist on the machine running this notebook.
data = pd.read_excel(
    r'C:\Users\91709\Downloads\Work\UpWork\Pete\Raw Data.xlsx',
    sheet_name='Data',
)

# HBA1C

In [None]:
# Summarise HbA1c per patient at three points in time -- the first result, the
# first result at least 90 days after it, and the latest result -- together
# with the relative change between consecutive points. The result is `output`,
# one row per InternalID.

# Flag rows that have a result date but no HbA1c value. The column is cast to
# object first so that storing a string next to numbers is explicit rather
# than a silent dtype upcast (which recent pandas versions warn about).
missing_hba1c = data['Result Date'].notnull() & data['HbA1c'].isnull()
if missing_hba1c.any():
    data['HbA1c'] = data['HbA1c'].astype(object)
    data.loc[missing_hba1c, 'HbA1c'] = 'Not Available'

# Keep rows that have a result date; when a patient has several rows for the
# same date, keep the last one.
hbac = data.dropna(subset=['Result Date'])
hbac = hbac.drop_duplicates(subset=['InternalID', 'Result Date'], keep='last')

# Earliest and latest result date per patient. The string aggregations
# 'min'/'max' are used instead of the numpy callables, which pandas maps to
# the same group-wise operations but flags with a FutureWarning.
temp1 = (
    hbac.groupby('InternalID')
    .agg(Minimum_Date=('Result Date', 'min'), Maximum_Date=('Result Date', 'max'))
    .reset_index()
)

# Earliest date + 90 days = the threshold for the "after 3 months" visit.
# pd.to_timedelta replaces the deprecated pd.TimedeltaIndex(..., unit='D').
temp1['months_to_add'] = 90
temp1['3 months'] = temp1['Minimum_Date'] + pd.to_timedelta(temp1['months_to_add'], unit='D')

# First result on or after the threshold for every patient, computed with one
# merge + groupby instead of filtering the whole frame once per patient.
follow_ups = hbac[['InternalID', 'Result Date']].merge(
    temp1[['InternalID', '3 months']], how='inner', on='InternalID'
)
follow_ups = follow_ups[follow_ups['Result Date'] >= follow_ups['3 months']]
first_follow_up = (
    follow_ups.groupby('InternalID')['Result Date']
    .min()
    .rename('exact_3 months')
    .reset_index()
)
# Left merge keeps every patient; those without a qualifying result get NaT.
temp1 = temp1.merge(first_follow_up, how='left', on='InternalID')
temp1.loc[temp1['exact_3 months'].isnull(), 'treatment_duration'] = 'Less than 3 months'

hbac = hbac[['InternalID', 'Result Date', 'HbA1c']]

# Look up the HbA1c value recorded on each of the three dates. Renaming the
# lookup columns to match the left-hand key avoids the suffixed duplicate
# 'Result Date' columns that repeated merges would otherwise create.
output = temp1
for date_column, value_column in (
    ('Minimum_Date', 'HbA1c_at_start'),
    ('exact_3 months', 'HbA1c_after_3_months'),
    ('Maximum_Date', 'HbA1c_latest'),
):
    lookup = hbac.rename(columns={'Result Date': date_column, 'HbA1c': value_column})
    output = output.merge(lookup, how='left', on=['InternalID', date_column])

output = output[['InternalID', 'HbA1c_at_start', 'HbA1c_after_3_months', 'HbA1c_latest']]

# Relative change from the first result to the 3-month result. Only rows where
# both values are present and not the 'Not Available' sentinel are used.
# Despite the '%' in the column name, the value is a fraction (not multiplied
# by 100). .copy() makes the filtered frame independent before a column is
# added to it.
has_start_and_3_months = (
    output['HbA1c_at_start'].notnull()
    & output['HbA1c_after_3_months'].notnull()
    & (output['HbA1c_at_start'] != 'Not Available')
    & (output['HbA1c_after_3_months'] != 'Not Available')
)
percent_change1 = output.loc[has_start_and_3_months].copy()
percent_change1['%hba1c_change_after_first_3_months'] = (
    percent_change1['HbA1c_after_3_months'] - percent_change1['HbA1c_at_start']
) / percent_change1['HbA1c_at_start']
percent_change1 = percent_change1[['InternalID', '%hba1c_change_after_first_3_months']]

# Same calculation from the 3-month result to the latest result.
has_3_months_and_latest = (
    output['HbA1c_latest'].notnull()
    & output['HbA1c_after_3_months'].notnull()
    & (output['HbA1c_latest'] != 'Not Available')
    & (output['HbA1c_after_3_months'] != 'Not Available')
)
percent_change2 = output.loc[has_3_months_and_latest].copy()
percent_change2['%hba1c_change_from_3_months_till_end'] = (
    percent_change2['HbA1c_latest'] - percent_change2['HbA1c_after_3_months']
) / percent_change2['HbA1c_after_3_months']
percent_change2 = percent_change2[['InternalID', '%hba1c_change_from_3_months_till_end']]

# Attach both change columns; patients without a computable change get NaN.
output = output.merge(percent_change1, how='left', on='InternalID')
output = output.merge(percent_change2, how='left', on='InternalID')

# Weight

In [None]:
# Summarise weight per patient at three points in time -- the first
# observation, the first observation at least 90 days after it, and the latest
# observation -- together with the relative change between consecutive points.
# Uses 'ObservationDate' as the date and 'Weight (kg)' as the value. The result
# is `output2`, which is then joined with the HbA1c summary `output` (built in
# the HbA1c section of this file) into `hbcnweight`.

# Flag rows that have an observation date but no weight. The column is cast to
# object first so that storing a string next to numbers is explicit rather
# than a silent dtype upcast (which recent pandas versions warn about).
missing_weight = data['ObservationDate'].notnull() & data['Weight (kg)'].isnull()
if missing_weight.any():
    data['Weight (kg)'] = data['Weight (kg)'].astype(object)
    data.loc[missing_weight, 'Weight (kg)'] = 'Not Available'

# Keep rows that have an observation date; when a patient has several rows for
# the same date, keep the last one.
weight = data.dropna(subset=['ObservationDate'])
weight = weight.drop_duplicates(subset=['InternalID', 'ObservationDate'], keep='last')

# Earliest and latest observation date per patient. The string aggregations
# 'min'/'max' are used instead of the numpy callables, which pandas maps to
# the same group-wise operations but flags with a FutureWarning.
temp2 = (
    weight.groupby('InternalID')
    .agg(Minimum_Date=('ObservationDate', 'min'), Maximum_Date=('ObservationDate', 'max'))
    .reset_index()
)

# Earliest date + 90 days = the threshold for the "after 3 months" visit.
# pd.to_timedelta replaces the deprecated pd.TimedeltaIndex(..., unit='D').
temp2['months_to_add'] = 90
temp2['3 months'] = temp2['Minimum_Date'] + pd.to_timedelta(temp2['months_to_add'], unit='D')

# First observation on or after the threshold for every patient, computed with
# one merge + groupby instead of filtering the whole frame once per patient.
follow_ups = weight[['InternalID', 'ObservationDate']].merge(
    temp2[['InternalID', '3 months']], how='inner', on='InternalID'
)
follow_ups = follow_ups[follow_ups['ObservationDate'] >= follow_ups['3 months']]
first_follow_up = (
    follow_ups.groupby('InternalID')['ObservationDate']
    .min()
    .rename('exact_3 months')
    .reset_index()
)
# Left merge keeps every patient; those without a qualifying observation get NaT.
temp2 = temp2.merge(first_follow_up, how='left', on='InternalID')
# The mask must come from temp2 itself. Building it from the HbA1c summary
# frame flags the wrong patients, or fails outright when the two frames do not
# have the same rows.
temp2.loc[temp2['exact_3 months'].isnull(), 'treatment_duration'] = 'Less than 3 months'

weight = weight[['InternalID', 'ObservationDate', 'Weight (kg)']]

# Look up the weight recorded on each of the three dates. Renaming the lookup
# columns to match the left-hand key avoids the suffixed duplicate
# 'ObservationDate' columns that repeated merges would otherwise create.
output2 = temp2
for date_column, value_column in (
    ('Minimum_Date', 'Weight_at_start'),
    ('exact_3 months', 'Weight_after_3_months'),
    ('Maximum_Date', 'Weight_latest'),
):
    lookup = weight.rename(columns={'ObservationDate': date_column, 'Weight (kg)': value_column})
    output2 = output2.merge(lookup, how='left', on=['InternalID', date_column])

output2 = output2[['InternalID', 'Weight_at_start', 'Weight_after_3_months', 'Weight_latest']]

# Relative change from the first observation to the 3-month observation. Only
# rows where both values are present and not the 'Not Available' sentinel are
# used. Despite the '%' in the column name, the value is a fraction (not
# multiplied by 100). .copy() makes the filtered frame independent before a
# column is added to it.
has_start_and_3_months = (
    output2['Weight_at_start'].notnull()
    & output2['Weight_after_3_months'].notnull()
    & (output2['Weight_at_start'] != 'Not Available')
    & (output2['Weight_after_3_months'] != 'Not Available')
)
percent_change1 = output2.loc[has_start_and_3_months].copy()
percent_change1['%weight_change_after_first_3_months'] = (
    percent_change1['Weight_after_3_months'] - percent_change1['Weight_at_start']
) / percent_change1['Weight_at_start']
percent_change1 = percent_change1[['InternalID', '%weight_change_after_first_3_months']]

# Same calculation from the 3-month observation to the latest observation.
has_3_months_and_latest = (
    output2['Weight_latest'].notnull()
    & output2['Weight_after_3_months'].notnull()
    & (output2['Weight_latest'] != 'Not Available')
    & (output2['Weight_after_3_months'] != 'Not Available')
)
percent_change2 = output2.loc[has_3_months_and_latest].copy()
percent_change2['%weight_change_from_3_months_till_end'] = (
    percent_change2['Weight_latest'] - percent_change2['Weight_after_3_months']
) / percent_change2['Weight_after_3_months']
percent_change2 = percent_change2[['InternalID', '%weight_change_from_3_months_till_end']]

# Attach both change columns; patients without a computable change get NaN.
output2 = output2.merge(percent_change1, how='left', on='InternalID')
output2 = output2.merge(percent_change2, how='left', on='InternalID')

# Consolidated HbA1c + weight view. The outer join keeps patients that appear
# in only one of the two summaries.
hbcnweight = output.merge(output2, how='outer', on='InternalID')

# Dose Change

In [None]:
# Count, per patient, how many rows fall into each 'Dose change type' and join
# those counts onto the consolidated HbA1c/weight table `hbcnweight`.

# Keep only the patient id and the dose-change label, dropping unlabeled rows.
dose_change = data[['InternalID', 'Dose change type']]
dose_change = dose_change.dropna(subset=['Dose change type'])

# One indicator column per dose-change label. `columns=` restricts the encoding
# to the label column; without it, get_dummies would also one-hot encode
# 'InternalID' whenever the ids are stored as strings, removing the key the
# groupby below relies on.
dose_change = pd.get_dummies(dose_change, columns=['Dose change type'])

# Strip the 'Dose change type_' prefix from the known labels. Any label not
# listed here keeps its prefixed column name.
dose_change = dose_change.rename(
    columns={
        'Dose change type_Ceased': 'Ceased',
        'Dose change type_Decreased': 'Decreased',
        'Dose change type_Increased': 'Increased',
        'Dose change type_Started': 'Started',
        'Dose change type_Unchanged': 'Unchanged',
    }
)

# Summing the indicators per patient yields the number of rows of each type.
output3 = dose_change.groupby(['InternalID']).sum()
output3 = output3.reset_index()

# Outer join so patients present in only one of the two tables are kept.
combined = hbcnweight.merge(output3, how='outer', on='InternalID')

# Save output as excel

In [None]:
# Write the consolidated table to 'Result.xlsx' (relative to the current
# working directory), omitting the DataFrame index from the sheet.
combined.to_excel(
    'Result.xlsx',
    index=False,
)