# Import Packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this may also hide pandas SettingWithCopyWarning / FutureWarning messages
# from the cells below - consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Read the 'Data' sheet of the master workbook into a DataFrame.
workbook_path = r'C:\Users\91709\Downloads\Work\UpWork\Pete\Data - Master - 210915.xlsx'
data = pd.read_excel(workbook_path, sheet_name='Data')

In [3]:
# The first row of the sheet holds the real column labels: promote it to the header
# and keep the remaining rows as data.
# NOTE: not idempotent - running this cell twice without reloading `data` consumes another row.
new_header = data.iloc[0]
data = data.iloc[1:]
data.columns = new_header

# Personal Details

In [4]:
# Demographics table: keep the three identifying columns and collapse exact duplicate rows
# (the last occurrence of each duplicate is the one retained).
personal = (
    data.loc[:, ['InternalID', 'Sex', 'DOB']]
        .drop_duplicates(keep='last')
)

# age() parses the date of birth from text, so stringify the column first.
personal = personal.assign(DOB=personal['DOB'].astype(str))
def age(born): #Extract Age from DOB
    """Return the age in completed years for a date of birth given as text.

    Parameters
    ----------
    born : str
        Date of birth formatted as ``YYYY-MM-DD HH:MM:SS`` or ``YYYY-MM-DD``.
        The missing-value markers ``''``, ``'NaT'``, ``'nan'``, ``'None'`` and
        ``'<NA>'`` (what ``astype(str)`` produces for empty cells) are accepted.

    Returns
    -------
    int or None
        Whole years between ``born`` and today's date, or ``None`` when the
        date of birth is missing.

    Raises
    ------
    ValueError
        If ``born`` is not a missing-value marker and matches neither format.
    """
    # Imported locally under private names so the function keeps working even if the
    # notebook-level name `date` is rebound to something else by another cell.
    from datetime import date as _date, datetime as _datetime

    text = str(born).strip()
    if text in ('', 'NaT', 'nan', 'None', '<NA>'):
        return None

    # Accept the timestamp form first, then the date-only form.
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            born_on = _datetime.strptime(text, fmt).date()
            break
        except ValueError:
            continue
    else:
        raise ValueError("Unrecognised date of birth: %r" % (born,))

    today = _date.today()
    # Subtract one year when this year's birthday has not happened yet.
    birthday_pending = (today.month, today.day) < (born_on.month, born_on.day)
    return today.year - born_on.year - birthday_pending
  
# Current age per patient, derived from the text DOB.
personal = personal.assign(Age=personal['DOB'].map(age))

# Turn DOB back from text into plain date objects for the output template.
personal = personal.assign(DOB=pd.to_datetime(personal['DOB']).dt.date)

# Weight

In [5]:
# Appointment rows only: drop rows without an appointment date and keep a single row
# (the last one) per patient and appointment date.
weight = (
    data.dropna(subset=['Appt Date'])
        .drop_duplicates(subset=['InternalID', 'Appt Date'], keep='last')
)

# Number of distinct appointment dates per patient, named as in the output template.
app_count = (
    weight[['InternalID', 'Appt Date']]
    .groupby(['InternalID'])
    .count()
    .reset_index()
    .rename(columns={'Appt Date': '# of Appts'})
)

# Outer merge so patients without any appointment are kept as well.
personalnweightcount = personal.merge(app_count, how='outer', on='InternalID')

# Earliest and latest appointment date per patient.
# The aggregations are named by string ('min'/'max'): passing np.min/np.max to agg()
# triggers a FutureWarning in recent pandas versions.
temp2 = (
    weight.groupby(['InternalID'])
          .agg(Minimum_Date=('Appt Date', 'min'), Maximum_Date=('Appt Date', 'max'))
          .reset_index()
)

# Follow-up cut-off: 90 days after the first appointment. The helper column is called
# 'months_to_add' but holds a number of DAYS.
temp2['months_to_add'] = 90
# pd.to_timedelta replaces pd.TimedeltaIndex(..., unit='D'), whose `unit` keyword is deprecated.
temp2['3 months'] = temp2['Minimum_Date'] + pd.to_timedelta(temp2['months_to_add'], unit='D')

# For every patient find the first appointment that falls on or after the 90-day cut-off.
three_month_list = temp2['3 months'].to_list() #Cut-off date per patient
patient_list = temp2['InternalID'].to_list() #Patient IDs, same order as the cut-offs
three_month_exact_date = [] #Result: one date (or a missing value) per patient
for patient_id, cutoff in zip(patient_list, three_month_list):
    patient_visits = weight.loc[weight['InternalID'] == patient_id, 'Appt Date']
    # min() over an empty selection yields a missing value, i.e. "no follow-up appointment".
    # The result is deliberately NOT stored in a variable called `date`: that would overwrite
    # the `date` class imported from `datetime` for the whole notebook.
    first_followup = patient_visits[patient_visits >= cutoff].min()
    three_month_exact_date.append(first_followup)
temp2['>=3 Month Appt Date'] = three_month_exact_date #Add the list to column

# Narrow the appointment rows to the columns needed for the weight look-ups.
weight = weight.loc[:, ['InternalID', 'Appt Date', 'Weight (kg)']]

# Coerce the appointment date to datetime; unparseable values become NaT.
weight = weight.assign(**{'Appt Date': pd.to_datetime(weight['Appt Date'], errors='coerce')})

# Look up the weight recorded at three appointments per patient - the first, the first one
# on/after the 90-day cut-off and the last - renaming the columns as per the template.
weight_keys = ["InternalID", "Appt Date"]
output2 = (
    temp2
    .merge(weight, how='left', left_on=["InternalID", "Minimum_Date"], right_on=weight_keys)
    .rename(columns={'Minimum_Date': '1st Appt Date', 'Weight (kg)': '1st Weight (Kg)'})
    .merge(weight, how='left', left_on=["InternalID", ">=3 Month Appt Date"], right_on=weight_keys)
    .rename(columns={'Weight (kg)': '>=3 Month Weight'})
    .merge(weight, how='left', left_on=["InternalID", "Maximum_Date"], right_on=weight_keys)
    .rename(columns={'Maximum_Date': 'Last Appt Date', 'Weight (kg)': 'Last  Weight (Kg)'})
)

# Keep only the template columns; the helper and merge-key columns are dropped here.
output2 = output2[[
    'InternalID',
    '1st Appt Date', '1st Weight (Kg)',
    '>=3 Month Appt Date', '>=3 Month Weight',
    'Last Appt Date', 'Last  Weight (Kg)',
]]

#Get absolute and % change between first & first appointment after 3 months and first and last appointment
# A missing weight on either side propagates as NaN through the arithmetic, so the result is
# blank exactly when one of the two weights is missing - no explicit null masks are required.
# (The previous masks combined the not-null tests with `|` where `&` was intended.)
output2 = output2.copy()  # independent frame, so the new columns are not written into a selection
baseline_weight = output2['1st Weight (Kg)']
for later_col, abs_col, pct_col in [
    ('>=3 Month Weight',
     '1st - >=3 Month Appt Change In Weight (Kg)',
     '1st - >=3 Month Appt Change In Weight (%)'),
    ('Last  Weight (Kg)',
     '1st - Last Appt Change in Weight (Kg)',
     '1st - Last Appt Change in Weight (%)'),
]:
    output2[abs_col] = output2[later_col] - baseline_weight
    # Stored as a fraction of the first weight (0.05 means 5 %), not multiplied by 100.
    output2[pct_col] = output2[abs_col] / baseline_weight

# Attach the weight summary to the demographics / appointment-count table.
personalnweight = pd.merge(personalnweightcount, output2, how='outer', on='InternalID')

#Format date as per templates
for date_col in ('1st Appt Date', '>=3 Month Appt Date', 'Last Appt Date'):
    personalnweight[date_col] = personalnweight[date_col].dt.date

# Empty placeholder column for free-text notes.
personalnweight['Weight Notes'] = None

# Diagnostic Result

In [6]:
# Lab-result rows only: drop rows without a result date and keep a single row
# (the last one) per patient and result date.
hbac = (
    data.dropna(subset=['Results Date'])
        .drop_duplicates(subset=['InternalID', 'Results Date'], keep='last')
)

# Number of distinct result dates per patient, named as in the output template.
res_count = (
    hbac[['InternalID', 'Results Date']]
    .groupby(['InternalID'])
    .count()
    .reset_index()
    .rename(columns={'Results Date': '# of Results'})
)

# Outer merge so patients without any lab result are kept as well.
aboveallnresultcount = personalnweight.merge(res_count, how='outer', on='InternalID')

# Earliest and latest result date per patient.
# The aggregations are named by string ('min'/'max'): passing np.min/np.max to agg()
# triggers a FutureWarning in recent pandas versions.
temp1 = (
    hbac.groupby(['InternalID'])
        .agg(Minimum_Date=('Results Date', 'min'), Maximum_Date=('Results Date', 'max'))
        .reset_index()
)

# Follow-up cut-off: 90 days after the first result. The helper column is called
# 'months_to_add' but holds a number of DAYS.
temp1['months_to_add'] = 90
# pd.to_timedelta replaces pd.TimedeltaIndex(..., unit='D'), whose `unit` keyword is deprecated.
temp1['3 months'] = temp1['Minimum_Date'] + pd.to_timedelta(temp1['months_to_add'], unit='D')

# For every patient find the first result date that falls on or after the 90-day cut-off.
three_month_list = temp1['3 months'].to_list() #Cut-off date per patient
patient_list = temp1['InternalID'].to_list() #Patient IDs, same order as the cut-offs
three_month_exact_date = [] #Result: one date (or a missing value) per patient
for patient_id, cutoff in zip(patient_list, three_month_list):
    patient_results = hbac.loc[hbac['InternalID'] == patient_id, 'Results Date']
    # min() over an empty selection yields a missing value, i.e. "no follow-up result".
    # The result is deliberately NOT stored in a variable called `date`: that would overwrite
    # the `date` class imported from `datetime` for the whole notebook.
    first_followup = patient_results[patient_results >= cutoff].min()
    three_month_exact_date.append(first_followup)
temp1['>=3 Month Date'] = three_month_exact_date #First result date on/after the cut-off

# Narrow the lab-result rows to the columns needed for the look-ups below.
lab_columns = [
    'InternalID', 'Sex', 'Results Date',
    'HbA1c', 'Glucose', 'Triglycerides',
    'HDL Cholesterol', 'LDL Cholesterol', 'Total Cholesterol', 'Non-HDL Cholesterol',
    'GGT', 'ALT', 'AST',
]
hbac = hbac.loc[:, lab_columns]

# Coerce the result date to datetime; unparseable values become NaT.
hbac = hbac.assign(**{'Results Date': pd.to_datetime(hbac['Results Date'], errors='coerce')})

#Merge the detail database to get the lab values recorded on each patient's FIRST result date
#and classify them (HbA1c as Diabetic / Pre-Diabetic / Non-Diabetic, the others into numbered categories)
output = temp1.merge(hbac, how='left', left_on=["InternalID", "Minimum_Date"],
                     right_on=["InternalID", "Results Date"])
output = output.rename(columns={
    'Minimum_Date': 'First Result Date',
    'HbA1c': 'First HbA1c Value',
    'Glucose': 'First Glucose Value',
    'Triglycerides': 'First Triglycerides Value',
    'HDL Cholesterol': 'First HDL Value',
    'LDL Cholesterol': 'First LDL Value',
    'Total Cholesterol': 'First Cholesterol Value',
    'Non-HDL Cholesterol': 'First Non-HDL Value',
    'GGT': 'First GGT Value',
    'ALT': 'First ALT Value',
    'AST': 'First AST Value',
})

# Rows matching none of the rules below (e.g. a missing value) keep an empty category.

# HbA1c bands.
# NOTE(review): values strictly between 5.6 and 5.7, or between 6.4 and 6.5, match no rule
# and stay uncategorised - confirm the data never carries more than one decimal.
first_hba1c = output['First HbA1c Value']
output.loc[first_hba1c <= 5.6, 'First HbA1c Category'] = 'Non-Diabetic'
output.loc[(first_hba1c >= 5.7) & (first_hba1c <= 6.4), 'First HbA1c Category'] = 'Pre-Diabetic'
output.loc[first_hba1c >= 6.5, 'First HbA1c Category'] = 'Diabetic'

# Glucose bands.
# NOTE(review): values strictly between 11 and 11.1 match no rule and stay uncategorised.
first_glucose = output['First Glucose Value']
output.loc[first_glucose <= 5.5, 'First Glucose Category'] = 'First Category'
output.loc[(first_glucose > 5.5) & (first_glucose < 7), 'First Glucose Category'] = 'Second Category'
output.loc[(first_glucose >= 7) & (first_glucose <= 11), 'First Glucose Category'] = 'Third Category'
output.loc[first_glucose >= 11.1, 'First Glucose Category'] = 'Fourth Category'

# Triglycerides: single cut-off.
output.loc[output['First Triglycerides Value'] < 1.7, 'First Triglycerides Category'] = 'First Category'
output.loc[output['First Triglycerides Value'] >= 1.7, 'First Triglycerides Category'] = 'Second Category'

# HDL: the cut-off depends on sex; rows with any other 'Sex' value stay uncategorised.
for sex_label, hdl_cutoff in (('Male', 1), ('Female', 1.3)):
    is_sex = output['Sex'] == sex_label
    output.loc[(output['First HDL Value'] < hdl_cutoff) & is_sex, 'First HDL Category'] = 'First Category'
    output.loc[(output['First HDL Value'] >= hdl_cutoff) & is_sex, 'First HDL Category'] = 'Second Category'

# Remaining markers: below the cut-off -> 'First Category', at or above it -> 'Second Category'.
for value_col, category_col, cutoff in [
    ('First LDL Value', 'First LDL Category', 2),
    ('First Cholesterol Value', 'First Cholesterol Category', 4),
    ('First Non-HDL Value', 'First Non-HDL Category', 2.5),
]:
    output.loc[output[value_col] < cutoff, category_col] = 'First Category'
    output.loc[output[value_col] >= cutoff, category_col] = 'Second Category'

# ALT:AST ratio. A missing ALT or AST propagates as NaN, so no null masks are needed
# (the previous guard combined the not-null tests with `|` where `&` was intended).
output['1st ALT:AST'] = output['First ALT Value'] / output['First AST Value']

# Look up the lab values recorded on the first result date on/after the 90-day cut-off
# and classify them with the same rules as the first result.
output = output.merge(hbac, how='left', left_on=["InternalID", ">=3 Month Date"],
                      right_on=["InternalID", "Results Date"])
output = output.rename(columns={
    'HbA1c': '>=3 Month Hba1c',
    'Glucose': '>=3 Month Glucose',
    'Triglycerides': '>=3 Month Triglycerides',
    'HDL Cholesterol': '>=3 Month HDL',
    'LDL Cholesterol': '>=3 Month LDL',
    'Total Cholesterol': '>=3 Month Cholesterol',
    'Non-HDL Cholesterol': '>=3 Month Non-HDL',
    'GGT': '>=3 Month GGT',
    'ALT': '>=3 Month ALT',
    'AST': '>=3 Month AST',
})

# Rows matching none of the rules below (e.g. a missing value) keep an empty category.

# HbA1c bands.
# NOTE(review): values strictly between 5.6 and 5.7, or between 6.4 and 6.5, match no rule
# and stay uncategorised - confirm the data never carries more than one decimal.
followup_hba1c = output['>=3 Month Hba1c']
output.loc[followup_hba1c <= 5.6, '>=3 Month Hba1c Category'] = 'Non-Diabetic'
output.loc[(followup_hba1c >= 5.7) & (followup_hba1c <= 6.4), '>=3 Month Hba1c Category'] = 'Pre-Diabetic'
output.loc[followup_hba1c >= 6.5, '>=3 Month Hba1c Category'] = 'Diabetic'

# Glucose bands.
# NOTE(review): values strictly between 11 and 11.1 match no rule and stay uncategorised.
followup_glucose = output['>=3 Month Glucose']
output.loc[followup_glucose <= 5.5, '>=3 Glucose Category'] = 'First Category'
output.loc[(followup_glucose > 5.5) & (followup_glucose < 7), '>=3 Glucose Category'] = 'Second Category'
output.loc[(followup_glucose >= 7) & (followup_glucose <= 11), '>=3 Glucose Category'] = 'Third Category'
output.loc[followup_glucose >= 11.1, '>=3 Glucose Category'] = 'Fourth Category'

# Triglycerides: single cut-off.
output.loc[output['>=3 Month Triglycerides'] < 1.7, '>=3 Triglycerides Category'] = 'First Category'
output.loc[output['>=3 Month Triglycerides'] >= 1.7, '>=3 Triglycerides Category'] = 'Second Category'

# HDL: the cut-off depends on sex. The sex is read from 'Sex_x'.
# NOTE(review): presumably 'Sex_x' is the left-hand 'Sex' column suffixed by this merge
# because both frames carry a 'Sex' column - verify against the merged frame.
for sex_label, hdl_cutoff in (('Male', 1), ('Female', 1.3)):
    is_sex = output['Sex_x'] == sex_label
    output.loc[(output['>=3 Month HDL'] < hdl_cutoff) & is_sex, '>=3 HDL Category'] = 'First Category'
    output.loc[(output['>=3 Month HDL'] >= hdl_cutoff) & is_sex, '>=3 HDL Category'] = 'Second Category'

# Remaining markers: below the cut-off -> 'First Category', at or above it -> 'Second Category'.
for value_col, category_col, cutoff in [
    ('>=3 Month LDL', '>=3 LDL Category', 2),
    ('>=3 Month Cholesterol', '>=3 Cholesterol Category', 4),
    ('>=3 Month Non-HDL', '>=3 Non-HDL Category', 2.5),
]:
    output.loc[output[value_col] < cutoff, category_col] = 'First Category'
    output.loc[output[value_col] >= cutoff, category_col] = 'Second Category'

# ALT:AST ratio. A missing ALT or AST propagates as NaN, so no null masks are needed
# (the previous guard combined the not-null tests with `|` where `&` was intended).
output['>=3 Month ALT:AST'] = output['>=3 Month ALT'] / output['>=3 Month AST']

# Look up the lab values recorded on each patient's LAST result date
# and classify them with the same rules as the first result.
output = output.merge(hbac, how='left', left_on=["InternalID", "Maximum_Date"],
                      right_on=["InternalID", "Results Date"])
output = output.rename(columns={
    'Maximum_Date': 'Last Date',
    'HbA1c': 'Last Hba1c',
    'Glucose': 'Last Glucose',
    'Triglycerides': 'Last Triglycerides',
    'HDL Cholesterol': 'Last HDL',
    'LDL Cholesterol': 'Last LDL',
    'Total Cholesterol': 'Last Cholesterol',
    'Non-HDL Cholesterol': 'Last Non-HDL',
    'GGT': 'Last GGT',
    'ALT': 'Last ALT',
    'AST': 'Last AST',
})

# Rows matching none of the rules below (e.g. a missing value) keep an empty category.

# HbA1c bands.
# NOTE(review): values strictly between 5.6 and 5.7, or between 6.4 and 6.5, match no rule
# and stay uncategorised - confirm the data never carries more than one decimal.
last_hba1c = output['Last Hba1c']
output.loc[last_hba1c <= 5.6, 'Last HbA1c Category'] = 'Non-Diabetic'
output.loc[(last_hba1c >= 5.7) & (last_hba1c <= 6.4), 'Last HbA1c Category'] = 'Pre-Diabetic'
output.loc[last_hba1c >= 6.5, 'Last HbA1c Category'] = 'Diabetic'

# Glucose bands.
# NOTE(review): values strictly between 11 and 11.1 match no rule and stay uncategorised.
last_glucose = output['Last Glucose']
output.loc[last_glucose <= 5.5, 'Last Glucose Category'] = 'First Category'
output.loc[(last_glucose > 5.5) & (last_glucose < 7), 'Last Glucose Category'] = 'Second Category'
output.loc[(last_glucose >= 7) & (last_glucose <= 11), 'Last Glucose Category'] = 'Third Category'
output.loc[last_glucose >= 11.1, 'Last Glucose Category'] = 'Fourth Category'

# Triglycerides: single cut-off.
output.loc[output['Last Triglycerides'] < 1.7, 'Last Triglycerides Category'] = 'First Category'
output.loc[output['Last Triglycerides'] >= 1.7, 'Last Triglycerides Category'] = 'Second Category'

# HDL: the cut-off depends on sex. The sex is read from the un-suffixed 'Sex' column.
# NOTE(review): presumably that is the 'Sex' column brought in by this merge - verify.
for sex_label, hdl_cutoff in (('Male', 1), ('Female', 1.3)):
    is_sex = output['Sex'] == sex_label
    output.loc[(output['Last HDL'] < hdl_cutoff) & is_sex, 'Last HDL Category'] = 'First Category'
    output.loc[(output['Last HDL'] >= hdl_cutoff) & is_sex, 'Last HDL Category'] = 'Second Category'

# Remaining markers: below the cut-off -> 'First Category', at or above it -> 'Second Category'.
# The LDL cut-off is 2 on both sides: the upper rule previously used 2.2, which left values
# in [2, 2.2) without a category and disagreed with the first / 3-month LDL rules.
for value_col, category_col, cutoff in [
    ('Last LDL', 'Last LDL Category', 2),
    ('Last Cholesterol', 'Last Cholesterol Category', 4),
    ('Last Non-HDL', 'Last Non-HDL Category', 2.5),
]:
    output.loc[output[value_col] < cutoff, category_col] = 'First Category'
    output.loc[output[value_col] >= cutoff, category_col] = 'Second Category'

# ALT:AST ratio. A missing ALT or AST propagates as NaN, so no null masks are needed
# (the previous guard combined the not-null tests with `|` where `&` was intended).
output['Last ALT:AST'] = output['Last ALT'] / output['Last AST']

# Keep only the template columns, in template order: patient id, then the first-result,
# 3-month and last-result groups. Merge keys and suffixed helper columns are dropped here.
first_result_columns = [
    'First Result Date',
    'First HbA1c Value', 'First HbA1c Category',
    'First Glucose Value', 'First Glucose Category',
    'First Triglycerides Value', 'First Triglycerides Category',
    'First HDL Value', 'First HDL Category',
    'First LDL Value', 'First LDL Category',
    'First Cholesterol Value', 'First Cholesterol Category',
    'First Non-HDL Value', 'First Non-HDL Category',
    'First GGT Value', 'First ALT Value', 'First AST Value', '1st ALT:AST',
]
three_month_columns = [
    '>=3 Month Date',
    '>=3 Month Hba1c', '>=3 Month Hba1c Category',
    '>=3 Month Glucose', '>=3 Glucose Category',
    '>=3 Month Triglycerides', '>=3 Triglycerides Category',
    '>=3 Month HDL', '>=3 HDL Category',
    '>=3 Month LDL', '>=3 LDL Category',
    '>=3 Month Cholesterol', '>=3 Cholesterol Category',
    '>=3 Month Non-HDL', '>=3 Non-HDL Category',
    '>=3 Month GGT', '>=3 Month ALT', '>=3 Month AST', '>=3 Month ALT:AST',
]
last_result_columns = [
    'Last Date',
    'Last Hba1c', 'Last HbA1c Category',
    'Last Glucose', 'Last Glucose Category',
    'Last Triglycerides', 'Last Triglycerides Category',
    'Last HDL', 'Last HDL Category',
    'Last LDL', 'Last LDL Category',
    'Last Cholesterol', 'Last Cholesterol Category',
    'Last Non-HDL', 'Last Non-HDL Category',
    'Last GGT', 'Last ALT', 'Last AST', 'Last ALT:AST',
]
output = output[['InternalID'] + first_result_columns + three_month_columns + last_result_columns]

#Get absolute and % change of each lab result between the first report and the first report after 3 months
# For every marker three columns are added, in this order:
#   '1st - >3 Month Change <marker> (#)'        follow-up value minus first value
#   '1st - 3 month Change in <marker> status'   label derived from the sign of that change
#   '1st - >3 Month Change <marker> (%)'        change as a fraction of the first value (not x100)
# A missing value on either side propagates as NaN through the arithmetic, so the change is
# blank exactly when one of the two values is missing; no explicit null masks are required
# (the previous guards combined the not-null tests with `|` where `&` was intended).
output = output.copy()  # independent frame, so the new columns are not written into a selection

# (first-result column, follow-up column, marker name in the (#)/(%) columns,
#  marker name in the status column, status when the value fell, status when it rose)
# NOTE(review): HDL uses the same "lower == 'Improved'" rule as the other markers -
# confirm that this is the intended direction for HDL.
three_month_change_spec = [
    ('First HbA1c Value', '>=3 Month Hba1c', 'Hba1c', 'hba1c', 'Improved', 'Worse'),
    ('First Glucose Value', '>=3 Month Glucose', 'Glucose', 'Glucose', 'Improved', 'Worse'),
    ('First Triglycerides Value', '>=3 Month Triglycerides', 'Triglycerides', 'Triglycerides',
     'Decreased', 'Increased'),
    ('First HDL Value', '>=3 Month HDL', 'HDL', 'HDL', 'Improved', 'Worse'),
    ('First LDL Value', '>=3 Month LDL', 'LDL', 'LDL', 'Improved', 'Worse'),
    ('First Cholesterol Value', '>=3 Month Cholesterol', 'Cholesterol', 'Cholesterol', 'Improved', 'Worse'),
    ('First Non-HDL Value', '>=3 Month Non-HDL', 'Non-HDL', 'Non-HDL', 'Improved', 'Worse'),
    ('First GGT Value', '>=3 Month GGT', 'GGT', 'GGT', 'Decreased', 'Increased'),
]

for first_col, later_col, marker, status_marker, fell_label, rose_label in three_month_change_spec:
    change_col = '1st - >3 Month Change ' + marker + ' (#)'
    status_col = '1st - 3 month Change in ' + status_marker + ' status'
    percent_col = '1st - >3 Month Change ' + marker + ' (%)'

    # Absolute change.
    output[change_col] = output[later_col] - output[first_col]

    # Status from the sign of the change; blank when the change is missing.
    output.loc[output[change_col].isnull(), status_col] = None
    output.loc[output[change_col] < 0, status_col] = fell_label
    output.loc[output[change_col] > 0, status_col] = rose_label
    output.loc[output[change_col] == 0, status_col] = 'Same'

    # Relative change.
    output[percent_col] = output[change_col] / output[first_col]

output.loc[(output['First HbA1c Value'].isnull()) | (output['Last Hba1c'].isnull()), \
            '1st - Last  Change in Hba1c (#)'] = None
output.loc[(output['First HbA1c Value'].notnull()) | (output['Last Hba1c'].notnull()), \
            '1st - Last  Change in Hba1c (#)'] = output['Last Hba1c'] - output['First HbA1c Value']

output.loc[(output['1st - Last  Change in Hba1c (#)'].isnull()), '1st - Last Change in hba1c status'] = None
output.loc[(output['1st - Last  Change in Hba1c (#)'] < 0), '1st - Last Change in hba1c status'] = 'Improved'
output.loc[(output['1st - Last  Change in Hba1c (#)'] > 0), '1st - Last Change in hba1c status'] = 'Worse'
output.loc[(output['1st - Last  Change in Hba1c (#)'] == 0), '1st - Last Change in hba1c status'] = 'Same'
        
output.loc[(output['First HbA1c Value'].isnull()) | (output['Last Hba1c'].isnull()), \
            '1st - Last  Change in Hba1c (%)'] = None
output.loc[(output['First HbA1c Value'].notnull()) | (output['Last Hba1c'].notnull()), \
            '1st - Last  Change in Hba1c (%)'] = output['1st - Last  Change in Hba1c (#)']\
/ output['First HbA1c Value']

output.loc[(output['First Glucose Value'].isnull()) | (output['Last Glucose'].isnull()), \
            '1st - Last  Change in Glucose (#)'] = None
output.loc[(output['First Glucose Value'].notnull()) | (output['Last Glucose'].notnull()), \
            '1st - Last  Change in Glucose (#)'] = output['Last Glucose'] - output['First Glucose Value']

output.loc[(output['1st - Last  Change in Glucose (#)'].isnull()), '1st - Last Change in Glucose status'] = None
output.loc[(output['1st - Last  Change in Glucose (#)'] < 0), '1st - Last Change in Glucose status'] = 'Improved'
output.loc[(output['1st - Last  Change in Glucose (#)'] > 0), '1st - Last Change in Glucose status'] = 'Worse'
output.loc[(output['1st - Last  Change in Glucose (#)'] == 0), '1st - Last Change in Glucose status'] = 'Same'
        
output.loc[(output['First Glucose Value'].isnull()) | (output['Last Glucose'].isnull()), \
            '1st - Last  Change in Glucose (%)'] = None
output.loc[(output['First Glucose Value'].notnull()) | (output['Last Glucose'].notnull()), \
            '1st - Last  Change in Glucose (%)'] = output['1st - Last  Change in Glucose (#)']\
/ output['First Glucose Value']

output.loc[(output['First Triglycerides Value'].isnull()) | (output['Last Triglycerides'].isnull()), \
            '1st - Last  Change in Triglycerides (#)'] = None
output.loc[(output['First Triglycerides Value'].notnull()) | (output['Last Triglycerides'].notnull()), \
            '1st - Last  Change in Triglycerides (#)'] = output['Last Triglycerides'] - output['First Triglycerides Value']

output.loc[(output['1st - Last  Change in Triglycerides (#)'].isnull()), '1st - Last Change in Triglycerides status'] = None
output.loc[(output['1st - Last  Change in Triglycerides (#)'] < 0), '1st - Last Change in Triglycerides status'] = 'Decreased'
output.loc[(output['1st - Last  Change in Triglycerides (#)'] > 0), '1st - Last Change in Triglycerides status'] = 'Increased'
output.loc[(output['1st - Last  Change in Triglycerides (#)'] == 0), '1st - Last Change in Triglycerides status'] = 'Same'
        
output.loc[(output['First Triglycerides Value'].isnull()) | (output['Last Triglycerides'].isnull()), \
            '1st - Last  Change in Triglycerides (%)'] = None
output.loc[(output['First Triglycerides Value'].notnull()) | (output['Last Triglycerides'].notnull()), \
            '1st - Last  Change in Triglycerides (%)'] = output['1st - Last  Change in Triglycerides (#)']\
/ output['First Triglycerides Value']

# HDL: change between the first and the last reading, reported as an absolute
# difference (#), a direction label (status) and a fraction of the first reading (%).
hdl_first = output['First HDL Value']
hdl_last = output['Last HDL']

# A missing reading propagates through the arithmetic as NaN, so a row only gets
# a change when BOTH readings exist; no separate null / not-null masks are needed.
hdl_change = hdl_last - hdl_first
output['1st - Last  Change in HDL (#)'] = hdl_change

# Higher HDL is the favourable direction, so a rise is 'Improved' and a fall is
# 'Worse' - the reverse of the LDL / cholesterol labelling.
# NOTE(review): this block previously labelled a fall as 'Improved'; confirm any
# other HDL status columns in this notebook use the same direction.
output['1st - Last Change in HDL status'] = None
output.loc[hdl_change < 0, '1st - Last Change in HDL status'] = 'Worse'
output.loc[hdl_change > 0, '1st - Last Change in HDL status'] = 'Improved'
output.loc[hdl_change == 0, '1st - Last Change in HDL status'] = 'Same'

# Relative change as a fraction of the first reading (not multiplied by 100).
output['1st - Last  Change in HDL (%)'] = hdl_change / hdl_first

# LDL: change between the first and the last reading, reported as an absolute
# difference (#), a direction label (status) and a fraction of the first reading (%).
ldl_first = output['First LDL Value']
ldl_last = output['Last LDL']

# A missing reading propagates through the arithmetic as NaN, so a row only gets
# a change when BOTH readings exist; no separate null / not-null masks are needed.
ldl_change = ldl_last - ldl_first
output['1st - Last  Change in LDL (#)'] = ldl_change

# A fall is labelled 'Improved', a rise 'Worse'; status stays empty (None)
# wherever the change could not be computed.
output['1st - Last Change in LDL status'] = None
output.loc[ldl_change < 0, '1st - Last Change in LDL status'] = 'Improved'
output.loc[ldl_change > 0, '1st - Last Change in LDL status'] = 'Worse'
output.loc[ldl_change == 0, '1st - Last Change in LDL status'] = 'Same'

# Relative change as a fraction of the first reading (not multiplied by 100).
output['1st - Last  Change in LDL (%)'] = ldl_change / ldl_first

# Cholesterol: change between the first and the last reading, reported as an
# absolute difference (#), a direction label (status) and a fraction of the
# first reading (%).
chol_first = output['First Cholesterol Value']
chol_last = output['Last Cholesterol']

# A missing reading propagates through the arithmetic as NaN, so a row only gets
# a change when BOTH readings exist; no separate null / not-null masks are needed.
chol_change = chol_last - chol_first
output['1st - Last  Change in Cholesterol (#)'] = chol_change

# A fall is labelled 'Improved', a rise 'Worse'; status stays empty (None)
# wherever the change could not be computed.
output['1st - Last Change in Cholesterol status'] = None
output.loc[chol_change < 0, '1st - Last Change in Cholesterol status'] = 'Improved'
output.loc[chol_change > 0, '1st - Last Change in Cholesterol status'] = 'Worse'
output.loc[chol_change == 0, '1st - Last Change in Cholesterol status'] = 'Same'

# Relative change as a fraction of the first reading (not multiplied by 100).
output['1st - Last  Change in Cholesterol (%)'] = chol_change / chol_first

# Non-HDL: change between the first and the last reading, reported as an absolute
# difference (#), a direction label (status) and a fraction of the first reading (%).
non_hdl_first = output['First Non-HDL Value']
non_hdl_last = output['Last Non-HDL']

# A missing reading propagates through the arithmetic as NaN, so a row only gets
# a change when BOTH readings exist; no separate null / not-null masks are needed.
non_hdl_change = non_hdl_last - non_hdl_first
output['1st - Last  Change in Non-HDL (#)'] = non_hdl_change

# A fall is labelled 'Improved', a rise 'Worse'; status stays empty (None)
# wherever the change could not be computed.
output['1st - Last Change in Non-HDL status'] = None
output.loc[non_hdl_change < 0, '1st - Last Change in Non-HDL status'] = 'Improved'
output.loc[non_hdl_change > 0, '1st - Last Change in Non-HDL status'] = 'Worse'
output.loc[non_hdl_change == 0, '1st - Last Change in Non-HDL status'] = 'Same'

# Relative change as a fraction of the first reading (not multiplied by 100).
output['1st - Last  Change in Non-HDL (%)'] = non_hdl_change / non_hdl_first

# GGT: change between the first and the last reading, reported as an absolute
# difference (#), a direction label (status) and a fraction of the first reading (%).
ggt_first = output['First GGT Value']
ggt_last = output['Last GGT']

# A missing reading propagates through the arithmetic as NaN, so a row only gets
# a change when BOTH readings exist; no separate null / not-null masks are needed.
ggt_change = ggt_last - ggt_first
output['1st - Last  Change in GGT (#)'] = ggt_change

# Status stays empty (None) wherever the change could not be computed.
output['1st - Last Change in GGT status'] = None
output.loc[ggt_change < 0, '1st - Last Change in GGT status'] = 'Decreased'
output.loc[ggt_change > 0, '1st - Last Change in GGT status'] = 'Increased'
output.loc[ggt_change == 0, '1st - Last Change in GGT status'] = 'Same'

# Relative change as a fraction of the first reading (not multiplied by 100).
output['1st - Last  Change in GGT (%)'] = ggt_change / ggt_first

# Attach the first-to-last change columns to the running per-patient table.
# The outer join keeps patients that appear on only one side.
allexceptdose = pd.merge(aboveallnresultcount, output, how='outer', on='InternalID')

# Reduce the result timestamps to plain calendar dates (time-of-day dropped).
for date_col in ('First Result Date', '>=3 Month Date', 'Last Date'):
    allexceptdose[date_col] = allexceptdose[date_col].dt.date

# Dose Change

In [7]:
# Count, for each patient, the rows that carry a drug name and expose the
# result as the '# of Drugs' column.
drug_count = (
    data.dropna(subset=['Drug name'])      # rows without a drug name do not count
    [['InternalID', 'Drug name']]          # only the key and the counted column
    .groupby(['InternalID'])
    .count()                               # non-null 'Drug name' rows per patient
    .reset_index()                         # InternalID back as a regular column
    .rename(columns={'Drug name': '# of Drugs'})
)

# Per-patient counts of blood-pressure drug dose changes, one column per change
# type, merged onto the per-patient drug count.
bp_change_types = ['Ceased', 'Decreased', 'Increased', 'Unchanged', 'Started']  # output column order

bp_dose = data[data['Drug category'] == 'Blood pressure'] #Filter data on Blood pressure
bp_dose = bp_dose[['InternalID', 'Dose change type']].dropna(subset=['Dose change type'])

# Normalise the labels before encoding: stray whitespace in the sheet (such as the
# trailing non-breaking space on "Increased") would otherwise leak into the dummy
# column names and split one change type across several columns.
bp_dose = bp_dose.assign(**{
    'Dose change type': bp_dose['Dose change type'].astype(str)
                        .str.replace('\xa0', ' ', regex=False).str.strip()
})

# One indicator column per change type, renamed to the output headers.
bp_dose = pd.get_dummies(data=bp_dose, columns=['Dose change type'])
bp_dose = bp_dose.rename(columns={'Dose change type_' + change: '# Blood Pressure Drugs ' + change
                                  for change in bp_change_types})

# Sum the indicators per patient to obtain counts.
bp_dose = bp_dose.groupby(['InternalID']).sum().reset_index()

# Fix the column order; a change type that never occurs for blood-pressure drugs
# becomes an all-zero column instead of raising KeyError.
bp_dose = bp_dose.reindex(
    columns=['InternalID'] + ['# Blood Pressure Drugs ' + change for change in bp_change_types],
    fill_value=0)

bp_dose = drug_count.merge(bp_dose, how='outer', on='InternalID') #Merge drug count data

# Per-patient counts of dose changes for every drug whose category is not
# 'Blood pressure', one column per change type, merged onto the blood-pressure counts.
other_change_types = ['Ceased', 'Decreased', 'Increased', 'Unchanged', 'Started']  # output column order

non_bp_dose = data[data['Drug category'] != 'Blood pressure'] #Select data whose category is not blood pressure
non_bp_dose = non_bp_dose[['InternalID', 'Dose change type']].dropna(subset=['Dose change type'])

# Normalise the labels before encoding: stray whitespace in the sheet (such as the
# trailing non-breaking space on "Increased") would otherwise leak into the dummy
# column names and split one change type across several columns.
non_bp_dose = non_bp_dose.assign(**{
    'Dose change type': non_bp_dose['Dose change type'].astype(str)
                        .str.replace('\xa0', ' ', regex=False).str.strip()
})

# One indicator column per change type, renamed to the output headers.
non_bp_dose = pd.get_dummies(data=non_bp_dose, columns=['Dose change type'])
non_bp_dose = non_bp_dose.rename(columns={'Dose change type_' + change: '# Other Drugs ' + change
                                          for change in other_change_types})

# Sum the indicators per patient to obtain counts.
non_bp_dose = non_bp_dose.groupby(['InternalID']).sum().reset_index()

# Fix the column order; a change type that never occurs in this subset becomes an
# all-zero column instead of raising KeyError.
non_bp_dose = non_bp_dose.reindex(
    columns=['InternalID'] + ['# Other Drugs ' + change for change in other_change_types],
    fill_value=0)

dose_change = bp_dose.merge(non_bp_dose, how = 'outer', on = 'InternalID') #Merge with above data in the same category

#Get total in drug change category
# Maps each total column to the change type it sums. The header spellings
# (including the double space in '#  Drugs Unchanged') are kept exactly as they
# were so the output columns do not change.
dose_totals = {
    '# Drugs Ceased': 'Ceased',
    '# Drugs Decreased': 'Decreased',
    '# Drugs Increased': 'Increased',
    '#  Drugs Unchanged': 'Unchanged',
    '# Drugs Started': 'Started',
}
for total_col, change in dose_totals.items():
    bp_counts = dose_change['# Blood Pressure Drugs ' + change]
    other_counts = dose_change['# Other Drugs ' + change]
    # After the outer merge a patient with only one drug category has NaN on the
    # other side; fill_value=0 keeps their total instead of turning it into NaN.
    # The total is still NaN when both sides are missing.
    dose_change[total_col] = bp_counts.add(other_counts, fill_value=0)

# Show zero counts as empty cells, but leave the InternalID merge key untouched.
count_cols = dose_change.columns.drop('InternalID')
dose_change[count_cols] = dose_change[count_cols].replace(0, np.nan)

combined = allexceptdose.merge(dose_change, how = 'outer', on = 'InternalID') #Join it with main table

# Save output as excel

In [8]:
# Write the combined per-patient table to 'Result.xlsx' (path relative to the
# current working directory), leaving out the DataFrame index.
combined.to_excel(excel_writer='Result.xlsx', index=False)