In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_datasets(insulin_data_file_path, cgm_data_file_path, date_time_format):
    """Load the insulin and CGM CSV files and add a parsed ``DateTime`` column.

    Parameters
    ----------
    insulin_data_file_path : str or path-like
        CSV containing at least the columns 'Date', 'Time',
        'BWZ Carb Input (grams)' and 'BWZ Estimate (U)'.
    cgm_data_file_path : str or path-like
        CSV containing at least the columns 'Date', 'Time' and
        'Sensor Glucose (mg/dL)'.
    date_time_format : str
        ``strptime`` format of the string "<Date> <Time>". An empty string
        selects the default '%m/%d/%Y %H:%M:%S'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(insulin_data, cgm_data)``. Both frames hold only the columns listed
        above plus 'DateTime'. CGM rows with a missing value in any selected
        column are dropped; insulin rows are kept as-is.
    """
    if date_time_format == '':
        date_time_format = '%m/%d/%Y %H:%M:%S'

    insulin_columns = ['Date', 'Time', 'BWZ Carb Input (grams)', 'BWZ Estimate (U)']
    cgm_columns = ['Date', 'Time', 'Sensor Glucose (mg/dL)']

    insulin_dataset_full = pd.read_csv(insulin_data_file_path, low_memory = False)
    cgm_data_set_full = pd.read_csv(cgm_data_file_path, low_memory = False)

    # Work on explicit copies and avoid in-place operations so that adding the
    # 'DateTime' column never writes into (or warns about) a derived frame.
    insulin_data = insulin_dataset_full[insulin_columns].copy()
    cgm_data = cgm_data_set_full[cgm_columns].dropna().copy()

    insulin_data['DateTime'] = pd.to_datetime(
        insulin_data['Date'] + ' ' + insulin_data['Time'], format = date_time_format
    )
    cgm_data['DateTime'] = pd.to_datetime(
        cgm_data['Date'] + ' ' + cgm_data['Time'], format = date_time_format
    )
    return insulin_data, cgm_data

In [3]:
def get_meal_start_times_with_insulin_bolus(insulin_dataset):
    """Return the meal start times with their rounded-up insulin bolus.

    A row counts as a meal when 'BWZ Carb Input (grams)' is present and
    non-zero.

    Parameters
    ----------
    insulin_dataset : pandas.DataFrame
        Must contain the columns 'DateTime', 'BWZ Carb Input (grams)' and
        'BWZ Estimate (U)'. The frame is not modified.

    Returns
    -------
    list of tuple
        ``(meal_start_time, bolus)`` pairs sorted by ascending meal start
        time, where ``bolus`` is ``math.ceil`` of 'BWZ Estimate (U)'.

    Raises
    ------
    ValueError
        From ``math.ceil`` if a meal row has a NaN 'BWZ Estimate (U)'.
    """
    carb_input = insulin_dataset['BWZ Carb Input (grams)']
    # The comparison must be parenthesised: `&` binds tighter than `!=`, so
    # `a.notna() & a != 0` would be evaluated as `(a.notna() & a) != 0`.
    is_meal = carb_input.notna() & (carb_input != 0)

    # Build new frames instead of renaming/sorting a filtered slice in place.
    meals = (
        insulin_dataset.loc[is_meal, ['DateTime', 'BWZ Estimate (U)']]
        .rename(columns = {'DateTime': 'MealStartDateTime'})
        .sort_values(by = 'MealStartDateTime')
    )

    meal_start_times_with_insulin_bolus = [
        (meal_start, math.ceil(bolus_estimate))
        for meal_start, bolus_estimate in zip(meals['MealStartDateTime'], meals['BWZ Estimate (U)'])
    ]
    return meal_start_times_with_insulin_bolus

In [4]:
def get_valid_meal_start_times_with_insulin_bolus(meal_start_times_with_insulin_bolus):
    """Keep only the meals that have no neighbouring meal too close in time.

    The input is a list of ``(timestamp, insulin_bolus)`` pairs; neighbours
    are taken by list position. An entry is discarded when

    * the preceding entry is later than 30 minutes before it, or
    * the following entry is earlier than 2 hours after it.

    Returns a new list with the surviving ``(timestamp, insulin_bolus)``
    pairs in their original order.
    """
    meals = meal_start_times_with_insulin_bolus
    last_position = len(meals) - 1
    look_back = timedelta(hours = 0.5)
    look_ahead = timedelta(hours = 2)

    kept = []
    for position, (meal_time, bolus) in enumerate(meals):
        has_close_predecessor = (
            position > 0 and meals[position - 1][0] > meal_time - look_back
        )
        # `or` short-circuits, so the successor is only inspected when the
        # predecessor check did not already reject this meal.
        if has_close_predecessor or (
            position < last_position and meals[position + 1][0] < meal_time + look_ahead
        ):
            continue
        kept.append((meal_time, bolus))
    return kept

In [5]:
def extract_meal_and_insulin_bolus_data(cgm_dataset, valid_meal_start_times_with_insulin_bolus):
    """Collect the CGM readings around each meal with the meal's bolus.

    For every ``(meal_time, insulin_bolus)`` pair the window
    ``[meal_time - 30 min, meal_time + 2 h]`` (both ends inclusive) is cut
    out of the CGM data.

    Parameters
    ----------
    cgm_dataset : pandas.DataFrame
        Must contain the columns 'DateTime' and 'Sensor Glucose (mg/dL)'.
        Row order does not matter; the frame is not modified.
    valid_meal_start_times_with_insulin_bolus : iterable of tuple
        ``(meal_time, insulin_bolus)`` pairs.

    Returns
    -------
    tuple of (list, list, list)
        ``(meal_data, insulin_bolus_data, cgm_at_meal_start_time_data)``,
        index-aligned with each other:

        * ``meal_data[i]`` - glucose readings of the window, in
          chronological order;
        * ``insulin_bolus_data[i]`` - the bolus of that meal;
        * ``cgm_at_meal_start_time_data[i]`` - the first reading at or after
          the meal start, taken from inside the window.

        Meals whose window has no reading at or after the meal start are
        skipped entirely, so the three lists always have equal length.
    """
    glucose_column = 'Sensor Glucose (mg/dL)'
    cgm_dataset_sorted = cgm_dataset.sort_values(by = 'DateTime')
    reading_times = cgm_dataset_sorted['DateTime']

    meal_data = []
    insulin_bolus_data = []
    cgm_at_meal_start_time_data = []
    for meal_time, insulin_bolus in valid_meal_start_times_with_insulin_bolus:
        start_time = meal_time - timedelta(minutes = 30)
        end_time = meal_time + timedelta(hours = 2)

        # Slice the sorted frame so the window and the meal-start lookup both
        # see the readings in chronological order.
        window = cgm_dataset_sorted[(reading_times >= start_time) & (reading_times <= end_time)]

        # Restrict the meal-start lookup to the window: an unbounded lookup
        # could pick a reading hours after the meal, or raise IndexError when
        # no later reading exists at all.
        readings_from_meal_start = window.loc[window['DateTime'] >= meal_time, glucose_column]
        if readings_from_meal_start.empty:
            continue

        meal_data.append(window[glucose_column].tolist())
        insulin_bolus_data.append(insulin_bolus)
        cgm_at_meal_start_time_data.append(readings_from_meal_start.iloc[0])
    return meal_data, insulin_bolus_data, cgm_at_meal_start_time_data

In [6]:
def get_min_max_cgm(meal_data):
    """Return ``(smallest, largest)`` reading over all meals.

    ``meal_data`` is a list of per-meal reading lists. It is loaded into a
    DataFrame with one row per meal, each row is reduced to its minimum and
    maximum, and the built-in ``min``/``max`` then reduce across the rows.
    """
    readings_by_meal = pd.DataFrame(meal_data)
    lowest_per_meal = readings_by_meal.min(axis = 1)
    highest_per_meal = readings_by_meal.max(axis = 1)
    overall_lowest = min(lowest_per_meal)
    overall_highest = max(highest_per_meal)
    return overall_lowest, overall_highest

In [7]:
def get_bin_list(min_value, value_list, bin_range = 20):
    """Map each value to the index of its ``bin_range``-wide bin.

    The bin index is ``int((value - min_value) / bin_range)``; ``int``
    truncates toward zero. Returns a list with one index per input value,
    in input order.
    """
    bin_indices = []
    for reading in value_list:
        offset_from_min = reading - min_value
        bin_indices.append(int(offset_from_min / bin_range))
    return bin_indices

In [8]:
def get_bins_for_max_cgm(overall_min_cgm, overall_max_cgm, meal_data):
    """Bin the maximum reading of every meal into 20-wide bins.

    ``meal_data`` is a list of per-meal reading lists. The maximum of each
    meal is passed to ``get_bin_list`` with ``overall_min_cgm`` as the origin.
    ``overall_max_cgm`` is accepted but not used.
    """
    peak_per_meal = pd.DataFrame(meal_data).max(axis = 1)
    return get_bin_list(overall_min_cgm, peak_per_meal.values.tolist(), bin_range = 20)

In [9]:
def get_itemsets(insulin_data_file_path, cgm_data_file_path, date_time_format = '%m/%d/%Y %H:%M:%S'):
    """Run the full pipeline from the two CSV files to the list of itemsets.

    Steps: load both datasets, find the meal start times with their bolus,
    drop meals that are too close to a neighbouring meal, extract the CGM
    readings around each remaining meal, and bin the results.

    Returns a list of ``(B_max, B_meal, insulin_bolus)`` tuples, one per
    extracted meal, where ``B_max`` is the bin of the meal's maximum reading
    and ``B_meal`` is the bin of the reading at meal start. Both use 20-wide
    bins measured from the overall minimum reading.
    """
    insulin_dataset, cgm_dataset = get_datasets(
        insulin_data_file_path, cgm_data_file_path, date_time_format
    )

    all_meals = get_meal_start_times_with_insulin_bolus(insulin_dataset)
    valid_meals = get_valid_meal_start_times_with_insulin_bolus(all_meals)
    meal_data, insulin_bolus_data, cgm_at_meal_start_time_data = extract_meal_and_insulin_bolus_data(
        cgm_dataset, valid_meals
    )

    overall_min_cgm, overall_max_cgm = get_min_max_cgm(meal_data)
    B_max_data = get_bins_for_max_cgm(overall_min_cgm, overall_max_cgm, meal_data)
    B_meal_data = get_bin_list(overall_min_cgm, cgm_at_meal_start_time_data, bin_range = 20)

    return [
        (b_max, B_meal_data[position], insulin_bolus_data[position])
        for position, b_max in enumerate(B_max_data)
    ]

In [23]:
def retrieve_most_frequent_itemsets(itemsets, support_count_threshold):
    """Count identical itemsets and keep those meeting the support threshold.

    Parameters
    ----------
    itemsets : iterable of hashable
        The observed itemsets (duplicates are what gets counted).
    support_count_threshold : int
        Minimum number of occurrences for an itemset to count as frequent.

    Returns
    -------
    tuple of (list, list, dict)
        * ``frequent_item_sets`` - itemsets with count >= threshold, ordered
          by descending count (ties keep first-seen order);
        * ``frequent_item_sets_with_count`` - the same as ``(itemset, count)``
          pairs;
        * ``count_dict`` - occurrence count of every distinct itemset.
    """
    count_dict = {}
    for itemset in itemsets:
        count_dict[itemset] = count_dict.get(itemset, 0) + 1

    # sorted() is stable, so equal counts stay in first-seen order.
    itemset_counts = sorted(count_dict.items(), key = lambda pair: pair[1], reverse = True)

    frequent_item_sets_with_count = [
        pair for pair in itemset_counts if pair[1] >= support_count_threshold
    ]
    frequent_item_sets = [pair[0] for pair in frequent_item_sets_with_count]
    return frequent_item_sets, frequent_item_sets_with_count, count_dict

In [24]:
#Rules are of the form {Bmax, Bmeal} -> Insulin Bolus
def get_confidence_of_rules(itemsets, count_dict):
    """Compute the confidence of every rule {Bmax, Bmeal} -> Insulin Bolus.

    The confidence of an itemset ``(b_max, b_meal, bolus)`` is its count
    divided by the number of itemsets sharing the precedent
    ``(b_max, b_meal)``.

    Parameters
    ----------
    itemsets : iterable of tuple
        Observed ``(b_max, b_meal, bolus)`` itemsets, duplicates included.
    count_dict : dict
        Occurrence count of every distinct itemset in ``itemsets``.

    Returns
    -------
    list of tuple
        ``(itemset, confidence)`` pairs, one per distinct itemset, in
        first-seen order.

    Raises
    ------
    KeyError
        If an itemset is missing from ``count_dict``.
    """
    precedent_count_dict = {}
    for itemset in itemsets:
        precedent = (itemset[0], itemset[1])
        precedent_count_dict[precedent] = precedent_count_dict.get(precedent, 0) + 1

    confidence_dict = {}
    for itemset in itemsets:
        if itemset in confidence_dict:
            # Duplicates yield the same value; compute each rule once.
            continue
        precedent = (itemset[0], itemset[1])
        # Use the count_dict argument, not a notebook-level global, so the
        # function works on a fresh kernel and with any caller-supplied counts.
        confidence_dict[itemset] = count_dict[itemset] / precedent_count_dict[precedent]

    confidence_list = list(confidence_dict.items())
    return confidence_list

In [66]:
# Input files and the timestamp format of their combined "Date Time" string.
insulin_data_file_path = 'InsulinData.csv'
cgm_data_file_path = 'CGMData.csv'
date_time_format = '%m/%d/%Y %H:%M:%S'

# Mining thresholds: minimum occurrence count for a frequent itemset, and the
# confidence below which a rule is reported as low-confidence.
support_count_threshold = 4
low_confidence_threshold = 0.15


# Build the itemsets, count them, and rank the rules by descending confidence.
itemsets = get_itemsets(insulin_data_file_path, cgm_data_file_path, date_time_format)
(
    frequent_item_sets,
    frequent_item_sets_with_count,
    itemset_count_dict,
) = retrieve_most_frequent_itemsets(itemsets, support_count_threshold)
confidence_list = get_confidence_of_rules(itemsets, itemset_count_dict)
confidence_list.sort(key = lambda rule_with_conf: rule_with_conf[1], reverse = True)

# After sorting, the first entry carries the highest confidence; keep every
# rule that ties with it, and separately every rule below the low threshold.
highest_confidence = confidence_list[0][1]
highest_confidence_list = [
    rule_with_conf for rule_with_conf in confidence_list
    if rule_with_conf[1] >= highest_confidence
]
low_confidence_list = [
    rule_with_conf for rule_with_conf in confidence_list
    if rule_with_conf[1] < low_confidence_threshold
]


def get_rules(conf_list):
    """Render ``(itemset, confidence)`` pairs as rule strings.

    Each itemset ``(a, b, c)`` becomes the text ``{a, b} -> c``; the
    confidence value itself is not included. Returns the strings in the
    order of ``conf_list``.
    """
    def _as_rule_text(itemset):
        return f'{{{itemset[0]}, {itemset[1]}}} -> {itemset[2]}'

    return [_as_rule_text(rule_with_conf[0]) for rule_with_conf in conf_list]

# Turn both selections into "{Bmax, Bmeal} -> Insulin Bolus" rule strings.
highest_confidence_rules = get_rules(highest_confidence_list)
low_confidence_rules = get_rules(low_confidence_list)



In [73]:
# Itemsets (Bmax bin, Bmeal bin, insulin bolus) whose count reaches support_count_threshold.
frequent_item_sets

[(7, 5, 3),
 (6, 5, 2),
 (11, 11, 3),
 (5, 3, 7),
 (6, 2, 2),
 (12, 11, 6),
 (8, 4, 4),
 (6, 3, 3),
 (8, 4, 3),
 (7, 2, 4),
 (11, 11, 4)]

In [69]:
# Rules whose confidence equals the highest confidence found.
highest_confidence_rules

['{17, 4} -> 4',
 '{14, 8} -> 5',
 '{17, 15} -> 2',
 '{15, 0} -> 3',
 '{16, 9} -> 8',
 '{18, 18} -> 3',
 '{17, 7} -> 10',
 '{13, 8} -> 6',
 '{17, 14} -> 11',
 '{8, 1} -> 7',
 '{15, 14} -> 5',
 '{14, 6} -> 4',
 '{10, 0} -> 5',
 '{12, 0} -> 3',
 '{5, 0} -> 4',
 '{15, 13} -> 3',
 '{1, 4} -> 9',
 '{3, 0} -> 8',
 '{1, 1} -> 9',
 '{16, 16} -> 2',
 '{9, 1} -> 6',
 '{2, 4} -> 2',
 '{14, 13} -> 6',
 '{3, 1} -> 8',
 '{1, 9} -> 1',
 '{2, 0} -> 2',
 '{18, 15} -> 7',
 '{3, 3} -> 3',
 '{12, 7} -> 10',
 '{7, 0} -> 3',
 '{14, 10} -> 5',
 '{4, 8} -> 12',
 '{2, 11} -> 2',
 '{7, 1} -> 2',
 '{15, 10} -> 7',
 '{6, 9} -> 4',
 '{16, 12} -> 2']

In [70]:
# Rules whose confidence is below low_confidence_threshold.
low_confidence_rules

['{10, 3} -> 8',
 '{5, 4} -> 8',
 '{12, 11} -> 8',
 '{10, 3} -> 4',
 '{5, 2} -> 3',
 '{5, 2} -> 10',
 '{12, 11} -> 5',
 '{9, 5} -> 2',
 '{10, 3} -> 7',
 '{5, 4} -> 3',
 '{5, 2} -> 7',
 '{5, 2} -> 12',
 '{9, 5} -> 9',
 '{5, 2} -> 2',
 '{9, 5} -> 13',
 '{10, 3} -> 9',
 '{10, 3} -> 3',
 '{5, 4} -> 6',
 '{5, 4} -> 9',
 '{10, 3} -> 6',
 '{5, 2} -> 9',
 '{10, 3} -> 10',
 '{9, 5} -> 5',
 '{5, 2} -> 4',
 '{9, 5} -> 4',
 '{5, 3} -> 5',
 '{6, 3} -> 2',
 '{6, 3} -> 7',
 '{4, 2} -> 7',
 '{9, 7} -> 9',
 '{10, 8} -> 8',
 '{10, 8} -> 9',
 '{11, 8} -> 4',
 '{9, 7} -> 5',
 '{9, 7} -> 3',
 '{11, 8} -> 3',
 '{10, 8} -> 10',
 '{10, 8} -> 5',
 '{9, 7} -> 7',
 '{6, 4} -> 5',
 '{6, 4} -> 11',
 '{11, 10} -> 7',
 '{10, 8} -> 3',
 '{11, 8} -> 6',
 '{6, 4} -> 8',
 '{6, 4} -> 9',
 '{9, 8} -> 7',
 '{4, 2} -> 3',
 '{4, 2} -> 12',
 '{4, 2} -> 6',
 '{11, 10} -> 2',
 '{6, 4} -> 3',
 '{11, 8} -> 1',
 '{11, 10} -> 8',
 '{4, 2} -> 2',
 '{11, 10} -> 6',
 '{9, 8} -> 11',
 '{9, 8} -> 2',
 '{10, 8} -> 2',
 '{9, 8} -> 3',
 '{

In [71]:
# Write the frequent itemsets to Results_1.csv, one per row with no header or index.
# Each tuple is stringified first so it lands in a single CSV column.
frequent_item_sets_str = [str(item) for item in frequent_item_sets]
freq_itemset_df = pd.DataFrame(frequent_item_sets_str)
freq_itemset_df.to_csv('Results_1.csv', index = False, header = False)

In [56]:
# Write the highest-confidence rule strings to Results_2.csv, one per row with no header or index.
highest_conf_rules_df = pd.DataFrame(highest_confidence_rules)
highest_conf_rules_df.to_csv('Results_2.csv', index = False, header = False)

In [57]:
# Write the low-confidence rule strings to Results_3.csv, one per row with no header or index.
low_conf_rules_df = pd.DataFrame(low_confidence_rules)
low_conf_rules_df.to_csv('Results_3.csv', index = False, header = False)