In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the data paths read later in this notebook are relative ('./...'),
# not under /content/drive -- confirm the working directory is set before running.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import csv
import os
from datetime import date, timedelta, datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Table 1

In [None]:
# Demographic summary table; joined with the adherence table on app_id further down.
demo_df = pd.read_csv(filepath_or_buffer='./demographic_data_summary_final.csv')
demo_df.head(n=3)

In [None]:
adh_dir = './cgm_count_by_day/'
adh_lst = []

# Build one row per CSV file: [app_id, short_valid, long_valid, adherence].
# The app_id is the filename without its '.csv' suffix.
# Filenames are sorted so the row order does not depend on the filesystem.
for filename in sorted(os.listdir(adh_dir)):
  if not filename.endswith('.csv'):
    continue
  df = pd.read_csv(os.path.join(adh_dir, filename))
  valid = list(df.Valid.values)
  if not valid:
    # An empty file would otherwise raise ZeroDivisionError on long_valid below.
    print('skipping empty adherence file:', filename)
    continue
  # Share of the first 30 rows flagged valid. The denominator is always 30, so a
  # file with fewer than 30 rows can never reach 1.0.
  # NOTE(review): presumably one row per day -- confirm against the CSV layout.
  short_valid = valid[:30].count(1) / 30
  # Share of all rows flagged valid.
  long_valid = valid.count(1) / len(valid)
  # Long-term adherence label: 1 when long_valid >= .7 (inclusive), else 0.
  adherence = 1 if long_valid >= .7 else 0
  adh_lst.append([filename[:-4], short_valid, long_valid, adherence])

In [None]:
# Tally the rows by their adherence label (the 4th field of every adh_lst row).
valid_num = sum(1 for row in adh_lst if row[3] == 1)
invalid_num = sum(1 for row in adh_lst if row[3] == 0)
print(valid_num, invalid_num)

In [None]:
# Tabulate the adherence rows so they can be joined with the demographics on app_id.
df_adh = pd.DataFrame(
  data=adh_lst,
  columns=['app_id', 'short_valid', 'long_valid', 'adherence'],
)

In [None]:
# Inner join (the pandas default): only app_ids present in both tables are kept.
# NOTE(review): app_id in df_adh is a string taken from the filename -- confirm it
# has the same dtype as demo_df.app_id.
df_merge = demo_df.merge(df_adh, on="app_id")
df_merge.head(n=3)

In [None]:
# Encode the categorical text columns as integer codes.
# Any value that is not listed in a lookup table (including missing values) maps to 0.
gender = [
  {'1: Male': 1, '2: Female': 2, '3: Other': 3}.get(label, 0)
  for label in df_merge.gender.values
]

race = [
  {'1: White': 1, '2: Black': 2, '3: Others': 3}.get(label, 0)
  for label in df_merge.race.values
]

ins_type = [
  {'Insulin pump': 1, 'Multiple daily injections': 2}.get(label, 0)
  for label in df_merge.insulin_type.values
]

In [None]:
# Add the integer-coded columns at fixed positions (3, 7, 10) of df_merge.
# DataFrame.insert mutates df_merge in place and raises ValueError when the column
# already exists, so each insert is skipped if a previous run of this cell added it.
for position, column, codes in [
  (3, 'Gender', gender),
  (7, 'Race', race),
  (10, 'Insulin_type', ins_type),
]:
  if column not in df_merge.columns:
    df_merge.insert(position, column, codes)
df_merge.head()

In [None]:
# Keep only the predictor columns: drop identifiers, the raw text categoricals
# (their integer-coded versions stay) and long_valid.
new_df = df_merge.drop(
  columns=[
    'app_id',
    'gender',
    'race',
    'insulin_type',
    'sweetgoals_id',
    'location',
    'long_valid',
  ]
)
new_df.head(n=3)

In [None]:
# Write the predictor table to disk without the row index.
new_df.to_csv(path_or_buf='./multi_model_predictors.csv', index=False)

In [None]:
from scipy.stats.contingency import odds_ratio
def get_odds(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases):
  """Print the odds ratio and its 95% confidence interval for a 2x2 table.

  The table passed to scipy's odds_ratio has the cases in the first row and the
  non-cases in the second row, with the exposed group in the first column.
  Nothing is returned; both results are printed.
  """
  contingency = [
    [exposed_cases, unexposed_cases],
    [exposed_noncases, unexposed_noncases],
  ]
  result = odds_ratio(contingency)
  print('odds ratio', result.statistic)
  interval = result.confidence_interval(confidence_level=0.95)
  print('95% CI', interval)

In [None]:
import scipy.stats as stats
def get_fisher_p(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases):
  """Print the Fisher exact test p-value for a 2x2 table.

  The table has the same layout as in get_odds: cases in the first row, non-cases
  in the second, exposed group in the first column. The p-value is printed only;
  the function returns None.
  """
  contingency = [
    [exposed_cases, unexposed_cases],
    [exposed_noncases, unexposed_noncases],
  ]
  # fisher_exact returns (statistic, pvalue); only the p-value is reported.
  pvalue = stats.fisher_exact(contingency)[1]
  print("p-Value:", pvalue)

In [None]:
# reference
def get_unexposed(unexposed_df):
  """Count adherence outcomes in the reference (unexposed) group.

  Prints the frame's shape and the two counts, then returns
  (unexposed_cases, unexposed_noncases), where a case is adherence == 1
  (high adherence, the outcome of interest) and a non-case is adherence == 0.
  """
  print('unexposed:', unexposed_df.shape)
  labels = unexposed_df.adherence
  unexposed_cases = int((labels == 1).sum())
  unexposed_noncases = int((labels == 0).sum())
  print(unexposed_cases, unexposed_noncases)
  return unexposed_cases, unexposed_noncases

In [None]:
def get_exposed(exposed_df):
  """Count adherence outcomes in the exposed (comparison) group.

  Prints the frame's shape and the two counts, then returns
  (exposed_cases, exposed_noncases), where a case is adherence == 1 and a
  non-case is adherence == 0.
  """
  print('exposed:', exposed_df.shape)
  labels = exposed_df.adherence
  exposed_cases = int((labels == 1).sum())
  exposed_noncases = int((labels == 0).sum())
  print(exposed_cases, exposed_noncases)
  return exposed_cases, exposed_noncases

In [None]:
# Inclusive age bands; band 1 is the reference group, bands 2 and 3 are compared to it.
# NOTE(review): the bands have gaps between them (e.g. 21 < age < 22), so a
# non-integer age would fall in no band -- confirm ages are whole numbers.
age = {1:[19, 21], 2:[22, 25], 3:[26, 29]}

# Both bounds go into a single mask. Chaining new_df[m1][m2] applies a mask built
# on the unfiltered frame to an already filtered one, which makes pandas reindex
# the mask and emit a warning.
# The reference group does not change between iterations, so it is built once.
unexposed_df = new_df[(new_df['age'] >= age[1][0]) & (new_df['age'] <= age[1][1])]

for i in range(2, 4):
  print('---', age[i])
  low, high = age[i]
  exposed_df = new_df[(new_df['age'] >= low) & (new_df['age'] <= high)]
  exposed_cases, exposed_noncases = get_exposed(exposed_df)
  unexposed_cases, unexposed_noncases =  get_unexposed(unexposed_df)
  get_odds(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases)
  get_fisher_p(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases)

In [None]:
# A1c bands (7.5, 9] and > 9, each compared against the reference band [6, 7.5].
# Both bounds go into a single mask. Chaining new_df[m1][m2] applies a mask built
# on the unfiltered frame to an already filtered one, which makes pandas reindex
# the mask and emit a warning.
exposed_df1 = new_df[(new_df['A1c'] > 7.5) & (new_df['A1c'] <= 9)]
exposed_df2 = new_df[new_df['A1c'] > 9]
a1c = {2: '(7.5, 9]', 3:'>9'}
exposed_ = {2:exposed_df1, 3: exposed_df2}

# The reference group is the same for both comparisons, so it is built once.
# Rows with A1c below 6 or missing belong to no group.
unexposed_df = new_df[(new_df['A1c'] >= 6) & (new_df['A1c'] <= 7.5)]

for i in range(2, 4):
  print('---', a1c[i])
  exposed_df = exposed_[i]
  exposed_cases, exposed_noncases = get_exposed(exposed_df)
  unexposed_cases, unexposed_noncases =  get_unexposed(unexposed_df)
  get_odds(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases)
  get_fisher_p(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases)

In [None]:
print('gender')
# Gender code 2 is compared against code 1, which serves as the reference group.
exposed_df = new_df.loc[new_df['Gender'] == 2]
unexposed_df = new_df.loc[new_df['Gender'] == 1]
exposed_cases, exposed_noncases = get_exposed(exposed_df)
unexposed_cases, unexposed_noncases = get_unexposed(unexposed_df)
get_odds(
  exposed_cases=exposed_cases,
  unexposed_cases=unexposed_cases,
  exposed_noncases=exposed_noncases,
  unexposed_noncases=unexposed_noncases,
)
get_fisher_p(
  exposed_cases=exposed_cases,
  unexposed_cases=unexposed_cases,
  exposed_noncases=exposed_noncases,
  unexposed_noncases=unexposed_noncases,
)

In [None]:
# Diagnosis-duration bands (7, 15] and > 15, each compared against the reference band [2, 7].
# Both bounds go into a single mask. Chaining new_df[m1][m2] applies a mask built
# on the unfiltered frame to an already filtered one, which makes pandas reindex
# the mask and emit a warning.
exposed_df1 = new_df[(new_df['diagnosis_duration'] > 7) & (new_df['diagnosis_duration'] <= 15)]
exposed_df2 = new_df[new_df['diagnosis_duration'] > 15]
# NOTE: the name `a1c` is reused here for the diagnosis-duration band labels.
a1c = {2: '(7, 15]', 3:'> 15'}
exposed_ = {2:exposed_df1, 3: exposed_df2}

# The reference group is the same for both comparisons, so it is built once.
# Rows with a duration below 2 or missing belong to no group.
unexposed_df = new_df[(new_df['diagnosis_duration'] >= 2) & (new_df['diagnosis_duration'] <= 7)]

for i in range(2, 4):
  print('---', a1c[i])
  exposed_df = exposed_[i]
  exposed_cases, exposed_noncases = get_exposed(exposed_df)
  unexposed_cases, unexposed_noncases =  get_unexposed(unexposed_df)
  get_odds(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases)
  get_fisher_p(exposed_cases, unexposed_cases, exposed_noncases, unexposed_noncases)

In [None]:
print('short valid')
# Rows with short_valid below 0.7 are compared against the reference group (>= 0.7).
exposed_df = new_df.loc[new_df['short_valid'] < 0.7]
unexposed_df = new_df.loc[new_df['short_valid'] >= 0.7]
exposed_cases, exposed_noncases = get_exposed(exposed_df)
unexposed_cases, unexposed_noncases = get_unexposed(unexposed_df)
get_odds(
  exposed_cases=exposed_cases,
  unexposed_cases=unexposed_cases,
  exposed_noncases=exposed_noncases,
  unexposed_noncases=unexposed_noncases,
)
get_fisher_p(
  exposed_cases=exposed_cases,
  unexposed_cases=unexposed_cases,
  exposed_noncases=exposed_noncases,
  unexposed_noncases=unexposed_noncases,
)

In [None]:
print('Insurance')
# Insurance_0 == 2 is compared against Insurance_0 == 1 (private insurance, the reference).
exposed_df = new_df.loc[new_df['Insurance_0'] == 2]
unexposed_df = new_df.loc[new_df['Insurance_0'] == 1]
exposed_cases, exposed_noncases = get_exposed(exposed_df)
unexposed_cases, unexposed_noncases = get_unexposed(unexposed_df)
get_odds(
  exposed_cases=exposed_cases,
  unexposed_cases=unexposed_cases,
  exposed_noncases=exposed_noncases,
  unexposed_noncases=unexposed_noncases,
)
get_fisher_p(
  exposed_cases=exposed_cases,
  unexposed_cases=unexposed_cases,
  exposed_noncases=exposed_noncases,
  unexposed_noncases=unexposed_noncases,
)

# Table 2

## Demographic

In [None]:
# Reload the demographic summary for the Table 2 descriptive statistics.
# Note that `df` was previously bound to a per-file frame in the adherence loop.
demo_file = './demographic_data_summary_final.csv'
df = pd.read_csv(filepath_or_buffer=demo_file)
df.head(n=3)

In [None]:
# Series.count() counts the non-missing app_id values.
print('total number of subject:', df['app_id'].count())

In [None]:
# Number of rows for each distinct value of the race column.
race_lst = list(df.race.values)
race_dict = {label: race_lst.count(label) for label in df.race.unique()}

print(race_dict)

In [None]:
# Number of rows for each distinct value of the insulin_type column.
type_lst = list(df.insulin_type.values)
type_dict = {label: type_lst.count(label) for label in df.insulin_type.unique()}

print(type_dict)

In [None]:
# Age summary over the rows that have an age recorded.
# Note that `age` now holds an array; it was previously the dict of age bands.
age_missing = np.isnan(df.age.values)
age = df.age.values[~age_missing]

# np.std is the population standard deviation (ddof=0).
# TODO: decide whether the reported values should be rounded to integers.
print('age avg and std:', np.mean(age), '+/-', np.std(age))
print('age range:', np.min(age), '-' , np.max(age))
print('not available:', int(age_missing.sum()))

In [None]:
# Plain-list copy of the gender column, one entry per row of df.
gender_lst = [df.gender.values[row] for row in range(len(df.sweetgoals_id.values))]
print(len(gender_lst))

In [None]:
# Count of each distinct gender value; entries with a zero count are dropped.
sex_dict = {
  label: count
  for label, count in ((g, gender_lst.count(g)) for g in df.gender.unique())
  if count
}

print(sex_dict)

In [None]:
# Copy the count stored under the key at position 2 to a new 'Unknown' key, then
# delete whichever key sits at position 3 of the updated key list.
# NOTE(review): both lookups are positional, so the outcome depends on the order in
# which df.gender.unique() produced the values. Presumably the intent is to relabel
# the missing-gender entry as 'Unknown' -- confirm against the data.
# NOTE(review): if sex_dict holds only three keys beforehand, position 3 is the
# freshly added 'Unknown' key and this cell removes it again.
# Re-running the cell applies the positional edit a second time.
sex_dict['Unknown'] = sex_dict[list(sex_dict.keys())[2]]
del sex_dict[list(sex_dict.keys())[3]]
print(sex_dict)

In [None]:
# Diagnosis durations with the missing (NaN) entries removed.
diag_lst = [
  duration
  for duration in df.diagnosis_duration.values
  if not np.isnan(duration)
]
print(len(diag_lst), diag_lst)

In [None]:
# Summary of diagnosis duration over the rows that have a value.
# np.std is the population standard deviation (ddof=0).
print("avg and std:", round(np.mean(diag_lst), 2), round(np.std(diag_lst), 2))
print('diag duration range:', np.min(diag_lst), '-' , np.max(diag_lst))
# The missing count is derived from the table size rather than a hard-coded cohort
# size, so it stays correct if the input file changes.
print('The number of subjects does not have diagnosis duration:', len(df) - len(diag_lst))

In [None]:
# A1c values with the missing (NaN) entries removed.
a1c_lst = [value for value in df.A1c.values if not np.isnan(value)]

print("The number of subjects have a1c values:", len(a1c_lst))

In [None]:
# Summary of A1c over the rows that have a value.
# np.std is the population standard deviation (ddof=0).
print("avg and std:", round(np.mean(a1c_lst), 2), round(np.std(a1c_lst), 2))
print('a1c range:', np.min(a1c_lst), '-' , np.max(a1c_lst))
# The missing count is derived from the table size rather than a hard-coded cohort
# size, so it stays correct if the input file changes.
print('The number of subjects does not have A1c:', len(df) - len(a1c_lst))

In [None]:
# Insurance codes with missing entries replaced by 0, then tallied for codes 0, 1 and 2.
raw_codes = list(df.Insurance_0.values)
insurance_list = pd.Series(raw_codes).fillna(0).tolist()

insurance_dict = {code: insurance_list.count(code) for code in (0, 1, 2)}

print(insurance_dict)

## CGM data

In [None]:
# app_ids of the subjects in the demographic summary.
# NOTE: `id` shadows the Python builtin; the name is kept because a later cell reads it.
id = df.app_id.values
# `demo_file` is re-pointed at the full demographic file, which has the cgm_model column.
demo_file = './demographic.csv'
cgm_model_df = pd.read_csv(demo_file).loc[:, ['app_id', 'cgm_model']]

In [None]:
# CGM model of every row whose app_id also appears in the demographic summary.
model_lst = [record[1] for record in cgm_model_df.values if record[0] in id]
print(len(model_lst))

In [None]:
# Count of each CGM model among the selected subjects; models with no match are dropped.
model_dict = {
  model: count
  for model, count in ((m, model_lst.count(m)) for m in cgm_model_df.cgm_model.unique())
  if count
}

print(model_dict)

In [None]:
# Brand-level totals written as literal arithmetic.
# NOTE(review): these numbers look hand-copied from the model_dict / model_lst output
# of the preceding cells and from a cohort size of 108; they will not update if the
# data changes -- confirm, and ideally derive them from model_dict.
print('dexcom:', 91 + 7 + 4 + 1, '\nmedtronic:', 1, '\nunknown:', 108 - 106 + 2)

In [None]:
# CGM summary table; the next cell reads its Duration, MissingDays and ValidDays columns.
cgm_summary = './CGM_summary.csv'
cgm_df = pd.read_csv(filepath_or_buffer=cgm_summary)
cgm_df.head(n=3)

In [None]:
# Wear-duration summary across the rows of cgm_df.
# np.std is the population standard deviation (ddof=0).
print('total duration:', np.sum(cgm_df.Duration.values))

print('duration avg and std:', np.mean(cgm_df.Duration.values), np.std(cgm_df.Duration.values))
print('range:', np.min(cgm_df.Duration.values), '-', np.max(cgm_df.Duration.values))

# The output labels previously misspelled "days" as "dyas".
print('total days without data record:', np.sum(cgm_df.MissingDays.values))
print('total days with more than 70% daily data record:', np.sum(cgm_df.ValidDays.values))

In [None]:
# for days with more than 70% record
cgm_df2 = pd.read_csv('./summary_TIR_meanBG_gluVar.csv')
cgm_df2.head(3)

In [None]:
# Summary of the meanBG column; np.std is the population standard deviation (ddof=0).
meanBG = cgm_df2.meanBG.values

print('daily mean BG avg and std:', np.round(np.mean(meanBG), 2), np.round(np.std(meanBG), 2))
# Both ends of the range are rounded to 2 decimals; previously only the minimum was.
print('range:', np.round(np.min(meanBG), 2), '-', np.round(np.max(meanBG), 2))

In [None]:
# Summary of the TIR column; np.std is the population standard deviation (ddof=0).
TIR = cgm_df2.TIR.values

tir_mean = np.round(np.mean(TIR), 4)
tir_std = np.round(np.std(TIR), 4)
print('daily TIR avg and std:', tir_mean, tir_std)
print('range:', np.min(TIR), '-', np.max(TIR))

# Other paper data

## High adherence ratio

In [None]:
# Yearly adherence trend table; the whole frame is displayed.
df_valid = pd.read_csv(filepath_or_buffer='./yearly_adherence_trends.csv')
df_valid

In [None]:
# Report the lowest and then the highest Valid_ratio: the value, the position of its
# first occurrence, and the full row at that position.
lst = list(df_valid.Valid_ratio.values)
for pick in (min, max):
  extreme = pick(df_valid.Valid_ratio.values)
  position = lst.index(extreme)
  print(extreme, position, df_valid.values[position])