# Table of contents 

1. [Dummy Data Functions](#make_dummy_data)
    1. [Helpers](#helpers)
2. [Dummy Data Validation](#dummy_data_validation)
    1. [Dummy Data Group 1](#dummy_data_group_1)
        1. [Generating Dummy Data 1](#generate_dummy_data_1)
        2. [Validate Dummy Data 1](#validate_dummy_data_1)
    2. [Dummy Data Group 2](#dummy_data_group_2)
        1. [Generating Dummy Data 2](#generate_dummy_data_2)
        2. [Validate Dummy Data 2](#validate_dummy_data_2)
    3. [Dummy Data Group 3](#dummy_data_group_3)
        1. [Generating Dummy Data 3](#generate_dummy_data_3)
        2. [Validate Dummy Data 3](#validate_dummy_data_3)
    4. [Dummy Data Group 4](#dummy_data_group_4)
        1. [Generating Dummy Data 4](#generate_dummy_data_4)
        2. [Validate Dummy Data 4](#validate_dummy_data_4)
    5. [Dummy Data Group 5](#dummy_data_group_5)
        1. [Generating Dummy Data 5](#generate_dummy_data_5)
        2. [Validate Dummy Data 5](#validate_dummy_data_5)
    6. [Dummy Data Group 6](#dummy_data_group_6)
        1. [Generating Dummy Data 6](#generate_dummy_data_6)
        2. [Validate Dummy Data 6](#validate_dummy_data_6)
    7. [Dummy Data Group 7](#dummy_data_group_7)
        1. [Generating Dummy Data 7](#generate_dummy_data_7)
        2. [Validate Dummy Data 7](#validate_dummy_data_7)
    8. [Dummy Data Group 8](#dummy_data_group_8)
        1. [Generating Dummy Data 8](#generate_dummy_data_8)
        2. [Validate Dummy Data 8](#validate_dummy_data_8)

In [124]:
%run decision_support_model.ipynb

# Dummy Data Functions <a name="make_dummy_data"></a>

For default dummy data:

- needs 0-29: 70% of IBHS patients improve (by 1, 2 or 3) and 30% get worse or stay the same; 30% of FBMHS patients improve (by 1, 2 or 3) and 70% get worse or stay the same

- needs 30-59: 70% of FBMHS patients improve (by 1, 2 or 3) and 30% get worse or stay the same; 30% of IBHS patients improve (by 1, 2 or 3) and 70% get worse or stay the same

- needs 60-99: completely random

## Helpers <a name="helpers"></a>

In [13]:
def find_distribution(df):
    """Aggregate the counts of each improvement score over all columns of ``df``.

    Columns are looked up by the string labels ``'0'``, ``'1'``, ...,
    ``str(df.shape[1] - 1)``; the ``value_counts()`` of every column is added
    to a running total.

    Args:
        df: DataFrame whose columns are labelled '0', '1', ... .

    Returns:
        A float64 ``pandas.Series`` mapping each observed value to its total
        count across all columns. The scores -3..3 are always present, with a
        count of 0 when they never occur (including for a frame without
        columns).
    """
    # Seed the total with every expected score so the labels -3..3 always exist.
    totals = pd.Series(0.0, index=range(-3, 4))
    for col in range(df.shape[1]):
        counts = df[str(col)].value_counts()
        # fill_value=0 treats a label missing on either side as a zero count.
        # A plain ``+=`` aligns on the existing index, so a score that does not
        # occur in the first column would be dropped or turned into NaN.
        totals = totals.add(counts, fill_value=0)
    return totals

In [14]:
# IBHS: load the improvement table and aggregate its score distribution.
ibhs_df = pd.read_csv('ibhs_all_improv_table.csv')
ibhs_dis = find_distribution(ibhs_df)

# FBMHS: same steps on the FBMHS improvement table.
fbmh_df = pd.read_csv('fbmhs_all_improv_table.csv')
fbmh_dis = find_distribution(fbmh_df)

In [15]:
# Display the aggregated IBHS score counts returned by find_distribution.
ibhs_dis

 0    270431.0
-1     25553.0
 1     39052.0
-2      6694.0
 2     14419.0
-3      1118.0
 3         0.0
Name: 0, dtype: float64

In [16]:
def random_generator(dis, lst, program = 'IBHS', if_improv = True):
    """Draw a single improvement score.

    Args:
        dis: 'uniform' to pick uniformly from ``lst``; 'original' to sample
            from the empirical score counts of ``program`` (the module-level
            ``ibhs_dis`` / ``fbmh_dis`` tables).
        lst: candidate values; only used when ``dis == 'uniform'``.
        program: 'IBHS' or 'FBMHS'; only used when ``dis == 'original'``.
            The earlier misspelling 'FMBHS' is still accepted.
        if_improv: only used when ``dis == 'original'``. If truthy the draw is
            restricted to 0, 1, 2, 3; otherwise to 0, -1, -2, -3.

    Returns:
        The sampled score.

    Raises:
        ValueError: if ``dis`` or ``program`` is not one of the supported
            values (previously this silently returned None).
    """
    if dis == 'uniform':
        return random.choice(lst)
    if dis != 'original':
        raise ValueError("dis must be 'uniform' or 'original', got %r" % (dis,))

    if program == 'IBHS':
        table = ibhs_dis
    elif program in ('FBMHS', 'FMBHS'):
        table = fbmh_dis
    else:
        raise ValueError("program must be 'IBHS' or 'FBMHS', got %r" % (program,))

    outcomes = (0, 1, 2, 3) if if_improv else (0, -1, -2, -3)
    weights = [table[value] for value in outcomes]

    # Compare one draw against the *cumulative* weights; testing it against
    # each individual share would skew the distribution towards the last value.
    draw = random.random() * sum(weights)
    running = 0
    for value, weight in zip(outcomes[:-1], weights):
        running += weight
        if draw < running:
            return value
    return outcomes[-1]
        

In [24]:
def make_dummy_data(fields, num_patient, improve_only = True, need_groups = (5, 10, 90),
                    ibhs_improv_prob = 0.7, ibhs_disImprov_prob = 0.7,
                    fbmhs_improv_prob = 0.7, fbmhs_disImprov_prob = 0.7,
                    file_name = 'dummy_data_test', dis = 'uniform',
                    improv = (0, 1, 2, 3), disImprov = (0, -1, -2, -3)):
    """Generate a dummy patient table and write it to ``file_name`` as CSV.

    The first half of the patients (``i < num_patient // 2``) is assigned to
    'IBHS', the rest to 'FBMHS'. Each row is
    ``[str(i), program, score_0, ..., score_{need_groups[-1] - 1}]``.

    Args:
        fields: header row written as the first CSV line.
        num_patient: number of patient rows to generate.
        improve_only: if True, favoured needs always draw from ``improv``,
            unfavoured needs are fixed to 0 and the remaining needs draw from
            ``improv``. If False, the probabilities below decide between
            ``improv`` and ``disImprov``.
        need_groups: cut points ``[a, b, c]``. Needs ``[0, a)`` favour IBHS
            patients, needs ``[a, b)`` favour FBMHS patients, needs ``[b, c)``
            are random.
        ibhs_improv_prob: probability that an IBHS patient improves on a need
            in ``[0, a)``.
        ibhs_disImprov_prob: probability that an FBMHS patient disimproves on
            a need in ``[0, a)``.
        fbmhs_improv_prob: probability that an FBMHS patient improves on a
            need in ``[a, b)``.
        fbmhs_disImprov_prob: probability that an IBHS patient disimproves on
            a need in ``[a, b)``.
        file_name: path of the CSV file to write.
        dis: distribution name forwarded to ``random_generator``
            ('uniform' or 'original').
        improv: candidate improvement scores.
        disImprov: candidate disimprovement scores.

    Returns:
        None. The table is written to ``file_name``.
    """
    improv = list(improv)
    disImprov = list(disImprov)
    # Candidate pool for the random needs when disimprovement is allowed.
    either = list(set(disImprov + improv))

    # NOTE(review): ``program`` is not forwarded to random_generator, so with
    # dis='original' every draw uses that function's default program.

    def favoured_score(improve_prob):
        """Score of a need for a patient of the program that need favours."""
        if improve_only or random.random() <= improve_prob:
            return random_generator(dis, improv)
        return random_generator(dis, disImprov, if_improv = False)

    def unfavoured_score(worsen_prob):
        """Score of a need for a patient of the other program."""
        if improve_only:
            return 0
        if random.random() <= worsen_prob:
            return random_generator(dis, disImprov, if_improv = False)
        return random_generator(dis, improv)

    def random_score():
        """Score of a need that is not tied to either program."""
        if improve_only:
            return random_generator(dis, improv)
        return random_generator(dis, either, if_improv = random.choice([False, True]))

    ibhs_end, fbmhs_end, num_needs = need_groups[0], need_groups[1], need_groups[-1]

    rows = []
    for patient in range(num_patient):
        program = 'IBHS' if patient < num_patient // 2 else 'FBMHS'
        row = [str(patient), program]
        for need in range(num_needs):
            if need < ibhs_end:
                if program == 'IBHS':
                    score = favoured_score(ibhs_improv_prob)
                else:
                    score = unfavoured_score(ibhs_disImprov_prob)
            elif need < fbmhs_end:
                if program == 'FBMHS':
                    score = favoured_score(fbmhs_improv_prob)
                else:
                    score = unfavoured_score(fbmhs_disImprov_prob)
            else:
                score = random_score()
            row.append(score)
        rows.append(row)

    # newline='' lets the csv module control line endings; without it, every
    # row is followed by a blank line on Windows.
    with open(file_name, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        csvwriter.writerows(rows)
                
                    

# Dummy Data Validation <a name="dummy_data_validation"></a>


## Validation Helpers <a name="validate_helpers"></a>

In [37]:
def import_data(name):
    """Load a dummy-data CSV and split it into one frame per program.

    Args:
        name: path (or any other input ``pd.read_csv`` accepts) of a CSV that
            has 'client_id' and 'Program' columns.

    Returns:
        Tuple ``(ibhs, fbmhs)`` of DataFrames holding the rows whose 'Program'
        is 'IBHS' / 'FBMHS'. The 'client_id' and 'Program' columns are
        removed and both frames are re-indexed from 0.
    """
    table = pd.read_csv(name)
    droplist = ['client_id', 'Program']

    def rows_for(program):
        subset = table[table['Program'] == program]
        # Build a new frame instead of dropping in place: an in-place drop on
        # a filtered slice of ``table`` triggers pandas' SettingWithCopy warning.
        return subset.drop(droplist, axis = 1).reset_index(drop=True)

    return (rows_for('IBHS'), rows_for('FBMHS'))

In [132]:
def validate_result(x_item, need_splits):
    """Count, per need group, the entries of ``x_item`` equal to 1, -1 or in between.

    ``x_item`` is cut into ``[0:a]``, ``[a:b]`` and ``[b:]`` with
    ``a, b = need_splits[0], need_splits[1]``; ``need_splits[2]`` is only used
    for the label of the last row.

    Returns a 3x3 DataFrame with one row per group and the number of entries
    that are exactly 1, exactly -1, and strictly between -1 and 1.
    """
    first_cut, second_cut = need_splits[0], need_splits[1]
    segments = (x_item[0:first_cut],
                x_item[first_cut:second_cut],
                x_item[second_cut:])

    def tally(values):
        # Same column order as the result table: ==1, ==-1, strictly between.
        return [sum(values == 1.0),
                sum(values == -1.0),
                sum((values < 1.0) & (values > -1.0))]

    counts = np.array([tally(segment) for segment in segments])
    column_names = ['number_of_item_is_1',
                    'number_of_item_is_-1',
                    'number_of_item_is_betweem_1_and_-1']
    row_labels = ['Items 1-{}'.format(first_cut),
                  'Items {}-{}'.format(first_cut + 1, second_cut),
                  'Items {}-{}'.format(second_cut + 1, need_splits[2])]
    return pd.DataFrame(counts, columns = column_names, index = row_labels)

### Dummy Data group 1 <a name="dummy_data_group_1"></a>

#### Generating Dummy Data 1<a name="generate_dummy_data_1"></a>
5 IBHS, 5 FBMHS, 80 random, only improvements, 30 patients

In [133]:
# Group 1: 30 patients, need cut points 5 / 10 / 90, improvements only.
num_patient = 30
need_splits = [5, 10, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_1.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=True,
    need_groups=need_splits,
    file_name=name,
)


#### Validate Dummy Data 1<a name="validate_dummy_data_1"></a> 

In [137]:
# Reload the group-1 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_1, fbmh_dummy_1 = import_data("dummy_dataset/dummy_group_1.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_1 = twosum_improvement_xneed_all(ibhs_dummy_1, fbmh_dummy_1, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_1 are 1, -1 or strictly in between.
validate_result(x_item_1, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-5,0,5,0
Items 6-10,5,0,0
Items 11-90,32,48,0


### Dummy Data group 2 <a name="dummy_data_group_2"></a>

#### Generate Dummy Data 2<a name="generate_dummy_data_2"></a>
5 IBHS, 5 FBMHS, 80 random, improvements and disimprovements, 30 patients

In [138]:
# Group 2: 30 patients, need cut points 5 / 10 / 90, improvements and disimprovements.
num_patient = 30
need_splits = [5, 10, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_2.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=False,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 2<a name="validate_dummy_data_2"></a> 

In [140]:
# Reload the group-2 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_2, fbmh_dummy_2 = import_data("dummy_dataset/dummy_group_2.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_2 = twosum_improvement_xneed_all(ibhs_dummy_2, fbmh_dummy_2, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_2 are 1, -1 or strictly in between.
validate_result(x_item_2, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-5,0,5,0
Items 6-10,5,0,0
Items 11-90,38,42,0


### Dummy Data group 3 <a name="dummy_data_group_3"></a>

#### Generate Dummy Data 3<a name="generate_dummy_data_3"></a>
5 IBHS, 5 FBMHS, 80 random, only improvements, 200 patients.

In [141]:
# Group 3: 200 patients, need cut points 5 / 10 / 90, improvements only.
num_patient = 200
need_splits = [5, 10, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_3.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=True,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 3<a name="validate_dummy_data_3"></a> 

In [143]:
# Reload the group-3 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_3, fbmh_dummy_3 = import_data("dummy_dataset/dummy_group_3.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_3 = twosum_improvement_xneed_all(ibhs_dummy_3, fbmh_dummy_3, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_3 are 1, -1 or strictly in between.
validate_result(x_item_3, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-5,0,5,0
Items 6-10,5,0,0
Items 11-90,35,45,0


### Dummy Data group 4 <a name="dummy_data_group_4"></a>

#### Generate Dummy Data Group 4<a name="generate_dummy_data_4"></a>
Group 4: 5 IBHS, 5 FBMHS, 80 random, improvements and disimprovements, 200 patients.

In [149]:
# Group 4: 200 patients, need cut points 5 / 10 / 90, improvements and disimprovements.
num_patient = 200
need_splits = [5, 10, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_4.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=False,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 4<a name="validate_dummy_data_4"></a> 

In [150]:
# Reload the group-4 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_4, fbmh_dummy_4 = import_data("dummy_dataset/dummy_group_4.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_4 = twosum_improvement_xneed_all(ibhs_dummy_4, fbmh_dummy_4, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_4 are 1, -1 or strictly in between.
validate_result(x_item_4, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-5,2,3,0
Items 6-10,3,2,0
Items 11-90,43,37,0


### Dummy Data group 5 <a name="dummy_data_group_5"></a>

#### Generate Dummy Data 5<a name="generate_dummy_data_5"></a>
Group 5: 30 IBHS, 30 FBMHS, 30 random, only improvements, 30 patients

In [151]:
# Group 5: 30 patients, need cut points 30 / 60 / 90, improvements only.
num_patient = 30
need_splits = [30, 60, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_5.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=True,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 5<a name="validate_dummy_data_5"></a> 

In [153]:
# Reload the group-5 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_5, fbmh_dummy_5 = import_data("dummy_dataset/dummy_group_5.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_5 = twosum_improvement_xneed_all(ibhs_dummy_5, fbmh_dummy_5, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_5 are 1, -1 or strictly in between.
validate_result(x_item_5, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-30,0,30,0
Items 31-60,30,0,0
Items 61-90,19,11,0


### Dummy Data group 6 <a name="dummy_data_group_6"></a>

#### Generate Dummy Data Group 6<a name="generate_dummy_data_6"></a>
Group 6: 30 IBHS, 30 FBMHS, 30 random, improvements and disimprovements, 30 patients

In [156]:
# Group 6: 30 patients, need cut points 30 / 60 / 90, improvements and disimprovements.
num_patient = 30
need_splits = [30, 60, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_6.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=False,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 6<a name="validate_dummy_data_6"></a> 

In [157]:
# Reload the group-6 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_6, fbmh_dummy_6 = import_data("dummy_dataset/dummy_group_6.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_6 = twosum_improvement_xneed_all(ibhs_dummy_6, fbmh_dummy_6, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_6 are 1, -1 or strictly in between.
validate_result(x_item_6, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-30,17,13,0
Items 31-60,15,15,0
Items 61-90,13,17,0


### Dummy Data group 7 <a name="dummy_data_group_7"></a>

#### Generate Dummy Data Group 7<a name="generate_dummy_data_7"></a>
Group 7: 30 IBHS, 30 FBMHS, 30 random, only improvements, 200 patients

In [158]:
# Group 7: 200 patients, need cut points 30 / 60 / 90, improvements only.
num_patient = 200
need_splits = [30, 60, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_7.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=True,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 7<a name="validate_dummy_data_7"></a> 

In [159]:
# Reload the group-7 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_7, fbmh_dummy_7 = import_data("dummy_dataset/dummy_group_7.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_7 = twosum_improvement_xneed_all(ibhs_dummy_7, fbmh_dummy_7, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_7 are 1, -1 or strictly in between.
validate_result(x_item_7, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-30,0,30,0
Items 31-60,30,0,0
Items 61-90,14,16,0


### Dummy Data group 8 <a name="dummy_data_group_8"></a>

#### Generate Dummy Data Group 8<a name="generate_dummy_data_8"></a>
Group 8: 30 IBHS, 30 FBMHS, 30 random, both improvements and disimprovements, 200 patients

In [170]:
# Group 8: 200 patients, need cut points 30 / 60 / 90, improvements and disimprovements.
num_patient = 200
need_splits = [30, 60, 90]
fields_need = [str(need) for need in range(1, need_splits[-1] + 1)]
fields = ['client_id', 'Program', *fields_need]
name = 'dummy_dataset/dummy_group_8.csv'
make_dummy_data(
    fields,
    num_patient,
    improve_only=False,
    need_groups=need_splits,
    ibhs_improv_prob=1,
    ibhs_disImprov_prob=1,
    fbmhs_improv_prob=1,
    fbmhs_disImprov_prob=1,
    file_name=name,
    dis='uniform',
    improv=[0, 1, 2, 3],
    disImprov=[0, -1, -2, -3],
)


#### Validate Dummy Data 8<a name="validate_dummy_data_8"></a> 

In [171]:
# Reload the group-8 CSV as separate IBHS / FBMHS frames.
ibhs_dummy_8, fbmh_dummy_8 = import_data("dummy_dataset/dummy_group_8.csv")
# twosum_improvement_xneed_all is not defined in this section; presumably it comes from
# decision_support_model.ipynb (loaded via %run) - TODO confirm what batch_num and n control.
x_item_8 = twosum_improvement_xneed_all(ibhs_dummy_8, fbmh_dummy_8, batch_num = 1, n = 1)
# Tabulate, per need group, how many entries of x_item_8 are 1, -1 or strictly in between.
validate_result(x_item_8, need_splits)

Unnamed: 0,number_of_item_is_1,number_of_item_is_-1,number_of_item_is_betweem_1_and_-1
Items 1-30,7,23,0
Items 31-60,19,11,0
Items 61-90,21,9,0
