In [1]:
import pandas as pd
import numpy as np

import more_itertools as mit
import itertools
import random

from graphviz import Digraph
import lxml
from bs4 import BeautifulSoup as bs

In [2]:
# Column labels are supplied explicitly via `names=` rather than read from the
# file; the last label ("class") is the one later cells slice off with [:-1].
# NOTE(review): presumably the raw CSV has no header row — confirm.
cmc_columns = [
    "age",
    "wife_ed",
    "husb_ed",
    "no_kids",
    "wife_rel",
    "wife_works",
    "husb_occupation",
    "SOL_index",
    "media_exp",
    "class",
]
data = pd.read_csv('../../datasets/birth_control/cmc.csv', names=cmc_columns)
data.head()

Unnamed: 0,age,wife_ed,husb_ed,no_kids,wife_rel,wife_works,husb_occupation,SOL_index,media_exp,class
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [3]:
# Distinct observed values for every column except the last one.
values = {col: set(data[col]) for col in data.columns[:-1]}
for col, distinct in values.items():
    print(f"{col}..... {distinct}")

age..... {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49}
wife_ed..... {1, 2, 3, 4}
husb_ed..... {1, 2, 3, 4}
no_kids..... {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16}
wife_rel..... {0, 1}
wife_works..... {0, 1}
husb_occupation..... {1, 2, 3, 4}
SOL_index..... {1, 2, 3, 4}
media_exp..... {0, 1}


In [20]:
# Replace the raw value sets of the two wide-ranging columns with four
# range labels each, so every column ends up with at most four entries.
values.update({
    "age": ["[16:19]", "[20:29]", "[30:39]", "[40:49]"],
    "no_kids": ["[0:0]", "[1:4]", "[5:10]", "[11:16]"],
})
values

{'age': ['[16:19]', '[20:29]', '[30:39]', '[40:49]'],
 'wife_ed': {1, 2, 3, 4},
 'husb_ed': {1, 2, 3, 4},
 'no_kids': ['[0:0]', '[1:4]', '[5:10]', '[11:16]'],
 'wife_rel': {0, 1},
 'wife_works': {0, 1},
 'husb_occupation': {1, 2, 3, 4},
 'SOL_index': {1, 2, 3, 4},
 'media_exp': {0, 1}}

In [7]:
def flatten(ls):
    """Concatenate the items of every iterable in ``ls`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    return [item for group in ls for item in group]

In [8]:
# Number of distinct entries per column (all columns except the last).
# set() is applied because `values` holds both sets and lists.
choices = {}
for column in data.columns[:-1]:
    choices[column] = len(set(values[column]))
choices

{'age': 4,
 'wife_ed': 4,
 'husb_ed': 4,
 'no_kids': 4,
 'wife_rel': 2,
 'wife_works': 2,
 'husb_occupation': 4,
 'SOL_index': 4,
 'media_exp': 2}

In [124]:
# Pool of every way to split {1, 2, 3, 4} into 2, 3 or 4 non-empty groups,
# listed in that order (the single-group partition is deliberately not included).
ps = [
    partition
    for n_groups in (2, 3, 4)
    for partition in mit.set_partitions([1, 2, 3, 4], n_groups)
]
for n, p in enumerate(ps):
    print(n, p)

0 [[1], [2, 3, 4]]
1 [[1, 2], [3, 4]]
2 [[2], [1, 3, 4]]
3 [[1, 2, 3], [4]]
4 [[2, 3], [1, 4]]
5 [[1, 3], [2, 4]]
6 [[3], [1, 2, 4]]
7 [[1], [2], [3, 4]]
8 [[1], [2, 3], [4]]
9 [[1], [3], [2, 4]]
10 [[1, 2], [3], [4]]
11 [[2], [1, 3], [4]]
12 [[2], [3], [1, 4]]
13 [[1], [2], [3], [4]]


In [169]:
def create_4tree(partitions=None):
    """Build a random one-level generalization hierarchy as an XML snippet.

    One set partition is drawn at random. Its elements are then relabelled
    1, 2, 3, ... in the order they appear in the partition, so that every
    group covers a contiguous range and can be written as ``[min:max]``.

    Parameters
    ----------
    partitions : sequence of partitions, optional
        Pool to draw from. Each partition is a list of groups and each group
        is a non-empty list of hashable values. Defaults to the notebook-level
        ``ps``. Passing a one-element pool makes the result deterministic.

    Returns
    -------
    tuple of (str, dict)
        The ``<vgh>`` XML text (one ``<node>`` per group) and the mapping
        ``{original value: relabelled value}`` that was applied.
    """
    if partitions is None:
        partitions = ps
    part = random.choice(partitions)

    # Relabel by order of appearance: the first element seen becomes 1, etc.
    mapping = {
        original: position
        for position, original in enumerate(itertools.chain.from_iterable(part), start=1)
    }
    relabelled_groups = [[mapping[value] for value in group] for group in part]

    # The root spans all relabelled values; for the default pool this is [1:4].
    xml_lines = [f"<vgh value='[1:{len(mapping)}]'>"]
    xml_lines.extend(
        f"\t<node value='[{min(group)}:{max(group)}]'/>" for group in relabelled_groups
    )
    xml_lines.append("</vgh>")
    return "\n".join(xml_lines), mapping

# Draw one random hierarchy and show its XML next to the relabelling used.
res, m = create_4tree()
print(f"{res} {m}")

<vgh value='[1:4]'>
	<node value='[1:1]'/>
	<node value='[2:2]'/>
	<node value='[3:4]'/>
</vgh> {1: 1, 3: 2, 2: 3, 4: 4}


In [170]:
# Relabelling used for each 4-valued attribute, keyed by attribute name.
mappings = {}

# Collect the XML fragments in order and join them once at the end.
xml_parts = ["""
<config method='Datafly' k='2'>
<input filename='../../datasets/birth_control/cmc_ups_4cat.csv' separator=','/>
 <!-- If left blank, separator will be set as comma by default.-->
<output filename='../anon_data/test_tree_sample/k2.csv' format ='genVals'/>
 <!-- Format options = {genVals, genValsDist, anatomy}. If left blank,
output format will be set as genVals by default.-->
<id> <!-- List of identifier attributes, if any, these will be excluded from the output -->
</id>
<qid>"""]

# One <att> per column except the last: two-valued columns get a fixed
# [0:1] hierarchy, all others get a randomly drawn tree.
for i, attr in enumerate(data.columns[:-1]):
    xml_parts.append(f"<att index='{i}' name='{attr}'>")
    if choices[attr] != 2:
        tree, mapping = create_4tree()
        mappings[attr] = mapping
        xml_parts.append(tree)
    else:
        xml_parts.append("<vgh value='[0:1]'/>")
    xml_parts.append("</att>")

xml_parts.append("</qid>")

xml_parts.append("""
<sens>
<att index='9' name='class'/>
</sens>""")

# Record, per attribute, which relabelled value stands for which original one.
xml_parts.append("<mapping>")
for attr, mapp in mappings.items():
    xml_parts.append(f"<att name='{attr}'>")
    for orig, m in mapp.items():
        xml_parts.append(f"<node used='{m}' mapbackto='{orig}'/>")
    xml_parts.append("</att>")
xml_parts.append("</mapping>")
xml_parts.append("</config>")

full_xml = "".join(xml_parts)

xml_tree = bs(full_xml, 'xml')
print(xml_tree.prettify())

<?xml version="1.0" encoding="utf-8"?>
<config k="2" method="Datafly">
 <input filename="../../datasets/birth_control/cmc_ups_4cat.csv" separator=","/>
 <!-- If left blank, separator will be set as comma by default.-->
 <output filename="../anon_data/test_tree_sample/k2.csv" format="genVals"/>
 <!-- Format options = {genVals, genValsDist, anatomy}. If left blank,
output format will be set as genVals by default.-->
 <id>
  <!-- List of identifier attributes, if any, these will be excluded from the output -->
 </id>
 <qid>
  <att index="0" name="age">
   <vgh value="[1:4]">
    <node value="[1:2]"/>
    <node value="[3:4]"/>
   </vgh>
  </att>
  <att index="1" name="wife_ed">
   <vgh value="[1:4]">
    <node value="[1:2]"/>
    <node value="[3:4]"/>
   </vgh>
  </att>
  <att index="2" name="husb_ed">
   <vgh value="[1:4]">
    <node value="[1:1]"/>
    <node value="[2:4]"/>
   </vgh>
  </att>
  <att index="3" name="no_kids">
   <vgh value="[1:4]">
    <node value="[1:1]"/>
    <node valu