# Reprosyn within Python

Here we briefly demonstrate how to use the generators in Reprosyn from Python, using the 1% Census microdata sample.

Currently, the implemented methods shown here are INDHIST, PRIVBAYES, MST, IPF, and CTGAN.

In [1]:
import pandas as pd
from reprosyn.methods import MST, IPF, CTGAN, PRIVBAYES, DS_INDHIST

In [2]:
# Location of the small census microdata CSV used throughout this notebook.
CENSUS_URL = 'https://raw.githubusercontent.com/alan-turing-institute/reprosyn/main/src/reprosyn/datasets/2011-census-microdata/2011-census-microdata-small.csv'

# Read the CSV and drop the 'Person ID' column in one chained expression,
# rather than mutating the frame in place after it has been created.
census = pd.read_csv(CENSUS_URL).drop(columns=['Person ID'])

census.head()

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,2,1,2,6,2,2,1,2,1,2,5,8,2,-9,4
1,E12000001,H,5,1,1,4,1,2,1,1,1,2,1,8,6,4,3
2,E12000001,H,3,1,2,4,1,2,1,1,1,1,1,6,11,3,4
3,E12000001,H,3,1,2,2,1,2,1,2,1,2,1,7,7,3,2
4,E12000001,H,3,1,1,5,4,2,1,1,1,2,1,1,4,3,2


## INDHIST

In [3]:
# Build the INDHIST generator on a copy of the census frame, asking for 10 rows.
indhist = DS_INDHIST(
    dataset=census.copy(),
    size=10,
)

In [4]:
# Run the generator; the result is read from `indhist.output` in the next cell.
indhist.run()

In [5]:
# Display the generated table (10 rows here, matching size=10).
indhist.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000009,H,4,2,2,8,2,1,-9,2,-9,9,2,4,5,1,2
1,E12000002,H,4,3,2,7,3,2,2,2,4,7,9,5,9,4,2
2,E12000009,H,4,2,1,2,3,1,1,3,4,9,4,-9,12,2,3
3,E12000002,H,1,2,2,6,2,1,2,1,5,7,5,9,8,-9,1
4,E12000009,H,3,2,1,8,4,1,1,-9,1,7,1,4,4,-9,2
5,E12000004,H,4,1,2,3,2,2,-9,2,2,6,7,-9,4,-9,-9
6,E12000004,H,4,2,1,2,5,2,2,5,5,6,9,7,12,-9,4
7,E12000001,H,-9,1,2,8,5,1,-9,5,2,4,8,9,10,4,2
8,E12000009,H,3,1,2,4,3,2,2,4,-9,9,9,8,3,3,-9
9,E12000001,H,6,2,1,2,4,1,-9,4,1,1,4,2,10,-9,-9


## PRIVBAYES

In [6]:
# Build the PRIVBAYES generator on a copy of the census frame:
# 10 rows requested, with epsilon set to 1.
pbayes = PRIVBAYES(
    dataset=census.copy(),
    size=10,
    epsilon=1,
)

In [7]:
# Run the generator; the result is read from `pbayes.output` in the next cell.
pbayes.run()

In [8]:
# Display the generated table (10 rows here, matching size=10).
pbayes.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000009,H,2,1,2,4,2,2,1,1,1,1,1,9,5,3,4
1,E12000003,H,2,1,2,7,2,2,-9,2,-9,-9,5,4,3,-9,2
2,E12000001,H,1,1,2,4,1,2,1,1,1,2,1,6,11,3,1
3,E12000004,H,2,1,1,4,2,2,2,1,4,6,1,5,6,2,2
4,E12000004,H,2,1,1,1,1,1,1,1,4,6,-9,-9,-9,-9,-9
5,E12000009,H,-9,1,2,4,2,2,1,1,1,2,1,1,3,3,2
6,E12000004,H,1,1,2,8,5,2,1,2,1,2,5,8,11,-9,4
7,E12000009,H,2,1,1,4,2,2,1,1,1,2,1,3,2,3,2
8,E12000009,H,5,1,2,2,1,2,1,1,1,2,1,7,4,3,4
9,E12000009,H,3,1,1,4,1,2,1,1,1,2,1,3,4,2,2


## MST

In [9]:
# Number of rows to request from the generator; the IPF and CTGAN cells
# further down reuse this name.
size = 10

# Build the MST generator on a copy of the census frame with epsilon set to 1.
mst_gen = MST(
    dataset=census.copy(),
    size=size,
    epsilon=1,
)

In [10]:
# Inspect the generator's settable parameters (for MST the call returns
# epsilon, delta and degree). To change one, assign into the params dict:
#     gen.params['param'] = val   (gen being the generator, e.g. mst_gen)
mst_gen.get_parameters()

{'epsilon': 1, 'delta': 1e-09, 'degree': 2}

In [11]:
# Run the generator; the result is read from `mst_gen.output` in the next cell.
mst_gen.run()

In [12]:
# Display the generated table (10 rows here, matching `size`).
mst_gen.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,2,1,1,4,2,2,1,2,1,2,1,2,3,3,1
1,E12000009,H,2,1,1,5,2,2,1,3,1,1,1,5,8,4,3
2,E12000009,H,2,1,1,3,2,2,1,1,1,2,1,3,6,3,2
3,E12000003,H,3,1,1,4,4,2,1,3,1,1,2,4,4,4,2
4,E12000004,H,4,1,1,1,1,2,1,1,1,1,-9,-9,-9,-9,-9
5,E12000003,H,1,1,1,8,5,2,1,3,1,2,5,3,9,-9,2
6,E12000002,H,3,1,1,2,1,2,1,2,1,1,1,1,11,2,1
7,E12000002,H,2,1,1,5,2,2,1,2,1,2,2,5,2,3,3
8,E12000003,H,5,1,2,5,4,2,1,1,1,2,1,-9,-9,3,2
9,E12000001,H,2,1,1,3,1,2,1,1,1,1,3,9,12,-9,4


## IPF

IPF only has one parameter, `marginals`. Additionally, IPF doesn't scale well and doesn't cope with all 17 features, so we drop a few of the larger features.

In [13]:
# Columns removed before running IPF (see the note above this cell).
dropped = ['Industry', 'Economic Activity', 'Occupation', 'Approximated Social Grade']

# `drop` already returns a new frame and leaves `census` untouched, so neither
# `inplace=False` (the default) nor a trailing `.copy()` is needed.
census_slim = census.drop(columns=dropped)

In [14]:
# Build the IPF generator on a copy of the slimmed census frame.
# NOTE(review): the marginal tuples look like column positions in
# `census_slim` — confirm against the IPF documentation.
ipf_gen = IPF(
    dataset=census_slim.copy(),
    size=size,
    marginals=[(0, 1), (2, 3), (1, 2, 4)],
)

In [15]:
# Run IPF; a progress bar is printed while it runs. The result is read from
# `ipf_gen.output` in the next cell.
ipf_gen.run()

100%|█| 10/10 [00:04<00:00,  2.10it/s]


In [16]:
# Display the generated table; it contains only the columns kept in `census_slim`.
ipf_gen.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Hours worked per week
0,E12000005,H,1,1,2,6,3,2,1,1,3,6,1
1,E12000002,H,3,1,1,7,5,1,2,1,-9,9,1
2,E12000009,H,2,1,1,3,1,1,2,4,4,3,-9
3,E12000004,H,2,1,2,3,1,1,-9,-9,4,6,2
4,E12000001,H,1,1,1,6,2,1,2,3,2,4,-9
5,E12000009,H,1,1,1,1,3,2,2,4,5,3,2
6,E12000001,H,3,1,1,7,1,1,2,2,3,4,3
7,E12000004,H,2,1,2,6,5,2,-9,1,1,9,-9
8,E12000004,H,5,1,2,3,2,2,2,-9,4,-9,1
9,E12000004,H,5,1,1,1,2,2,2,2,-9,5,1


## CTGAN



In [None]:
# Build the CTGAN generator on a copy of the census frame, with epochs set to 10.
# `size` is the value defined in the MST section above.
ctgan = CTGAN(
    dataset=census.copy(),
    size=size,
    epochs=10,
)

In [None]:
# Run the generator; the result is read from `ctgan.output` in the next cell.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# has not been run here.
ctgan.run()

In [None]:
# Display the generator's `output` attribute (no saved output for this cell).
ctgan.output