# Manipulate Attribute Value Frequencies
### Idea:
* Split the dataset into subsets, one per attribute value (or per value range for numerical attributes)
* Draw as many records as needed from each subset to compose the desired attribute value distribution

In [1]:
import pandas as pd
from dataset_modifier import random_sample_wrapper
# Load the records, supplying the column names explicitly.
filepath = "data/2021_NCVR_Panse_001/dataset_ncvr_dirty.csv"
col_names = "sourceID,globalID,localID,FIRSTNAME,MIDDLENAME,LASTNAME,YEAROFBIRTH,PLACEOFBIRTH,COUNTRY,CITY,PLZ,STREET,GENDER,ETHNIC,RACE".split(",")
df = pd.read_csv(
    filepath,
    names=col_names,
    # PLZ is parsed as text and YEAROFBIRTH as integer.
    dtype={"PLZ": str, "YEAROFBIRTH": int},
    # Do not apply pandas' default NaN markers while parsing.
    keep_default_na=False,
)
df.head()

Unnamed: 0,sourceID,globalID,localID,FIRSTNAME,MIDDLENAME,LASTNAME,YEAROFBIRTH,PLACEOFBIRTH,COUNTRY,CITY,PLZ,STREET,GENDER,ETHNIC,RACE
0,A,AM28693,1fdc308dcc35344c6d5c20845fad641d,SUE,BARNES,SHUFFLER,1946,,BURKE,VALDESE,28690,LAKEVIEW ACRES,F,NL,W
1,A,CZ67291,b36b8c16c743ed415c6a7f3fc1a3b859,RONALD,EUGENE,LATTIMER,1953,OH,MOORE,PINEHURST,28374,BURNING TREE,M,NL,W
2,A,BN204837,401b7f6f5e107dbb009da197a2b05d9e,DEIDRE,MARLANA,MARTIN,1973,NC,FORSYTH,WINSTON SALEM,27105,CARVER GLEN,F,NL,B
3,A,EH737737,85f059d91b803b5835a8b9762a3170cb,KURT,,BALEN,1968,PA,WAKE,APEX,27502,ASHLEY DOWNS,M,NL,W
4,A,DT41462,f70a42ce915731cabafc5daa1bfbceeb,WORTH,BROWN,FOUTZ,1919,,ROWAN,SALISBURY,28146,MORRISON,M,NL,W


### 1. Categorical attributes

#### 1.1 Explicit specification of portions
Given for example: 75% male, 25% female

In [2]:
# Target specification: the attribute to rebalance, the share each of its
# values should have in the result, and the total number of records to draw.
attr = "GENDER"
desired_distr = dict(M=0.75, F=0.25)
desired_size = 50000

In [3]:
# All values that occur in the base dataset for the attribute being rebalanced.
# The column is selected through `attr` instead of being hard-coded, so the
# key validation below keeps working when a different attribute is chosen.
possible_values = df[attr].unique()
possible_values

array(['F', 'M', 'U'], dtype=object)

In [4]:
# Every value named in the desired distribution must actually occur in the data.
all(value in possible_values for value in desired_distr)

True

In [5]:
# random_sample requires a total size divisible by two, so sizes are snapped to an even integer
def round_to_even(f):
    """Round ``f`` to the nearest even integer.

    The value is halved, rounded with the built-in ``round`` and doubled
    again. Ties (odd integer inputs) are therefore resolved by ``round``'s
    half-to-even rule applied to ``f / 2``: 1 -> 0, 3 -> 4, 5 -> 4.

    :param f: number to round
    :return: an even integer close to ``f``
    """
    half = f / 2
    return 2 * round(half)

In [6]:
# check if the desired size is possible
def size_possible():
    """Tell whether every subset is large enough for the requested draw.

    For each attribute value in ``desired_distr`` the number of records that
    will be requested from its subset is compared with the number of records
    carrying that value in ``df``. The requested count is computed with
    ``round_to_even`` -- the same rounding the sampling step applies -- so the
    check cannot pass for a size that the draw would then exceed.

    :return: ``True`` if all subsets hold enough records, ``False`` otherwise
    """
    for key, portion in desired_distr.items():
        requested = round_to_even(portion * desired_size)
        available = (df[attr] == key).sum()
        if requested > available:
            return False
    return True
size_possible()

True

In [7]:
# Draw one random sample per attribute value, sized by that value's desired share.
result_subsets = [
    random_sample_wrapper(
        df.loc[df[attr] == value],
        total_sample_size=round_to_even(share * desired_size),
        seed=42,
    )
    for value, share in desired_distr.items()
]

In [8]:
# Concatenate the per-value samples and show the relative frequency of each
# value. The column is selected through `attr` so the check follows the
# attribute that was actually rebalanced.
result = pd.concat(result_subsets)
result[attr].value_counts() / result.shape[0]

M    0.75
F    0.25
Name: GENDER, dtype: float64

#### 1.2 Implicit specification of portions
##### Example:
* The portion of males should be 150% of its portion in the base dataset
* -> if original is 50% M, 45% F, 5% U, then it should come out as 75% M, 22.5% F, 2.5% U

In [None]:
# TODO

### 2. Numerical Attributes

In [None]:
# TODO