## Reading datasets, preprocessing, and combining

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Original datasets
original_df1 = pd.read_csv('data/USCensus1990_1.csv')
original_df2 = pd.read_csv('data/USCensus1990_2.csv')

# We decided that the caseid was not an important field for the model to pick up on
#     and the resulting synthetic datasets do not have this field.
original_df1 = original_df1.drop('caseid', axis=1)
original_df2 = original_df2.drop('caseid', axis=1)

# Synthetic datasets
synth_df1 = pd.read_csv('data/synthetic_data.csv')
synth_df2 = pd.read_csv('data/synthetic_data_2.csv')

# Because both of these data sets are large, we will just sample 10000 entries from
#     each dataset to avoid jupyter complaining/crashing due to lack of memory.
original_df1 = original_df1.sample(n=10000, random_state=420)
original_df2 = original_df2.sample(n=10000, random_state=420)
synth_df1    = synth_df1.sample(n=10000, random_state=0xbeef)
synth_df2    = synth_df2.sample(n=10000, random_state=0xbeef)

# Lastly, the synthetic datasets are floats, and the original datasets are integers so
#     just adjust them so they are all integers
synth_df1 = synth_df1.astype(int)
synth_df2 = synth_df2.astype(int)

### Combining both synthetic datasets and both original datasets
Keep in mind, the combination of the original datasets is not something which would be done in a real life scenario if this method was used. This is only being done for testing purposes

In [3]:
columns = original_df1.columns # They're all the same

# Using numpy to do this because pd.merge didn't want to cooperate
original_df = np.vstack((original_df1, original_df2))
synth_df = np.vstack((synth_df1, synth_df2))

# Finally, conver them back into dataframes
original_df = pd.DataFrame(original_df, columns=columns)
synth_df = pd.DataFrame(synth_df, columns=columns)

In [4]:
original_df

Unnamed: 0,dAge,dAncstry1,dAncstry2,iAvail,iCitizen,iClass,dDepart,iDisabl1,iDisabl2,iEnglish,...,iTmpabsnt,dTravtime,iVietnam,dWeek89,iWork89,iWorklwk,iWWII,iYearsch,iYearwrk,dYrsserv
0,1,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,0
1,7,1,2,0,0,1,3,2,2,0,...,0,5,0,1,1,1,0,11,1,0
2,4,1,1,0,0,2,3,2,2,0,...,0,1,0,2,1,1,0,14,1,0
3,5,11,1,0,0,3,4,2,2,0,...,0,3,0,2,1,1,0,11,1,0
4,3,1,1,0,0,3,4,2,2,0,...,0,4,0,2,1,1,0,10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2,11,2,0,0,1,0,2,2,0,...,3,0,0,1,1,2,0,10,1,0
19996,1,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
19997,3,1,1,0,0,1,3,2,2,0,...,0,4,0,2,1,1,0,10,1,0
19998,3,11,1,0,0,1,4,1,2,0,...,0,3,0,2,1,1,0,10,1,2


In [5]:
synth_df

Unnamed: 0,dAge,dAncstry1,dAncstry2,iAvail,iCitizen,iClass,dDepart,iDisabl1,iDisabl2,iEnglish,...,iTmpabsnt,dTravtime,iVietnam,dWeek89,iWork89,iWorklwk,iWWII,iYearsch,iYearwrk,dYrsserv
0,4,9,1,0,0,2,5,2,2,0,...,0,4,0,2,1,1,0,12,1,0
1,5,2,0,0,0,0,0,2,2,0,...,1,0,0,0,2,2,0,15,5,0
2,8,4,2,0,0,0,0,2,2,1,...,4,0,0,0,2,2,0,7,4,0
3,7,0,1,0,0,0,0,2,2,0,...,2,0,0,0,2,2,0,3,6,0
4,2,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5,1,4,0,0,3,3,2,2,0,...,0,3,0,2,1,1,0,15,1,0
19996,8,0,4,0,0,0,0,1,1,0,...,3,0,0,0,2,3,0,6,4,0
19997,6,6,4,0,0,1,2,2,2,1,...,0,6,0,2,1,1,0,9,1,0
19998,6,1,4,0,0,4,3,2,2,0,...,0,5,0,2,1,1,0,14,1,1


## Comparisons
Now we run the same calculations on both the combined synthetic sets and the combined original sets and compare the difference between each other

In [6]:
def percent_difference(x, y):
    return abs(x - y) / ((x + y) / 2) * 100

### Comparing Basic Metrics

In [7]:
def mean_metric_df(attribute: str) -> pd.DataFrame:
    
    original_average = original_df[attribute].mean()
    synth_average = synth_df[attribute].mean()
    error = percent_difference(original_average, synth_average)
    
    return pd.DataFrame({
        'original avg': [original_average],
        'synth avg': [synth_average],
        'error': [error]
    })

In [8]:
attributes_of_interest = [
    'dAge'      ,
    'dIncome1'  ,
    'dIncome2'  ,
    'dIncome4'  ,
    'dTravtime' ,
    'dYrsserv'  ,
]
for attribute in attributes_of_interest:
    print(f'\nAverage {attribute}:')
    print(mean_metric_df(attribute))


Average dAge:
   original avg  synth avg     error
0       3.82485     3.8231  0.045764

Average dIncome1:
   original avg  synth avg     error
0       0.90105    0.91675  1.727363

Average dIncome2:
   original avg  synth avg     error
0       0.04975     0.0465  6.753247

Average dIncome4:
   original avg  synth avg     error
0        0.2002    0.21295  6.172092

Average dTravtime:
   original avg  synth avg     error
0        1.4895     1.5014  0.795747

Average dYrsserv:
   original avg  synth avg      error
0         0.138    0.15515  11.700495


### Comparing Correlations Between Attributes

In [9]:
def correlation_df(x: str, y: str, order: int) -> pd.DataFrame:
    
    original_coef = np.polyfit(original_df[x], original_df[y], order)
    synth_coef = np.polyfit(synth_df[x], synth_df[y], order)
    coef_errors = list()
    
    for coef_1, coef_2 in zip(original_coef, synth_coef):
        coef_errors.append(percent_difference(coef_1, coef_2))
        
    return pd.DataFrame({
        'original': original_coef,
        'synth': synth_coef,
        'error': coef_errors
    })

In [10]:
correlations_of_interest = [
    ('iMarital'  , 'dAge'      , 1),
    ('dRpincome' , 'dRearning' , 1),
    ('iImmigr'   , 'iCitizen'  , 1),
    ('iRrelchld' , 'dAge'      , 1),
    ('iRemplpar' , 'iDisabl1'  , 1),
    ('iRemplpar' , 'iDisabl2'  , 1),
]
for correlation in correlations_of_interest:
    attr_1, attr_2, order = correlation
    print(f'\nCorrelation: {attr_1} vs {attr_2}')
    print(correlation_df(attr_1, attr_2, order))


Correlation: iMarital vs dAge
   original     synth     error
0 -0.773432 -0.846135 -8.978079
1  5.302608  5.540247  4.383322

Correlation: dRpincome vs dRearning
   original     synth      error
0  0.884787  0.869264   1.770011
1 -0.162393 -0.106835 -41.272203

Correlation: iImmigr vs iCitizen
   original     synth      error
0  0.459806  0.462904   0.671379
1  0.083414  0.072360  14.192767

Correlation: iRrelchld vs dAge
   original     synth     error
0 -3.543027 -3.504078 -1.105385
1  4.750643  4.736088  0.306858

Correlation: iRemplpar vs iDisabl1
   original     synth     error
0 -0.010311 -0.010289 -0.212097
1  1.796797  1.819534  1.257488

Correlation: iRemplpar vs iDisabl2
   original     synth     error
0 -0.010549 -0.010451 -0.926599
1  1.835518  1.845448  0.539559
