In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sdv.metadata import SingleTableMetadata
from sklearn.preprocessing import LabelEncoder

In [2]:
from sdv.datasets.demo import get_available_demos

get_available_demos(modality='single_table')

Unnamed: 0,dataset_name,size_MB,num_tables
0,KRK_v1,0.06,1
1,adult,3.91,1
2,alarm,4.52,1
3,asia,1.28,1
4,census,98.17,1
5,census_extended,4.95,1
6,child,3.2,1
7,covtype,255.65,1
8,credit,68.35,1
9,expedia_hotel_logs,0.2,1


In [3]:
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

original_data = real_data
print(original_data)

                       guest_email  has_rewards room_type  amenities_fee  \
0          michaelsanders@shaw.net        False     BASIC          37.89   
1                randy49@brown.biz        False     BASIC          24.37   
2            webermelissa@neal.com         True    DELUXE           0.00   
3                  gsims@terry.com        False     BASIC            NaN   
4                misty33@smith.biz        False     BASIC          16.45   
..                             ...          ...       ...            ...   
495  laurabennett@jones-duncan.net        False     BASIC           8.71   
496             johnny71@cook.info        False     BASIC          16.31   
497      ygarcia@ballard-lopez.net        False     BASIC          30.59   
498            thomasdale@hall.com        False     BASIC           1.93   
499        danieltaylor@harper.com        False     BASIC           3.84   

    checkin_date checkout_date  room_rate  \
0    27 Dec 2020   29 Dec 2020     131.23 

In [4]:
original_data.dropna(inplace=True)

original_data.dropna(axis=1, inplace=True)

## Real Data

In [5]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to the categorical variables in the dataset
for column in original_data.columns:
    if original_data[column].dtype == 'object':
        original_data[column] = label_encoder.fit_transform(original_data[column])
original_data['has_rewards'] = original_data['has_rewards'].astype(int)
print(original_data)

     guest_email  has_rewards  room_type  amenities_fee  checkin_date  \
0            258            0          0          37.89           216   
1            311            0          0          24.37           241   
2            409            1          1           0.00           138   
5            128            1          0           0.00           146   
6            299            0          0          19.56           179   
..           ...          ...        ...            ...           ...   
495          222            0          0           8.71            30   
496          189            0          0          16.31           191   
497          427            0          0          30.59            92   
498          383            0          0           1.93           127   
499           83            0          0           3.84           177   

     checkout_date  room_rate  billing_address   credit_card_number  
0              242     131.23               90  40750

In [6]:
data_types3 = original_data.dtypes


has_strings3 = any(data_types3 == 'object')

if has_strings3:
    print("The dataset contains string values.")
else:
    print("The dataset does not contain string values.")

The dataset does not contain string values.


In [7]:
original_data = original_data.sample(n=100)
original_data.shape

(100, 9)

In [8]:
X_real = original_data.drop(columns=['has_rewards', 'guest_email'])
y_real = original_data['has_rewards']
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X_real, y_real, test_size=0.2, random_state=42)

In [9]:
sensitive_columns = ['guest_email', 'billing_address', 'credit_card_number']

In [10]:
from rdt.transformers import AnonymizedFaker, PseudoAnonymizedFaker
guest_email_transformer = AnonymizedFaker(
    provider_name='misc',
    function_name='uuid4',
    enforce_uniqueness=True
)

billing_address_transformer = PseudoAnonymizedFaker(
    provider_name='address',
    function_name='address'
)

## Copula

In [11]:
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

real_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,michaelsanders@shaw.net,False,BASIC,37.89,27 Dec 2020,29 Dec 2020,131.23,"49380 Rivers Street\nSpencerville, AK 68265",4075084747483975747
1,randy49@brown.biz,False,BASIC,24.37,30 Dec 2020,02 Jan 2021,114.43,"88394 Boyle Meadows\nConleyberg, TN 22063",180072822063468
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
3,gsims@terry.com,False,BASIC,,28 Dec 2020,31 Dec 2020,115.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,misty33@smith.biz,False,BASIC,16.45,05 Apr 2020,,122.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


In [12]:
real_data.dropna(inplace=True)

# Drop columns with NaN values
real_data.dropna(axis=1, inplace=True)

In [13]:
from sdv.single_table import GaussianCopulaSynthesizer

# Initialize the GaussianCopulaSynthesizer
synthesizer = GaussianCopulaSynthesizer(metadata)

# Fit the synthesizer with the real data
synthesizer.fit(real_data)

# Now you can update the transformers for the desired columns
synthesizer.update_transformers({
    'guest_email': guest_email_transformer,
    'billing_address': billing_address_transformer
})

# Preprocess the real data using the fitted synthesizer
pre_processed_data_copula = synthesizer.preprocess(real_data)
pre_processed_data_copula.head()



Unnamed: 0_level_0,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address
guest_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
michaelsanders@shaw.net,0.766764,0.379737,37.89,1.609027e+18,1.6092e+18,131.23,0.98651
randy49@brown.biz,0.258814,0.111455,24.37,1.609286e+18,1.609546e+18,114.43,1.05901
webermelissa@neal.com,0.978023,0.831875,0.0,1.600301e+18,1.600387e+18,368.33,2.250086
garciacarol@reid-crawford.biz,0.918621,0.124664,0.0,1.602979e+18,1.603152e+18,177.76,3.005442
phillipsmatthew@powers-martinez.com,0.482957,0.25467,19.56,1.606003e+18,1.606176e+18,108.09,4.464085


In [14]:
synthesizer.fit_processed_data(pre_processed_data_copula)

In [15]:
copuladata = synthesizer.sample(num_rows=100)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to the categorical variables in the dataset
for column in copuladata.columns:
    if copuladata[column].dtype == 'object':
        copuladata[column] = label_encoder.fit_transform(copuladata[column])

copuladata['has_rewards'] = copuladata['has_rewards'].astype(int)
print(copuladata)

    guest_email  has_rewards  room_type  amenities_fee  checkin_date  \
0            68            0          1          10.20            48   
1            80            0          0          21.44            50   
2            66            0          0          12.71            63   
3             5            1          2          14.80            27   
4            58            0          0          27.33            13   
..          ...          ...        ...            ...           ...   
95           93            0          0          35.41            49   
96           35            0          1          13.34            54   
97           62            0          1          23.70             5   
98            0            0          0          16.44            62   
99           45            0          2          10.93            88   

    checkout_date  room_rate  billing_address   credit_card_number  
0              49     109.79               30     3537187319841509

In [17]:
X_cop = copuladata.drop(columns=['has_rewards', 'guest_email'])
y_cop = copuladata['has_rewards']
X_train_cop, X_test_cop, y_train_cop, y_test_cop = train_test_split(X_cop, y_cop, test_size=0.2, random_state=42)

# CTGAN

In [18]:
from sdv.datasets.demo import download_demo

real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

In [19]:
real_data.dropna(inplace=True)

# Drop columns with NaN values
real_data.dropna(axis=1, inplace=True)

In [20]:
from sdv.single_table import CTGANSynthesizer

ctgan_model = CTGANSynthesizer(metadata=metadata, 
                               verbose=True,
                              epochs=100)

ctgan_model.fit(real_data)

pre_processed_data_CTGAN = ctgan_model.preprocess(real_data)
pre_processed_data_CTGAN.head()

Gen. (0.41) | Discrim. (0.14): 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]


Unnamed: 0_level_0,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate
guest_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
michaelsanders@shaw.net,False,BASIC,37.89,1.609027e+18,1.6092e+18,131.23
randy49@brown.biz,False,BASIC,24.37,1.609286e+18,1.609546e+18,114.43
webermelissa@neal.com,True,DELUXE,0.0,1.600301e+18,1.600387e+18,368.33
garciacarol@reid-crawford.biz,True,BASIC,0.0,1.602979e+18,1.603152e+18,177.76
phillipsmatthew@powers-martinez.com,False,BASIC,19.56,1.606003e+18,1.606176e+18,108.09


In [21]:
ctgan_model.fit_processed_data(pre_processed_data_CTGAN)

Gen. (0.41) | Discrim. (-0.11): 100%|██████████| 100/100 [00:19<00:00,  5.09it/s]


In [22]:
ctdata = ctgan_model.sample(num_rows=100)

In [23]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for column in ctdata.columns:
    if ctdata[column].dtype == 'object':
        ctdata[column] = label_encoder.fit_transform(ctdata[column])

ctdata['has_rewards'] = ctdata['has_rewards'].astype(int)
print(copuladata)

    guest_email  has_rewards  room_type  amenities_fee  checkin_date  \
0            68            0          1          10.20            48   
1            80            0          0          21.44            50   
2            66            0          0          12.71            63   
3             5            1          2          14.80            27   
4            58            0          0          27.33            13   
..          ...          ...        ...            ...           ...   
95           93            0          0          35.41            49   
96           35            0          1          13.34            54   
97           62            0          1          23.70             5   
98            0            0          0          16.44            62   
99           45            0          2          10.93            88   

    checkout_date  room_rate  billing_address   credit_card_number  
0              49     109.79               30     3537187319841509

In [24]:
X_gan = ctdata.drop(columns=['has_rewards', 'guest_email'])
y_gan = ctdata['has_rewards']
X_train_gan, X_test_gan, y_train_gan, y_test_gan = train_test_split(X_gan, y_gan, test_size=0.2, random_state=42)

In [25]:
print("Original Data Shape:", original_data.shape)
print("Synthetic Data Shape:", ctdata.shape)

Original Data Shape: (100, 9)
Synthetic Data Shape: (100, 9)
