In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib qt

In [2]:
survey = pd.read_csv("C:/Users/TIYANI/Videos/Final/Survey.csv")

In [3]:
survey.head()

Unnamed: 0,Hole_Id,Depth,Azimuth,Dip,target
0,KRED207,0.0,128.9,-60.0,1
1,KRED207,50.0,128.9,-60.0,1
2,KRED207,104.0,127.9,-59.0,1
3,KRED209,0.0,121.6,-57.5,1
4,KRED209,50.0,121.6,-57.5,1


In [4]:
survey.shape

(278, 5)

In [5]:
survey.dtypes

Hole_Id     object
Depth      float64
Azimuth    float64
Dip        float64
target       int64
dtype: object

In [6]:
survey.isna().any()

Hole_Id    False
Depth      False
Azimuth    False
Dip        False
target     False
dtype: bool

In [8]:
survey.columns

Index(['Hole_Id', 'Depth', 'Azimuth', 'Dip', 'target'], dtype='object')

In [7]:
from sdv.constraints import FixedCombinations

In [9]:
Location = FixedCombinations(column_names=['Depth', 'Azimuth','Dip'])

In [10]:
from sdv.constraints import Negative
dip= Negative(column_name='Dip')

In [11]:
constraints = [Location, dip]

In [12]:
from sdv.tabular import CTGAN

In [13]:
batch_size = 1000
epochs = 100

model = CTGAN(primary_key='Hole_Id',constraints=constraints, generator_dim=(257, 257, 257),
              discriminator_dim=(257, 257, 257), batch_size=batch_size, epochs=epochs, verbose=True)
model.fit(survey)

Epoch 1, Loss G:  5.7164,Loss D: -0.0009
Epoch 2, Loss G:  5.7386,Loss D: -0.0008
Epoch 3, Loss G:  5.7175,Loss D: -0.0054
Epoch 4, Loss G:  5.6885,Loss D: -0.0015
Epoch 5, Loss G:  5.6748,Loss D: -0.0047
Epoch 6, Loss G:  5.6680,Loss D: -0.0044
Epoch 7, Loss G:  5.6644,Loss D: -0.0023
Epoch 8, Loss G:  5.6697,Loss D: -0.0006
Epoch 9, Loss G:  5.6439,Loss D:  0.0048
Epoch 10, Loss G:  5.6355,Loss D:  0.0054
Epoch 11, Loss G:  5.6265,Loss D:  0.0045
Epoch 12, Loss G:  5.6175,Loss D: -0.0044
Epoch 13, Loss G:  5.6187,Loss D: -0.0033
Epoch 14, Loss G:  5.6005,Loss D: -0.0055
Epoch 15, Loss G:  5.5847,Loss D: -0.0073
Epoch 16, Loss G:  5.6109,Loss D:  0.0062
Epoch 17, Loss G:  5.5736,Loss D:  0.0024
Epoch 18, Loss G:  5.5602,Loss D:  0.0041
Epoch 19, Loss G:  5.5346,Loss D:  0.0207
Epoch 20, Loss G:  5.5622,Loss D:  0.0230
Epoch 21, Loss G:  5.5367,Loss D:  0.0275
Epoch 22, Loss G:  5.5424,Loss D:  0.0324
Epoch 23, Loss G:  5.5390,Loss D:  0.0247
Epoch 24, Loss G:  5.5286,Loss D:  0.0295
E

In [14]:
n_generated_data = 278
new_survey = model.sample(n_generated_data)

Sampling rows: 100%|██████████| 278/278 [00:00<00:00, 3123.34it/s]


In [15]:
from sdv.evaluation import evaluate

In [16]:
from sdv.evaluation import evaluate

evaluate(new_survey, survey)

0.47796762589928055

In [17]:
from table_evaluator import load_data, TableEvaluator

In [18]:
from table_evaluator import load_data, TableEvaluator
table_evaluator = TableEvaluator(survey, new_survey)
table_evaluator.evaluate(target_col='target')


Classifier F1-scores and their Jaccard similarities::
                             f1_real  f1_fake  jaccard_similarity
index                                                            
DecisionTreeClassifier_fake   0.4286   0.6607              0.2308
DecisionTreeClassifier_real   1.0000   0.4643              0.3023
LogisticRegression_fake       0.4286   0.8571              0.1667
LogisticRegression_real       0.9821   0.6786              0.5342
MLPClassifier_fake            0.4107   0.8393              0.1429
MLPClassifier_real            0.9821   0.6250              0.4737
RandomForestClassifier_fake   0.4286   0.7857              0.1915
RandomForestClassifier_real   0.9821   0.6071              0.4545

Privacy results:
                                         result
Duplicate rows between sets (real/fake)  (0, 0)
nearest neighbor mean                    1.0220
nearest neighbor std                     0.1147

Miscellaneous results:
                                  Result
Column Cor

In [19]:
table_evaluator.visual_evaluation()

In [20]:
from sdv.tabular import CopulaGAN

In [21]:
model = CopulaGAN(primary_key='Hole_Id',
                 constraints=constraints,epochs=100,batch_size=2000,
                  generator_dim=(256, 256, 256),discriminator_dim=(256, 256, 256),verbose=True)
model.fit(survey)
new_data = model.sample(278)
new_data.head()

Epoch 1, Loss G:  5.7567,Loss D: -0.0016
Epoch 2, Loss G:  5.7461,Loss D:  0.0036
Epoch 3, Loss G:  5.7244,Loss D:  0.0014
Epoch 4, Loss G:  5.7226,Loss D: -0.0031
Epoch 5, Loss G:  5.7086,Loss D: -0.0029
Epoch 6, Loss G:  5.7061,Loss D: -0.0017
Epoch 7, Loss G:  5.7130,Loss D: -0.0037
Epoch 8, Loss G:  5.7004,Loss D:  0.0010
Epoch 9, Loss G:  5.6734,Loss D: -0.0034
Epoch 10, Loss G:  5.6725,Loss D:  0.0034
Epoch 11, Loss G:  5.6448,Loss D:  0.0013
Epoch 12, Loss G:  5.6354,Loss D:  0.0089
Epoch 13, Loss G:  5.6645,Loss D:  0.0071
Epoch 14, Loss G:  5.6513,Loss D:  0.0066
Epoch 15, Loss G:  5.6288,Loss D:  0.0058
Epoch 16, Loss G:  5.6374,Loss D:  0.0082
Epoch 17, Loss G:  5.6257,Loss D:  0.0033
Epoch 18, Loss G:  5.6396,Loss D:  0.0033
Epoch 19, Loss G:  5.6334,Loss D:  0.0016
Epoch 20, Loss G:  5.6055,Loss D:  0.0109
Epoch 21, Loss G:  5.6014,Loss D:  0.0197
Epoch 22, Loss G:  5.6086,Loss D: -0.0022
Epoch 23, Loss G:  5.5892,Loss D:  0.0221
Epoch 24, Loss G:  5.6095,Loss D:  0.0164
E

Sampling rows: 100%|██████████| 278/278 [00:00<00:00, 1836.91it/s]


Unnamed: 0,Hole_Id,Depth,Azimuth,Dip,target
0,a,0.0,93.2,-55.6,2
1,b,0.0,92.1,-56.9,2
2,c,0.0,125.3,-57.4,1
3,d,150.0,128.7,-57.8,2
4,e,50.0,122.9,-61.0,2


In [22]:
from table_evaluator import load_data, TableEvaluator
table_evaluator = TableEvaluator(survey, new_data)
table_evaluator.evaluate(target_col='target')


Classifier F1-scores and their Jaccard similarities::
                             f1_real  f1_fake  jaccard_similarity
index                                                            
DecisionTreeClassifier_fake   0.5536   0.5536              0.3659
DecisionTreeClassifier_real   1.0000   0.3036              0.1789
LogisticRegression_fake       0.5536   0.5536              0.6970
LogisticRegression_real       0.9821   0.2679              0.1429
MLPClassifier_fake            0.5536   0.4643              0.1089
MLPClassifier_real            0.9464   0.2500              0.1789
RandomForestClassifier_fake   0.5536   0.5000              0.2308
RandomForestClassifier_real   1.0000   0.3036              0.1789

Privacy results:
                                         result
Duplicate rows between sets (real/fake)  (0, 0)
nearest neighbor mean                    1.0481
nearest neighbor std                     0.1633

Miscellaneous results:
                                  Result
Column Cor

In [24]:
table_evaluator.visual_evaluation()

In [25]:
from sdv.evaluation import evaluate

evaluate(new_data, survey)

0.4392985611510791

In [26]:
from sdv.tabular import GaussianCopula

In [27]:
model = GaussianCopula(primary_key='Hole_Id',constraints=constraints,
                      field_distributions={'Dip': 'beta'})
model.fit(survey)
new_data2 = model.sample(278)
new_data2.head()

Sampling rows: 100%|██████████| 278/278 [00:00<00:00, 2527.45it/s]


Unnamed: 0,Hole_Id,Depth,Azimuth,Dip,target
0,a,0.0,130.1,-55.3,2
1,b,50.0,128.2,-54.4,1
2,c,92.0,127.8,-55.4,1
3,d,189.0,90.5,-58.5,1
4,e,150.0,90.4,-52.7,2


In [28]:
from table_evaluator import load_data, TableEvaluator
table_evaluator = TableEvaluator(survey, new_data2)
table_evaluator.evaluate(target_col='target')


Classifier F1-scores and their Jaccard similarities::
                             f1_real  f1_fake  jaccard_similarity
index                                                            
DecisionTreeClassifier_fake   0.4286   0.5536              0.1789
DecisionTreeClassifier_real   1.0000   0.5357              0.3659
LogisticRegression_fake       0.4107   0.7321              0.0769
LogisticRegression_real       0.9643   0.7143              0.5135
MLPClassifier_fake            0.4107   0.4643              0.3176
MLPClassifier_real            0.9464   0.3750              0.1915
RandomForestClassifier_fake   0.4464   0.6250              0.3023
RandomForestClassifier_real   0.9821   0.5714              0.4177

Privacy results:
                                         result
Duplicate rows between sets (real/fake)  (0, 0)
nearest neighbor mean                    1.0276
nearest neighbor std                     0.1839

Miscellaneous results:
                                  Result
Column Cor

In [29]:
table_evaluator.visual_evaluation()