In [1]:
from sdv import load_demo

metadata, tables = load_demo(metadata=True)

In [2]:
metadata

Metadata
  root_path: .
  tables: ['users', 'sessions', 'transactions']
  relationships:
    sessions.user_id -> users.user_id
    transactions.session_id -> sessions.session_id

In [4]:
tables

{'users':    user_id country gender  age
 0        0      US      M   34
 1        1      UK      F   23
 2        2      ES   None   44
 3        3      UK      M   22
 4        4      US      F   54
 5        5      DE      M   57
 6        6      BG      F   45
 7        7      ES   None   41
 8        8      FR      F   23
 9        9      UK   None   30,
 'sessions':    session_id  user_id  device       os  minutes
 0           0        0  mobile  android       23
 1           1        1  tablet      ios       12
 2           2        1  tablet  android        8
 3           3        2  mobile  android       13
 4           4        4  mobile      ios        9
 5           5        5  mobile  android       32
 6           6        6  mobile      ios        7
 7           7        6  tablet      ios       21
 8           8        6  mobile      ios       29
 9           9        8  tablet      ios       34,
 'transactions':    transaction_id  session_id           timestamp  amount 

In [5]:
from sdv import SDV
sdv = SDV()

In [6]:
sdv.fit(metadata, tables)

In [7]:
sampled = sdv.sample_all()

In [8]:
sampled

{'users':    user_id country gender  age
 0        0      UK      M   37
 1        1      US    NaN   32
 2        2      US    NaN   34
 3        3      ES      M   49
 4        4      FR      M   45
 5        5      ES    NaN   52
 6        6      FR      F   41
 7        7      UK      M   37
 8        8      UK      M   36
 9        9      US    NaN   40,
 'sessions':    session_id  user_id  device       os  minutes
 0           0        0  tablet      ios       16
 1           1        1  mobile      ios       32
 2           2        4  tablet      ios       18
 3           3        6  mobile      ios       20
 4           4        7  tablet      ios       10
 5           5        8  mobile  android        2,
 'transactions':    transaction_id  session_id           timestamp      amount  cancelled
 0               0           1 2019-01-08 10:55:05   52.042499      False
 1               1           2 2019-01-25 05:36:13  134.707226       True
 2               2           3 2019-0

In [9]:
import pandas as pd
import sqlite3

def sql_identifier(s):
    """Return ``s`` wrapped in double quotes for use as an SQL identifier.

    Every double quote already present in ``s`` is doubled, so it cannot
    terminate the quoted identifier early.
    """
    escaped = s.replace('"', '""')
    return '"{}"'.format(escaped)

def prepareDB(path):
    """Load every user table of a SQLite database into pandas DataFrames.

    While loading, the schema of each table is printed for inspection: the
    lower-cased table name, then the list of primary-key columns, then the
    list of foreign keys as ``(parent_table, child_column, parent_column)``.

    Args:
        path: Location of the SQLite database file, handed to
            ``sqlite3.connect``.

    Returns:
        dict mapping each lower-cased table name to a DataFrame with all of
        that table's rows.
    """
    db = sqlite3.connect(path)
    try:
        rows = db.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        # Names starting with "sqlite_" are SQLite's own bookkeeping tables
        # (e.g. sqlite_sequence). Filtering instead of list.remove() also works
        # for databases that do not contain such a table at all.
        names = [row[0] for row in rows if not row[0].lower().startswith('sqlite_')]

        tables_df = {}

        for name in names:
            # The dict key stays lower-cased as before; the queries use the
            # name exactly as stored, quoted so that keywords or special
            # characters in a table name cannot break the statement.
            table = name.lower()
            quoted = sql_identifier(name)

            tables_df[table] = pd.read_sql_query('SELECT * FROM ' + quoted, db)

            attributes = db.execute("PRAGMA table_info({})".format(quoted)).fetchall()
            # table_info rows are (cid, name, type, notnull, dflt_value, pk);
            # pk is 0 for non-key columns and otherwise the 1-based position of
            # the column inside the (possibly composite) primary key.
            key_columns = sorted(
                (attr for attr in attributes if attr[5] > 0),
                key=lambda attr: attr[5],
            )
            pk = [attr[1] for attr in key_columns]
            print(table)
            print(pk)

            foreign_key_list = db.execute(
                "PRAGMA foreign_key_list({})".format(quoted)
            ).fetchall()
            # foreign_key_list rows are (id, seq, table, from, to, ...).
            fkeys = [(fk[2].lower(), fk[3], fk[4]) for fk in foreign_key_list]
            print(fkeys)
    finally:
        # Release the file handle even when a query fails.
        db.close()

    return tables_df

In [10]:
tables = prepareDB('soccer.sqlite')

player_attributes
['id']
[('player', 'player_api_id', 'player_api_id'), ('player', 'player_fifa_api_id', 'player_fifa_api_id')]
player
['id']
[]
match
['id']
[('player', 'away_player_11', 'player_api_id'), ('player', 'away_player_10', 'player_api_id'), ('player', 'away_player_9', 'player_api_id'), ('player', 'away_player_8', 'player_api_id'), ('player', 'away_player_7', 'player_api_id'), ('player', 'away_player_6', 'player_api_id'), ('player', 'away_player_5', 'player_api_id'), ('player', 'away_player_4', 'player_api_id'), ('player', 'away_player_3', 'player_api_id'), ('player', 'away_player_2', 'player_api_id'), ('player', 'away_player_1', 'player_api_id'), ('player', 'home_player_11', 'player_api_id'), ('player', 'home_player_10', 'player_api_id'), ('player', 'home_player_9', 'player_api_id'), ('player', 'home_player_8', 'player_api_id'), ('player', 'home_player_7', 'player_api_id'), ('player', 'home_player_6', 'player_api_id'), ('player', 'home_player_5', 'player_api_id'), ('player'

In [11]:
import numpy as np

# Build a small training subset: 100 players plus every attribute row that
# refers to one of them through player_api_id.
df_train = {}
# Fixed seed so that re-running the notebook draws the same 100 players and the
# downstream models and scores are comparable between runs.
df_train['player'] = tables['player'].sample(n=100, random_state=42)
boolean_series = tables['player_attributes'].player_api_id.isin(df_train['player'].player_api_id)
df_train['player_attributes'] = tables['player_attributes'][boolean_series]

In [12]:
# Turn +/-inf into NaN first, so that one dropna() discards every row holding
# either kind of unusable value.
for name in list(df_train):
    df_train[name] = df_train[name].replace([np.inf, -np.inf], np.nan).dropna()

In [13]:
df_train['player'][df_train['player'].isna().any(axis=1)]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight


In [14]:
df_train['player_attributes'][df_train['player_attributes'].isna().any(axis=1)]

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes


In [15]:
from sdv import Metadata

metadata = Metadata()
metadata

Metadata
  root_path: .
  tables: []
  relationships:

In [19]:
metadata.add_table(
    name='player',
    data=df_train['player'],
    primary_key='id'
)

In [16]:
metadata.add_table(
    name='player_attributes',
    data=df_train['player_attributes'],
    primary_key='id',
)

In [20]:
metadata

Metadata
  root_path: .
  tables: ['player_attributes', 'player']
  relationships:

In [23]:
# Declare player_attributes.player_fifa_api_id as a foreign key into 'player'.
# NOTE(review): 'player' is registered with primary_key='id', and the metadata
# repr printed after this call lists the link as
# "player_attributes.player_fifa_api_id -> player.id", not
# "-> player.player_fifa_api_id". Confirm that the installed SDV version
# honours `parent_key`; otherwise the relationship points at the wrong column.
metadata.add_relationship(
    parent='player',
    child='player_attributes',
    foreign_key='player_fifa_api_id',
    parent_key='player_fifa_api_id'
)

In [24]:
# Declare player_attributes.player_api_id as a foreign key into 'player'.
# NOTE(review): the metadata repr printed afterwards shows this link as
# "player_attributes.player_api_id -> player.id", and the HMA1 model fitted on
# this metadata later fails in sample() with
# "ValueError: array must not contain infs or NaNs". Whether the relationship
# definition is the cause is not established here — verify against the SDV docs.
metadata.add_relationship(
    parent='player',
    child='player_attributes',
    foreign_key='player_api_id',
    parent_key='player_api_id'
)

In [25]:
metadata

Metadata
  root_path: .
  tables: ['player_attributes', 'player']
  relationships:
    player_attributes.player_fifa_api_id -> player.id
    player_attributes.player_api_id -> player.id

In [26]:
metadata.to_dict()

{'tables': {'player_attributes': {'fields': {'id': {'type': 'id',
     'subtype': 'integer'},
    'player_fifa_api_id': {'type': 'id',
     'subtype': 'integer',
     'ref': {'table': 'player', 'field': 'player_fifa_api_id'}},
    'player_api_id': {'type': 'id',
     'subtype': 'integer',
     'ref': {'table': 'player', 'field': 'player_api_id'}},
    'date': {'type': 'categorical'},
    'overall_rating': {'type': 'numerical', 'subtype': 'float'},
    'potential': {'type': 'numerical', 'subtype': 'float'},
    'preferred_foot': {'type': 'categorical'},
    'attacking_work_rate': {'type': 'categorical'},
    'defensive_work_rate': {'type': 'categorical'},
    'crossing': {'type': 'numerical', 'subtype': 'float'},
    'finishing': {'type': 'numerical', 'subtype': 'float'},
    'heading_accuracy': {'type': 'numerical', 'subtype': 'float'},
    'short_passing': {'type': 'numerical', 'subtype': 'float'},
    'volleys': {'type': 'numerical', 'subtype': 'float'},
    'dribbling': {'type': 'nu

In [27]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [30]:
metadata

Metadata
  root_path: .
  tables: ['player_attributes', 'player']
  relationships:
    player_attributes.player_fifa_api_id -> player.id
    player_attributes.player_api_id -> player.id

In [28]:
from sdv.relational import HMA1

model = HMA1(metadata)
model.fit(df_train)

In [29]:
new_data = model.sample()

ValueError: array must not contain infs or NaNs

In [23]:
from sdv.tabular import CTGAN

In [None]:
model = CTGAN()
model.fit(df_train['player_attributes'])

In [None]:
new_data = model.sample(200)

In [None]:
new_data

In [16]:
import pandas as pd

In [18]:
custom_df = pd.read_csv('Customer Segmentation/Train.csv')

In [22]:
custom_df = custom_df.set_index('ID')

In [24]:
model = CTGAN()
model.fit(custom_df)

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


In [31]:
custom_df[:200]

Unnamed: 0_level_0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...
466544,Male,Yes,41,Yes,Executive,1.0,Average,3.0,Cat_6,A
463610,Female,Yes,31,Yes,Artist,0.0,Average,2.0,Cat_6,A
465539,Female,Yes,49,Yes,Artist,1.0,High,5.0,Cat_6,B
459823,Male,Yes,67,Yes,Executive,1.0,Low,2.0,Cat_6,A


In [30]:
new_data = model.sample(200)
new_data

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Male,Yes,44,No,Entertainment,0.080007,Average,5.041996,Cat_4,A
1,Female,No,17,Yes,Healthcare,-0.004656,Low,1.023081,Cat_6,D
2,Male,Yes,77,No,Executive,1.002408,High,5.046882,Cat_4,A
3,Male,No,18,No,Healthcare,1.015226,Low,2.979587,Cat_2,D
4,Female,Yes,51,Yes,Artist,1.030877,Low,4.012221,Cat_6,B
...,...,...,...,...,...,...,...,...,...,...
195,Female,No,27,No,Engineer,-0.010397,Low,3.990666,Cat_3,D
196,Female,Yes,88,Yes,Lawyer,1.023613,High,2.007848,Cat_6,B
197,Male,No,42,Yes,Artist,0.994560,Low,4.016569,Cat_6,C
198,Female,No,50,Yes,Entertainment,8.596709,Low,1.019549,Cat_6,A


In [32]:
model.save('CTGAN_customer.pkl')

In [33]:
df_train['player']

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
8204,8216,159352,Ousmane Coulibaly,190658,1989-07-09 00:00:00,182.88,165
10705,10721,166019,Wahbi Khazri,194845,1991-02-08 00:00:00,182.88,168
9383,9397,477604,Salva Ruiz,210747,1995-05-17 00:00:00,177.80,161
9622,9637,602211,Seko Fofana,216320,1995-05-07 00:00:00,182.88,170
7842,7854,265913,Nene Bonilha,203211,1992-02-17 00:00:00,175.26,154
...,...,...,...,...,...,...,...
8522,8536,163605,Pele,194020,1991-09-29 00:00:00,182.88,170
6564,6574,164316,Marafona,200798,1987-05-08 00:00:00,190.50,190
10221,10237,278838,Thomas Lam,208018,1993-12-18 00:00:00,187.96,168
10921,10937,34443,Yoann Folly,138682,1985-06-06 00:00:00,175.26,163


In [34]:
from sdv.tabular import GaussianCopula
from sdv.tabular import CTGAN
from sdv.tabular import CopulaGAN
from sdv.tabular import TVAE
from sdv.evaluation import evaluate

In [60]:
def testModel(_model, data):
    model = _model()
    model.fit(data)
    new_data = model.sample()
    _score = evaluate(new_data, data, aggregate=False)
    score = evaluate(new_data, data)
    return (_score, score)

In [69]:
score = testModel(GaussianCopula, df_train['player_attributes'])

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  improvement from the last five Jacobian evaluations.
  np.max(np.abs(fsim[0] - fsim[1:])) <= fatol):
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternat

In [70]:
score

(                    metric                                     name  \
 0          BNLogLikelihood           BayesianNetwork Log Likelihood   
 1        LogisticDetection             LogisticRegression Detection   
 2             SVCDetection                            SVC Detection   
 11         GMLogLikelihood           GaussianMixture Log Likelihood   
 12                  CSTest                              Chi-Squared   
 13                  KSTest  Inverted Kolmogorov-Smirnov D statistic   
 14          KSTestExtended  Inverted Kolmogorov-Smirnov D statistic   
 15  ContinuousKLDivergence   Continuous Kullback–Leibler Divergence   
 16    DiscreteKLDivergence     Discrete Kullback–Leibler Divergence   
 
          score  min_value  max_value      goal  
 0    -6.160907       -inf        0.0  MAXIMIZE  
 1     0.216502        0.0        1.0  MAXIMIZE  
 2     0.106029        0.0        1.0  MAXIMIZE  
 11 -229.480213       -inf        inf  MAXIMIZE  
 12    0.976095        0.0  

In [71]:
score = testModel(CTGAN, df_train['player_attributes'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isnull(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

In [72]:
score

(                    metric                                     name  \
 0          BNLogLikelihood           BayesianNetwork Log Likelihood   
 1        LogisticDetection             LogisticRegression Detection   
 2             SVCDetection                            SVC Detection   
 11         GMLogLikelihood           GaussianMixture Log Likelihood   
 12                  CSTest                              Chi-Squared   
 13                  KSTest  Inverted Kolmogorov-Smirnov D statistic   
 14          KSTestExtended  Inverted Kolmogorov-Smirnov D statistic   
 15  ContinuousKLDivergence   Continuous Kullback–Leibler Divergence   
 16    DiscreteKLDivergence     Discrete Kullback–Leibler Divergence   
 
          score  min_value  max_value      goal  
 0   -10.872778       -inf        0.0  MAXIMIZE  
 1     0.025651        0.0        1.0  MAXIMIZE  
 2     0.000455        0.0        1.0  MAXIMIZE  
 11 -250.447761       -inf        inf  MAXIMIZE  
 12    0.930953        0.0  

In [73]:
score = testModel(CopulaGAN, df_train['player_attributes'])

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATI

In [74]:
score

(                    metric                                     name  \
 0          BNLogLikelihood           BayesianNetwork Log Likelihood   
 1        LogisticDetection             LogisticRegression Detection   
 2             SVCDetection                            SVC Detection   
 11         GMLogLikelihood           GaussianMixture Log Likelihood   
 12                  CSTest                              Chi-Squared   
 13                  KSTest  Inverted Kolmogorov-Smirnov D statistic   
 14          KSTestExtended  Inverted Kolmogorov-Smirnov D statistic   
 15  ContinuousKLDivergence   Continuous Kullback–Leibler Divergence   
 16    DiscreteKLDivergence     Discrete Kullback–Leibler Divergence   
 
          score  min_value  max_value      goal  
 0   -10.972038       -inf        0.0  MAXIMIZE  
 1     0.083957        0.0        1.0  MAXIMIZE  
 2     0.002046        0.0        1.0  MAXIMIZE  
 11 -239.285902       -inf        inf  MAXIMIZE  
 12    0.963067        0.0  

In [75]:
score = testModel(TVAE, df_train['player_attributes'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isnull(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_data[pd.isnull(synthetic_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [76]:
score

(                    metric                                     name  \
 0          BNLogLikelihood           BayesianNetwork Log Likelihood   
 1        LogisticDetection             LogisticRegression Detection   
 2             SVCDetection                            SVC Detection   
 11         GMLogLikelihood           GaussianMixture Log Likelihood   
 12                  CSTest                              Chi-Squared   
 13                  KSTest  Inverted Kolmogorov-Smirnov D statistic   
 14          KSTestExtended  Inverted Kolmogorov-Smirnov D statistic   
 15  ContinuousKLDivergence   Continuous Kullback–Leibler Divergence   
 16    DiscreteKLDivergence     Discrete Kullback–Leibler Divergence   
 
          score  min_value  max_value      goal  
 0    -4.631169       -inf        0.0  MAXIMIZE  
 1     0.000000        0.0        1.0  MAXIMIZE  
 2     0.000000        0.0        1.0  MAXIMIZE  
 11 -246.879999       -inf        inf  MAXIMIZE  
 12    0.988816        0.0  

In [32]:
text = "E:/GitHub Repos/TensorFlow training/SoccerDataset/database.sqlite"
asd = text.split('.')

In [33]:
asd[-1]

'sqlite'

In [34]:
len(asd)

2

In [35]:
if len(asd) < 2:
    print("asd")