# Extract dataset and Generate Synthetic data

In this notebook we download 466 datasets from an Amazon S3 bucket. For each dataset we fit and save a Gaussian copula model, and we generate and save a synthetic dataset with the same number of rows as the real dataset.  
In addition, two JSON metadata files containing information about the dataset are written for each dataset: 'metadata', which is required to use SDMetrics, and 'dict_info', which is required to use PyCaret.  
At the end of the notebook a folder with the following structure is created : 
* name_folder
    * name dataset_1
        * Data
            * real_dataset
            * synthetic data
        * Model
            * synthetic data generator
        * metadata
        * dict info 
    * name dataset_2 ...
    * name dataset_466




In [14]:
from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
import requests
import re
from tqdm.notebook import tqdm
import pandas as pd
import os 
from os import path as pa
import numpy as np
import copy
import warnings
from tqdm.notebook import tqdm
import json
warnings.filterwarnings('ignore')

## Extract and save real dataset

In [9]:

url = 'http://atm-data.s3.amazonaws.com/'
n_folder = '../T_Data_3/'                                   # name_folder

# The bucket root answers with a listing of its keys. Fail loudly on an HTTP
# error instead of parsing an error page into an empty list of files.
response = requests.get(url, timeout=60)
response.raise_for_status()
t = response.text

# The dot is escaped so that only names really ending in ".csv" are matched.
filenames = re.findall(r'[^>]+\.csv', t)

for file in filenames:
    dataset_name = file[:-4]                                # file name without ".csv"
    p = os.path.join(n_folder, dataset_name, 'Data')
    # exist_ok keeps the cell re-runnable when the dataset folder already exists.
    os.makedirs(p, exist_ok=True)
    data = pd.read_csv(url + file)
    # Saved with the default index column; later cells read it back with index_col=[0].
    data.to_csv(os.path.join(p, file))


## Fit synthetic generator, generate synthetic data and save model and synthetic dataset

In [10]:
l = []                                                      # names of the datasets that were processed
for path in os.listdir(n_folder):
    real_path = os.path.join(n_folder, path, 'Data', path + '.csv')
    # Skip anything in the folder that is not a dataset directory
    # (same guard as in metadata_generation).
    if not os.path.exists(real_path):
        continue
    print(path)
    real_data = pd.read_csv(real_path, index_col=[0])
    model = GaussianCopula()
    l.append(path)
    model.fit(real_data)
    # The synthetic dataset has as many rows as the real one.
    synthetic_data = model.sample(len(real_data))
    synthetic_data.to_csv(os.path.join(n_folder, path, 'Data', path + '_GC.csv'))
    p = os.path.join(n_folder, path, 'Model')
    # exist_ok keeps the cell re-runnable when the Model folder already exists.
    os.makedirs(p, exist_ok=True)
    model.save(os.path.join(p, 'Model_GC.pkl'))

Click_prediction_small_1
witmer_census_1980_1
analcatdata_cyyoung8092_1


## Generate and save metadata

Generate and save the metadata JSON files containing information about each dataset.
In particular, determine which columns are categorical and which are continuous.  

In [11]:

_TARGET_NAMES = ('class', 'Class')                             # accepted names of the target column
_ENTITY_MARKERS = ('ID', 'id', 'Id', 'user', 'USER', 'User')   # substrings marking identifier columns


def _find_target(columns):
    '''Return the target column found in `columns`, or None when there is none.'''
    found = None
    for name in _TARGET_NAMES:
        if name in columns:
            found = name                                       # when both names exist, the last one wins
    return found


def _dataset_goals(real_data, target):
    '''Return the list of goals ('Balance', 'Clean') that apply to the dataset.'''
    goals = []
    # 'Balance': some class count deviates by more than 50% from a uniform split.
    _, counts = np.unique(real_data[target], return_counts=True)
    mean = len(real_data) / len(counts)
    if np.any(np.abs(counts - mean) / mean > 0.5):
        goals.append('Balance')
    # 'Clean': more than 1% of all cells are missing.
    if real_data.isna().sum().sum() > 0.01 * real_data.size:
        goals.append('Clean')
    return goals


def _classify_column(values, name):
    '''
    Describe one feature column.

    Returns a tuple (field, kind): `field` is the metadata entry of the column and
    `kind` is one of 'categorical', 'numeric', 'ignore' or 'entity'.
    '''
    is_numeric = (pd.api.types.is_numeric_dtype(values)
                  and not pd.api.types.is_bool_dtype(values))
    if not is_numeric:
        # Text / boolean columns cannot be rounded; they are reported as categorical.
        return {'type': 'categorical'}, 'categorical'

    if not np.all(values == np.round(values)):
        # Also reached when the column holds NaN, since NaN != NaN.
        return {'type': 'numerical', 'subtype': 'float'}, 'numeric'

    field = {'type': 'numerical', 'subtype': 'integer'}
    # No narrowing cast here: large identifiers must not wrap around.
    distinct = np.unique(values)
    if len(distinct) == 1:
        return field, 'ignore'                                 # useless column (always the same value)
    if len(distinct) < int(0.1 * len(values)) and np.all(np.diff(distinct) == 1.0):
        # Few distinct, consecutive integer codes: treated as categorical.
        field['type'] = 'categorical'
        return field, 'categorical'
    if any(marker in name for marker in _ENTITY_MARKERS):
        return field, 'entity'
    return field, 'numeric'


def metadata_generation(folder, save=False):
    '''
    Create metadata information about all datasets in a folder.

    For every dataset two dictionaries are built: 'dict_info' (feature lists, target
    and goals) and 'metadata' (per-column type description, path, target and entity
    columns). Entries of `folder` without a dataset file are skipped, as are datasets
    that are empty or have no 'class'/'Class' column.

    INPUTS :

    - folder : string
               Path to the datasets. The final path to the datasets must be the following : "folder+name_dataset+'/Data/'+name_dataset+'.csv'"
    - save : Bool, default : False
                True if the metadata must be saved, False otherwise. The files are written
                to "folder+name_dataset+'/dict_info.json'" and "folder+name_dataset+'/metadata.json'".

    RETURNS :

    - 0, always.

    '''
    for path in tqdm(os.listdir(folder)):
        data_path = os.path.join(folder, path, 'Data', path + '.csv')
        if not os.path.exists(data_path):
            continue
        print(path)
        real_data = pd.read_csv(data_path, index_col=[0])
        print(real_data.columns)

        # The target is looked up again for every dataset so that a dataset
        # without target never inherits the target of the previous one.
        class1 = _find_target(real_data.columns)
        if class1 is None:
            print(path + " : skipped, no 'class' or 'Class' column")
            continue
        if real_data.empty:
            print(path + ' : skipped, dataset is empty')
            continue

        list_goal = _dataset_goals(real_data, class1)

        list_cat = []
        list_num = []
        list_ignore = []
        list_entity = []
        fields = {class1: {'type': 'categorical'}}             # the target is always categorical
        for col in real_data.columns:
            # Left-over index columns ("Unnamed: 0") and the target are not features.
            if 'Unnam' in str(col) or col == class1:
                continue
            field, kind = _classify_column(real_data[col], str(col))
            fields[col] = field
            if kind == 'categorical':
                list_cat.append(col)
            elif kind == 'numeric':
                list_num.append(col)
            elif kind == 'ignore':
                list_ignore.append(col)
            else:
                # Identifier columns are recorded once and ignored for modelling.
                list_entity.append(col)
                list_ignore.append(col)

        dict_info = {
            'ignore_features': list_ignore,
            'ordinal_features': [],
            'categorical_features': list_cat,
            'numeric_features': list_num,
            'target': class1,
            'goal': list_goal,
        }
        metadata = {
            path: {
                'fields': fields,
                'path': data_path,                             # the file that was actually read
                'target': class1,
                'entity_columns': list_entity,
            }
        }

        if save:
            with open(os.path.join(folder, path, 'dict_info.json'), 'w') as fp:
                json.dump(dict_info, fp, indent='')

            with open(os.path.join(folder, path, 'metadata.json'), 'w') as fp:
                json.dump(metadata, fp, indent='')
    return 0

In [15]:
# Build dict_info.json and metadata.json for every dataset folder and write them to disk.
metadata_generation(folder='../T_Data_3/', save=True)

  0%|          | 0/466 [00:00<?, ?it/s]

Click_prediction_small_1
Index(['class', 'impression', 'ad_id', 'advertiser_id', 'depth', 'position',
       'keyword_id', 'title_id', 'description_id', 'user_id'],
      dtype='object')
Click_prediction_small_1
witmer_census_1980_1
Index(['OVER65Perc', 'MEDAGE', 'PERCAP$', 'COLLEGEPerc', 'class'], dtype='object')
analcatdata_cyyoung8092_1
Index(['Year', 'Pitcher', 'League', 'Type', 'Wins', 'Win_pct', 'Saves', 'ERA',
       'Strikeouts', 'Innings_pitched', 'class'],
      dtype='object')
dresses-sales_2
Index(['V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12',
       'V13', 'class'],
      dtype='object')
planning-relax_1
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'class'],
      dtype='object')
pc2_1
Index(['BRANCH_COUNT', 'CALL_PAIRS', 'LOC_CODE_AND_COMMENT', 'LOC_COMMENTS',
       'CONDITION_COUNT', 'CYCLOMATIC_COMPLEXITY', 'CYCLOMATIC_DENSITY',
       'DECISION_COUNT', 'DECISION_DENSITY', 'DESIGN_COMPLEXITY',
       'D

0

In [33]:
# NOTE(review): ad-hoc inspection of one column of whatever `real_data` the kernel
# currently holds. The only notebook-level assignment of `real_data` visible here is
# the fit/sample loop, which leaves the last dataset it read; presumably this cell was
# run against a different kernel state (it shows a 'category' dtype) - TODO confirm or
# remove, since it is not guaranteed to work after Restart & Run All.
real_data['Var229']

0        0
1        0
2        1
3        0
4        2
        ..
49995    0
49996    1
49997    0
49998    0
49999    0
Name: Var229, Length: 50000, dtype: category
Categories (5, int64): [0, 1, 2, 3, 4]