In [42]:
#common distributions
#**normal - good for everything in general
#**poisson - good for number of occurrences 
#exponential  
#beta 
#gamma
#**uniform 
#**binomial - need this for indicators too 

## Working with Floats

In [43]:
def check_parameters(parameters):
    """Validate that every distribution parameter is a plain number.

    Parameters
    ----------
    parameters : iterable
        Values taken from a distribution spec (for example mean and std).

    Raises
    ------
    SyntaxError
        If any value is not an int or a float. Strings such as '5.5' are
        rejected, so numbers must not be quoted in the YAML spec.
        (SyntaxError is kept as the exception type because the other
        generators in this notebook raise it for bad specs as well.)
    """
    for p in parameters:
        # isinstance also accepts int/float subclasses (e.g. numpy.float64);
        # bool is a subclass of int, so it is excluded explicitly to keep
        # rejecting True/False as before.
        if isinstance(p, bool) or not isinstance(p, (int, float)):
            raise SyntaxError("parameters must be integers or floats")


# Smoke test: an int and a float are both accepted (no exception).
check_parameters([20, 2.5])

In [44]:
import re #Regex library
import yaml
import pandas as pd 
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper
import numpy as np # name, patientid, date of death, lat, lon
#Float Generator

#-column: blood sugar
#   type: float
#   distribution: {name: 'normal', mean: 5.5, std: 1.5}   (numbers must be unquoted; strings are rejected)
#   constraints: {max: 8, min: 2}


# name - string field name
# distribution - dictionary 
# constraints - dictionary
# size - num rows


def float_generator(distribution, constraints, size):
    """Draw `size` random values from a named distribution and clip them.

    Parameters
    ----------
    distribution : dict
        Must contain 'name' plus the parameters of that distribution:
        normal / lognormal -> 'mean', 'std'; uniform -> 'a', 'b';
        binomial -> 'n', 'prob'; poisson -> 'lam'; beta -> 'alpha', 'beta';
        gamma -> 'shape', 'scale'; exponential -> 'lam'.
    constraints : dict or None
        Optional clipping bounds under the keys 'min' and/or 'max'. A bound
        that is absent or None is not applied. None or an empty dict means
        "no clipping".
    size : int
        Number of values to generate.

    Returns
    -------
    list
        The generated (and possibly clipped) values as a plain Python list.

    Raises
    ------
    SyntaxError
        If `size` is not an int, the distribution name is unknown, a
        parameter is not numeric, uniform has a > b, the constraints dict
        contains neither 'min' nor 'max', or min > max.
    KeyError
        If 'name' or a required distribution parameter is missing.
    """
    if type(size) != int:
        raise SyntaxError("size must be an integer")

    # name -> (parameter keys in positional order, numpy sampler).
    # Every sampler is called as sampler(*params, size).
    samplers = {
        'normal': (('mean', 'std'), np.random.normal),
        'lognormal': (('mean', 'std'), np.random.lognormal),
        'uniform': (('a', 'b'), np.random.uniform),
        'binomial': (('n', 'prob'), np.random.binomial),
        'poisson': (('lam',), np.random.poisson),
        'beta': (('alpha', 'beta'), np.random.beta),
        'gamma': (('shape', 'scale'), np.random.gamma),
        # NOTE(review): numpy's exponential takes the *scale* (1 / rate) as
        # its first argument, so 'lam' is used as the scale here, exactly as
        # before. Confirm whether a rate was intended.
        'exponential': (('lam',), np.random.exponential),
    }

    name = distribution['name']
    if name not in samplers:
        raise SyntaxError(name +" not recognized")

    param_keys, sampler = samplers[name]
    params = [distribution[key] for key in param_keys]
    check_parameters(params)
    if name == 'uniform' and params[0] > params[1]:
        raise SyntaxError("a must be less than b")
    data = sampler(*params, size)

    # No constraints at all (None or {}): return the raw draws.
    if not constraints:
        return data.tolist()

    # A non-empty dict that names neither bound is a spec error (for example
    # the keys 'upper'/'lower' instead of 'max'/'min').
    if 'min' not in constraints and 'max' not in constraints:
        raise SyntaxError("min and max not recognized")

    lower = constraints.get('min')
    upper = constraints.get('max')
    if lower is None and upper is None:
        # Both bounds explicitly disabled; np.clip must not be called with
        # two None bounds on older numpy versions.
        return data.tolist()
    if lower is not None and upper is not None and lower > upper:
        raise SyntaxError("min constraint must be less than or equal to max constraint")

    return np.clip(data, a_min=lower, a_max=upper).tolist()

# Demo: 20 draws from a normal(mean=20, std=2), clipped below at 2; 'max' is None so no upper bound applies.
float_generator({'name':'normal', 'mean': 20,'std': 2}, {'max': None, 'min':2}, 20)




        

    
    

[21.143467915067635,
 19.19672846130804,
 19.758622798722833,
 20.12147825431966,
 22.200841549825604,
 19.1663508050057,
 15.347278158865784,
 16.801651275190846,
 18.521988781881742,
 18.060327604027258,
 20.75069134771129,
 20.391292558032905,
 20.000336076491482,
 23.189666096997183,
 18.11145591809387,
 20.635059548748547,
 21.27396849960768,
 21.4120638102704,
 17.448926048910618,
 20.53519168650539]

## Int Generator 

In [45]:
def int_generator(distribution, constraints, size):
    """Generate `size` integers from a named distribution.

    Parameters
    ----------
    distribution : dict
        Same schema as for float_generator, plus the extra name 'bernoulli',
        which needs a success probability under 'p' (or 'prob', the key the
        binomial distribution uses).
    constraints : dict or None
        Passed through to float_generator. Not used for 'bernoulli'.
    size : int
        Number of values to generate.

    Returns
    -------
    list of int
        The draws rounded to the nearest integer.

    Raises
    ------
    SyntaxError
        If `size` is not an int (or float_generator rejects the spec).
    KeyError
        If a required distribution key is missing.
    """
    if type(size) != int:
        raise SyntaxError("size must be an integer")

    if distribution['name'] == 'bernoulli':
        # A Bernoulli trial is a binomial with n = 1. Accept both spellings
        # of the probability key so specs stay consistent with 'binomial'.
        prob = distribution['p'] if 'p' in distribution else distribution['prob']
        data = np.random.binomial(1, prob, size)
    else:
        data = float_generator(distribution, constraints, size)

    # Round first, then cast, so 2.7 becomes 3 rather than being truncated to 2.
    return np.round(data).astype(int).tolist()

# Demo: 10 rounded draws from a normal(mean=20.5, std=2.5) with constraints min=2, max=30.
int_generator({'name':'normal', 'mean': 20.5,'std': 2.5}, {'max':30, 'min':2}, 10)
# Disabled alternative demo: Bernoulli draws (the constraints argument is not used on that path).
#int_generator({'name':'bernoulli', 'p': 0.5}, {'upper':'na', 'lower':'na'}, 10)

[25, 18, 20, 20, 22, 19, 22, 16, 23, 16]

In [46]:
def string_bestower(dist, choices, i, data_list, number_of_points):
    """Append `number_of_points` randomly selected strings to data_list[i].

    For every point an integer index is drawn from the distribution described
    by `dist`, and the string at that position in `choices` is appended.

    Parameters
    ----------
    dist : dict
        Distribution spec: 'name' plus the parameters for that distribution
        (see _draw_choice_index for the keys each one needs).
    choices : sequence of str
        Candidate strings.
    i : hashable
        Key of the column being filled; data_list[i] must already be a list.
    data_list : dict
        Column name -> list of values. Mutated in place.
    number_of_points : int
        How many strings to append.

    Raises
    ------
    AssertionError
        If a parameter required by the chosen distribution is missing.
    SyntaxError
        If the distribution name is not supported.
    IndexError
        If `choices` is empty.
    """
    last_index = len(choices) - 1
    for _ in range(number_of_points):
        index = _draw_choice_index(dist)
        # Clamp to the valid range: an unbounded draw would otherwise wrap
        # around silently (negative index) or raise IndexError (too large).
        index = min(max(index, 0), last_index)
        data_list[i].append(choices[index])


def _draw_choice_index(dist):
    """Draw one value from the distribution in `dist`, truncated to an int."""
    name = dist['name']

    if name == 'normal':
        assert 'mean' in dist and 'std' in dist, "must provide mean and std for normal dist"
        return int(np.random.normal(loc=dist['mean'], scale=dist['std']))

    if name == 'lognormal':
        assert 'mean' in dist and 'std' in dist, "must provide mean and std for lognormal dist"
        # numpy's lognormal takes mean/sigma; loc/scale are not valid keywords.
        return int(np.random.lognormal(mean=dist['mean'], sigma=dist['std']))

    if name in ['binomial', 'bernoulli']:
        assert 'n' in dist and 'prob' in dist, "must provide n and prob for binomial or bernoulli"
        return int(np.random.binomial(n=dist['n'], p=dist['prob']))

    if name == 'poisson':
        assert 'lam' in dist, "must provide lam for poission"
        return int(np.random.poisson(dist['lam']))

    if name == 'beta':
        assert 'alpha' in dist and 'beta' in dist, "must provide alpha and beta for beta dist"
        # NOTE(review): beta draws lie between 0 and 1, so truncating to int
        # essentially always selects index 0. Confirm the intended scaling.
        return int(np.random.beta(dist['alpha'], dist['beta']))

    if name == 'gamma':
        assert 'shape' in dist and 'scale' in dist, 'must provide shape and scale for gamma dist'
        return int(np.random.gamma(dist['shape'], dist['scale']))

    if name == 'exponential':
        assert 'lam' in dist, 'must provide lam for exponential'
        return int(np.random.exponential(dist['lam']))

    if name == 'uniform':
        assert 'min' in dist and 'max' in dist, 'must provide min and max for uniform dist (separate from constraints)'
        # randint excludes the upper bound: indices are min .. max - 1.
        return int(np.random.randint(dist['min'], dist['max']))

    raise SyntaxError("Bad String dist type")

In [47]:
# Pool of full names ("First Last" strings) that generate_name can draw from.
list_of_names = ["Kendrich Garner", "Xavion Douglas", "Dantrell Lucas", "Dashaud Dunn", "Stefon Vaughn", 
"Blake Mccoy", "Louvenia Murray", "Kwashay Houston", "Chineka Gray", "Kaeja Williams", "Tasia Floyd", "Chaybree Robertson",
"Jori Henry", "Reginal Nash", "Calvon Gray", "Quandell Morris", "Tyrice Gordon", "Quintrell Whitfield", 
"Rhianna Miles", "Ashkira Bradford", "Dustina Graves", "Jomary Solomon", "Kamen Hawkins", "Odessa Morton", 
"Monifa Opeyemi", "Chukwuma Abiodun", "Boipelo Afolayan", "Tapiwa Botha", "Makena Kariuki",
"Wambui Arendse", "Chidimma Ihejirika", "Akpan Okeke", "Oghenero Ayodele", "Oni Okeke",
"Chidiegwu Babatunde", "Kwasi Mwangi", "Chiemeka Kariuki", "Chiumbo Adebayo", "Tsholofelo Afolayan", 
"Wanjiku Kariuki", "Sisay Idowu", "Ayaan Maina", "Efua Adebayo", "Issoufou Opeyemi"]

def generate_name(names):
    """Pick one entry at random from `names` via numpy's random choice."""
    picked = np.random.choice(names)
    return picked


## Working with Datetime

In [48]:
import datetime
import numpy as np

def generateDate(size, minYear, maxYear):
    """Generate `size` random ISO-formatted dates ('YYYY-MM-DD').

    Parameters
    ----------
    size : int
        Number of dates to generate. Zero or a negative value gives [].
    minYear, maxYear : int
        Four-digit years bounding the result; both ends are inclusive.

    Returns
    -------
    list of str
        Dates with a year in [minYear, maxYear], a month in 1-12 and a day
        in 1-28. Days 29-31 are never produced, so every year/month/day
        combination is a valid calendar date.

    Raises
    ------
    SyntaxError
        If an argument is not an int, a year is not four digits long, or
        minYear is greater than maxYear.
    """
    if type(size) != int:
        raise SyntaxError("size must be an integer")
    if type(minYear) != int or type(maxYear) != int:
        raise SyntaxError("minYear and maxYear must be integers")
    if len(str(minYear)) != 4 or len(str(maxYear)) != 4:
        raise SyntaxError("year must be 4 digits")
    if minYear > maxYear:
        raise SyntaxError("minYear must be less than or equal to maxYear")
    if size <= 0:
        return []

    # np.random.randint excludes its upper bound, hence the "+ 1" / 13 / 29:
    # without them December, the 28th and maxYear could never be drawn.
    years = np.random.randint(minYear, maxYear + 1, size)
    months = np.random.randint(1, 13, size)
    days = np.random.randint(1, 29, size)

    return [
        str(datetime.date(int(year), int(month), int(day)))
        for year, month, day in zip(years, months, days)
    ]

# Demo: 50 random dates generated with minYear=1900 and maxYear=2000.
dateStuff = generateDate(50, 1900, 2000)
dateStuff


['1905-10-19',
 '1943-06-24',
 '1980-07-21',
 '1935-03-09',
 '1974-07-15',
 '1976-02-26',
 '1955-07-19',
 '1917-09-20',
 '1901-11-24',
 '1986-04-26',
 '1973-03-10',
 '1958-09-23',
 '1993-06-22',
 '1953-04-14',
 '1901-02-21',
 '1957-03-23',
 '1954-06-09',
 '1997-03-20',
 '1996-11-23',
 '1966-05-01',
 '1997-04-07',
 '1973-06-01',
 '1991-01-19',
 '1923-03-22',
 '1990-10-12',
 '1931-05-23',
 '1943-09-03',
 '1912-02-05',
 '1965-09-01',
 '1964-03-13',
 '1933-09-21',
 '1911-06-07',
 '1944-11-22',
 '1915-03-02',
 '1936-02-06',
 '1907-11-05',
 '1922-03-01',
 '1950-02-26',
 '1999-07-10',
 '1950-05-27',
 '1937-05-09',
 '1997-08-04',
 '1979-07-13',
 '1920-08-21',
 '1943-04-02',
 '1935-05-25',
 '1967-02-10',
 '1952-09-15',
 '1970-10-06',
 '1987-10-21']

In [49]:
import datetime
from datetime import timedelta
# ? "BirthDate"
# : 
#   constraints: ~
#   distribution: 
#     mean: "5.5"
#     name: normal
#     std: "1.5"
#   strings: ./default/default_province.txt
#   type: datetime

def generate_birthDate(size, minYear, maxYear, alive_after_year=1960, death_cutoff_year=2021):
    """Generate paired birth dates and death dates (or "N/A" when still alive).

    Parameters
    ----------
    size : int
        Number of (birth, death) pairs to generate.
    minYear, maxYear : int
        Inclusive range of birth years. Death years are drawn from
        [minYear + 50, maxYear + 50].
    alive_after_year : int, optional
        Anyone born after this year is recorded as alive ("N/A").
    death_cutoff_year : int, optional
        A drawn death year at or beyond this value also counts as alive.

    Returns
    -------
    tuple of (list of str, list of str)
        Birth dates and the matching death dates, as 'YYYY-MM-DD' strings;
        a death entry is "N/A" for people recorded as alive.

    Side effects
    ------------
    Prints the header line "Birthdates and Deathdates".
    """

    def random_date(first_year, last_year):
        # randint excludes its upper bound, hence "+ 1", 13 and 29. Days stop
        # at 28 so every year/month/day combination is a valid date.
        return datetime.date(
            np.random.randint(first_year, last_year + 1),
            np.random.randint(1, 13),
            np.random.randint(1, 29),
        )

    print("Birthdates and Deathdates")
    birthdates_lst = []
    deathdates_lst = []
    while len(birthdates_lst) < size:
        birthdate = random_date(minYear, maxYear)
        death_label = "N/A"

        if birthdate.year <= alive_after_year:
            deathdate = random_date(minYear + 50, maxYear + 50)
            # The death date is drawn independently, so redraw until it does
            # not precede the birth date. This terminates because the latest
            # possible death year (maxYear + 50) is after every birth year.
            while deathdate < birthdate:
                deathdate = random_date(minYear + 50, maxYear + 50)
            if deathdate.year < death_cutoff_year:
                death_label = str(deathdate)

        birthdates_lst.append(str(birthdate))
        deathdates_lst.append(death_label)

    return birthdates_lst, deathdates_lst
  
    

  

# Demo: 50 (birth date, death date) pairs generated with minYear=1900 and maxYear=2000.
birthdateSample = generate_birthDate(50, 1900, 2000)
birthdateSample


Birthdates and Deathdates


(['1900-08-20',
  '1977-01-26',
  '1905-01-18',
  '1992-11-26',
  '1922-10-15',
  '1933-04-19',
  '1977-04-13',
  '1912-04-22',
  '1923-06-09',
  '1903-09-05',
  '1962-09-11',
  '1997-11-19',
  '1908-08-07',
  '1905-04-05',
  '1985-06-04',
  '1961-09-20',
  '1976-03-21',
  '1987-09-03',
  '1965-04-06',
  '1945-03-07',
  '1930-01-13',
  '1958-02-07',
  '1930-06-19',
  '1988-05-08',
  '1992-01-15',
  '1927-11-09',
  '1937-06-02',
  '1917-01-20',
  '1903-11-21',
  '1952-11-22',
  '1995-06-03',
  '1997-06-11',
  '1954-02-25',
  '1929-08-27',
  '1939-05-13',
  '1956-02-03',
  '1957-02-02',
  '1904-01-06',
  '1989-11-15',
  '1943-06-26',
  '1946-02-08',
  '1999-01-06',
  '1960-06-27',
  '1989-04-03',
  '1934-10-14',
  '1937-09-07',
  '1980-03-12',
  '1908-11-18',
  '1996-09-14',
  '1961-06-24'],
 ['N/A',
  'N/A',
  '1999-02-13',
  'N/A',
  'N/A',
  '1976-08-10',
  'N/A',
  'N/A',
  '2020-11-25',
  '1969-10-20',
  'N/A',
  'N/A',
  'N/A',
  '2020-05-22',
  'N/A',
  'N/A',
  'N/A',
  'N/A',
  

In [102]:
import pandas as pd
def datagen(direc, filename, output_dir="../synthetic-data/"):
    """Generate a synthetic CSV from one YAML specification file.

    The YAML document must contain a 'rows' entry (number of rows) and one
    mapping per column with the keys 'type' ('int', 'float', 'string' or
    'date'), 'distribution', optionally 'constraints', and for string
    columns 'strings' (path, relative to `direc`, of a ", "-separated file
    of candidate values).

    Parameters
    ----------
    direc : str
        Directory prefix; it is concatenated directly with `filename`, so it
        must end with a path separator.
    filename : str
        Name of the YAML file inside `direc`.
    output_dir : str, optional
        Directory prefix for the CSV, which is named after `filename` with
        its extension replaced by '.csv'.

    Returns
    -------
    None
        The value returned by DataFrame.to_csv when given a path.

    Raises
    ------
    AssertionError
        If an argument or a required spec entry is missing or malformed.
    TypeError
        If the YAML file does not exist (kept for backward compatibility).
    SyntaxError
        If the YAML cannot be parsed or a column type is not supported.
    FileNotFoundError
        If the strings file of a string column cannot be read.
    """
    import os  # local import: only needed to strip the file extension

    assert type(direc) == str and type(filename) == str, "datagen accepts a file directory and filename in string format as its arguments."
    spec_path = direc + filename
    data_list = {}

    try:
        spec_file = open(spec_path)
    except FileNotFoundError as exc:
        raise TypeError("File " + spec_path + " does not exist at the specified directory.") from exc
    except OSError as exc:
        raise Exception("Another error occurred.") from exc

    # The context manager closes the handle even when parsing fails.
    with spec_file:
        try:
            # NOTE(security): FullLoader can construct more than plain data.
            # If spec files may come from untrusted sources, switch to
            # yaml.safe_load.
            loaded = yaml.load(spec_file, Loader=yaml.FullLoader)
        except Exception as exc:
            raise SyntaxError("Improperly formatted yaml file: " + spec_path) from exc

    assert isinstance(loaded, dict) and 'rows' in loaded, "number of rows not specified in " + spec_path + "."
    rows = loaded['rows']
    assert (
        type(rows) in [int, float] and rows >= 1 and float(rows).is_integer()
    ), "number of rows improperly specified in " + spec_path + "."
    # The generators insist on a true int, so a value such as 10.0 is converted.
    rows = int(rows)

    for i in loaded:
        # Skip the row count by name instead of assuming it is the first key.
        if i == 'rows':
            continue
        column = loaded[i]
        label = str(i)

        assert 'type' in column, "no type for column " + label
        the_type = column['type']
        assert 'distribution' in column, "no distribution specified for column " + label
        the_dist = column['distribution']
        # Constraints are optional; a missing key means "no constraints".
        the_consts = column.get('constraints')

        if the_type == 'int':
            data_list[i] = int_generator(the_dist, the_consts, rows)

        elif the_type == 'float':
            data_list[i] = float_generator(the_dist, the_consts, rows)

        elif the_type == 'string':
            assert 'strings' in column and type(column['strings']) == str, 'improperly specified string path for ' + label
            strings_path = direc + column['strings']
            try:
                with open(strings_path, 'r') as strings_file:
                    choices = strings_file.read().split(", ")
            except Exception as exc:
                raise FileNotFoundError("Bad file directory for " + label + ":" + strings_path) from exc
            data_list[i] = []
            string_bestower(the_dist, choices, i, data_list, rows)

        elif the_type == 'date':
            assert (
                isinstance(the_consts, dict) and 'min' in the_consts and 'max' in the_consts
            ), "date column " + label + " needs 'min' and 'max' year constraints"
            data_list[i] = generateDate(rows, the_consts['min'], the_consts['max'])

        else:
            raise SyntaxError("Type for " + label + " not 'int', 'float', 'string', or 'date': " + str(the_type))

    # Build the frame once, after all columns exist (also defined when the
    # spec has no columns at all).
    df = pd.DataFrame(data_list)
    csv_name = os.path.splitext(filename)[0] + ".csv"
    return df.to_csv(output_dir + csv_name)  # one CSV per yaml file

# Generate the CSV for a single spec; expects ../yaml-files/properdelivery.yaml to exist.
datagen("../yaml-files/", "properdelivery.yaml")

TypeError: File ../yaml-files/properly_delivery.yaml does not exist at the specified directory.

In [51]:
def to_integer(dt_time):
    """Return the `year` attribute of a date-like object (multiplied by 1)."""
    return 1*dt_time.year

# Scratch check: draw two independent random dates and compare their years.
# np.random.randint excludes its upper bound, so 13 and 29 are needed for
# December and the 28th to be possible; the year range is left as 1920-1999.
birthdate = datetime.date(np.random.randint(1920, 2000), np.random.randint(1, 13), np.random.randint(1, 29))
deathdate = datetime.date(np.random.randint(1920, 2000), np.random.randint(1, 13), np.random.randint(1, 29))

print(to_integer(deathdate), to_integer(birthdate))

1942 1940


In [108]:
from os import walk
def generate_all_yamls(yaml_directory):
    """Run datagen on every '.yaml' file found under `yaml_directory`.

    Parameters
    ----------
    yaml_directory : str
        Directory to search; sub-directories are searched as well.

    Returns
    -------
    list
        One entry per processed file: whatever datagen returned for it.

    Notes
    -----
    Errors raised by datagen are not caught, so the first bad spec aborts
    the whole run.
    """
    import os  # local import: this cell only imports `walk` from os

    returned = []
    for root, _dirs, files in walk(yaml_directory):
        # datagen concatenates directory + filename, so hand it the directory
        # the file actually lives in, guaranteed to end with a separator.
        direc = os.path.join(root, "")
        for filename in files:
            # endswith (not "in") so names such as 'spec.yaml.bak' are skipped.
            if filename.endswith(".yaml"):
                returned.append(datagen(direc, filename))
    return returned

# Run datagen for the YAML specs that os.walk finds under ../yaml-files/.
generate_all_yamls("../yaml-files/")
#datagen("../yaml-files/", "properdelivery.yaml")

SyntaxError: Improperly formatted yaml file: ../yaml-files/deliveryTest.yaml (<string>)

FileNotFoundError: [Errno 2] No such file or directory: '../yaml-files/default/default_province.txt'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=59003b3a-4258-4703-b30d-75642543bba1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>