In [2]:
import pandas as pd
import numpy as np
import random 
import string
from scipy.stats import truncnorm

In [3]:
def get_random_rows(columns):
    """Find the column whose value domain is the largest.

    Used when a dataset is configured with ``random = False``: the size of
    the widest domain becomes the number of rows to generate.

    Args:
        columns: list of column specs (dicts with a ``"type"`` key and,
            depending on the type, a ``"values"`` entry).

    Returns:
        dict with ``"column"`` (position of the widest column in
        ``columns``) and ``"value"`` (size of its domain).

    Raises:
        ValueError: from ``max`` when no column has a sizable type.
    """
    domain_sizes = {}

    for position, spec in enumerate(columns):
        kind = spec["type"]

        if kind == "unique":
            # A unique column never determines the row count.
            domain_sizes[position] = 1

        elif kind == "category":
            domain_sizes[position] = len(spec["values"])

        elif kind == "numeric":
            # Inclusive integer range: max - min + 1 values.
            domain_sizes[position] = (spec["values"]["max"] + 1) - spec["values"]["min"]

        elif kind == "date":
            # Inclusive number of days between both dates.
            span = (
                pd.to_datetime(spec["values"]["max"])
                - pd.to_datetime(spec["values"]["min"])
            )
            domain_sizes[position] = span.days + 1

    # On ties the first column (lowest position) wins.
    widest = max(domain_sizes, key=domain_sizes.get)
    return {"column": widest, "value": domain_sizes[widest]}

def get_random_unique(n):
    """Create ``n`` distinct random identifiers.

    Each identifier is a 16-character string drawn from ASCII letters and
    digits. Identifiers are collected in a set until ``n`` different ones
    exist, so the returned list has no duplicates and no defined order.

    Args:
        n: number of identifiers to produce.

    Returns:
        list of ``n`` unique strings (empty when ``n`` is 0 or negative).
    """
    alphabet = string.ascii_letters + string.digits
    tokens = set()

    while len(tokens) < n:
        tokens.add(''.join(random.choices(alphabet, k = 16)))

    return list(tokens)

def get_random_categories( items , n  ):
    """Draw ``n`` values from ``items`` at random, with replacement.

    Args:
        items: non-empty indexable population to sample from.
        n: number of values to draw.

    Returns:
        list with ``n`` sampled values.
    """
    sampled = random.choices(items, k=n)
    return sampled

def get_random_numbers( min , max, n ):
    """Generate ``n`` random integers in the inclusive range [min, max].

    Args:
        min: lower bound (inclusive).
        max: upper bound (inclusive).
        n: number of integers to generate.

    Returns:
        list of ``n`` integers.
    """
    return [random.randint(min, max) for _ in range(n)]

def get_random_dates( min , max , n ):
    """Draw ``n`` random days between ``min`` and ``max`` (both inclusive).

    The full daily calendar between both dates is materialised and then
    sampled with replacement.

    Args:
        min: first day of the range (anything ``pd.date_range`` accepts).
        max: last day of the range.
        n: number of dates to draw.

    Returns:
        list with ``n`` sampled dates.
    """
    calendar = pd.date_range(start=min, end=max, freq='D')
    sampled_days = get_random_categories(calendar, n)
    return sampled_days

def generate_truncated_normal_data(mean, std, min, max, n, random_state=None):
    """Sample ``n`` integers from a normal distribution truncated to [min, max].

    Args:
        mean: mean of the underlying (untruncated) normal distribution.
        std: standard deviation of the underlying distribution; must be > 0.
        min: lower truncation bound.
        max: upper truncation bound.
        n: number of samples to draw.
        random_state: optional seed or generator forwarded to
            ``truncnorm.rvs`` so a draw can be reproduced. ``None`` keeps
            the previous behaviour (global NumPy random state).

    Returns:
        list of ``n`` Python ints (each sample truncated toward zero).

    Raises:
        ValueError: if ``std`` is zero, negative or NaN. Previously this
            surfaced as a ZeroDivisionError or as a failure converting NaN
            to int, depending on the numeric type passed in.
    """
    # "not std > 0" also rejects NaN, which compares False against everything.
    if not std > 0:
        raise ValueError("La desviación estándar 'std' debe ser mayor que 0")

    # truncnorm expects the bounds expressed in standard deviations from the mean.
    a, b = (min - mean) / std, (max - mean) / std
    data = truncnorm.rvs(a, b, loc=mean, scale=std, size=n, random_state=random_state)
    return list(map(int, data))

def get_list_random_false(column):
    """List every possible value of a column, in order.

    Used for datasets configured with ``random = False`` to fill the widest
    column with its complete domain.

    Args:
        column: column spec with ``"type"``, ``"values"`` and ``"name"``.

    Returns:
        list of values: the configured categories (the same list object),
        every day in the date range, or every integer in the numeric range
        (both bounds inclusive).

    Raises:
        Error_dataframe: if the column type is not category, date or numeric.
    """
    if(column["type"] == "category"):
        return column["values"]

    if(column["type"] == "date"):
        every_day = pd.date_range(
            start = column["values"]["min"],
            end = column["values"]["max"],
            freq = 'D',
        )
        return list(every_day)

    if(column["type"] == "numeric"):
        lower = column["values"]["min"]
        upper = column["values"]["max"]
        return list(range(lower, upper + 1))

    raise Error_dataframe("La columna {} es de tipo de dato no válido".format(column["name"]))

In [4]:
def get_columns(columns, number_row, dependent_column = None):
    """Build a DataFrame with one randomly generated column per spec.

    Args:
        columns: list of column specs. Each spec has ``"name"`` and
            ``"type"`` (category, unique, date, foreign or numeric) plus the
            ``"values"`` that type needs. Specs with any other type are
            skipped.
        number_row: number of rows to generate.
        dependent_column: for foreign columns, list of dicts with
            ``"name_df"`` (the ``"dataset.column"`` reference) and ``"data"``
            (the values that may be referenced).

    Returns:
        pandas.DataFrame with the generated columns, in spec order.

    Raises:
        ValueError: if a foreign column has no matching entry in
            ``dependent_column``. Previously this either failed with an
            unrelated TypeError/UnboundLocalError or silently reused the
            values generated for the previous column.
    """
    data = {}

    for element in columns:
        element_type = element["type"]

        if element_type == "category":
            values = get_random_categories( element["values"] , number_row  )

        elif element_type == "unique":
            values = get_random_unique(number_row)

        elif element_type == "date":
            values = get_random_dates(
                element["values"]["min"], element["values"]["max"]
                , number_row)

        elif element_type == "foreign":
            # Locate the referenced data first; the last matching entry wins.
            source = None
            for candidate in dependent_column or []:
                if element["values"] == candidate["name_df"]:
                    source = candidate
            if source is None:
                raise ValueError(
                    "No se encontraron datos para la columna foránea '{}' ({})".format(
                        element["name"], element["values"]
                    )
                )
            values = get_random_categories( source["data"] , number_row  )

        elif element_type == "numeric":
            if "std" in element["values"]:
                # With a standard deviation the values follow a truncated normal.
                values = generate_truncated_normal_data(
                    element["values"]["mean"], element["values"]["std"]
                    , element["values"]["min"], element["values"]["max"]
                    , number_row)
            else:
                values = get_random_numbers(
                    element["values"]["min"], element["values"]["max"]
                    , number_row)

        else:
            # Unsupported types are ignored, as before.
            continue

        data[element["name"]] = values

    return pd.DataFrame( data )
    

In [5]:
def create_dataframe(setting, dependent_column = None):
    """Create the DataFrame described by one dataset configuration.

    Args:
        setting: dataset configuration with ``"ds"`` (dataset name),
            ``"columns"`` (list of column specs) and ``"random"``. When
            ``"random"`` is truthy, ``"random_rows"`` gives the row count.
            Otherwise the row count is the domain size of the widest column,
            and that column is filled with its full ordered domain.
        dependent_column: data for foreign columns, forwarded to
            ``get_columns``.

    Returns:
        pandas.DataFrame whose ``name`` attribute is the dataset name, or
        ``None`` if anything fails (the error is printed, not raised).
    """
    try:
        # The validation errors below used an undefined name (ErrorDataFrame),
        # which replaced the intended message with a NameError.
        if "random" not in setting:
            raise Error_dataframe("El atributo 'random' es obligatorio")

        if setting["random"]:
            if "random_rows" not in setting:
                raise Error_dataframe("El atributo 'random_rows' es obligatorio")

            if not isinstance(setting["random_rows"], (int, float)):
                raise Error_dataframe("El valor de 'random_rows' debe ser numérico")

            n = int(setting["random_rows"])
            data = get_columns(setting["columns"], n, dependent_column)

        else:
            # Generate every column except the widest one at random ...
            max_rows = get_random_rows(setting["columns"])
            copy_columns = setting["columns"].copy()
            del copy_columns[max_rows["column"]]
            data = get_columns(copy_columns, max_rows["value"], dependent_column)

            # ... then put the widest column back with its complete domain.
            widest = setting["columns"][max_rows["column"]]
            new_data = get_list_random_false(widest)

            # get_columns skips unsupported column types, so the original
            # position can exceed the number of generated columns.
            position = min(max_rows["column"], data.shape[1])
            data.insert(position, widest["name"], new_data)

        data.name = setting["ds"]
        return data

    except Exception as e:
        print(f"Error en la creación del DataFrame: {e}")
        return None


In [6]:
def search_foreign(dictionaries):
    """Validate foreign-key references and map each one to its dependent dataset.

    Args:
        dictionaries: list of dataset configurations, each with ``"ds"`` and
            ``"columns"``. Foreign columns carry ``"values"`` in the form
            ``"dataset.column"``.

    Returns:
        dict mapping ``"dataset.column"`` (the referenced column) to the
        name of the dataset that references it.
        NOTE(review): keys are per referenced column, so when two datasets
        reference the same column only the last one is kept.

    Raises:
        Error_dataframe: if a reference is malformed, the referenced
            dataset does not exist, or it has no such column.
    """
    relations = {}
    for dictionary in dictionaries:
        for column in dictionary['columns']:
            if column['type'] != 'foreign':
                continue

            parts = column['values'].split('.')
            if len(parts) != 2:
                raise Error_dataframe(
                    "La referencia foránea '{}' debe tener el formato 'dataset.columna'".format(
                        column['values']
                    )
                )
            foreign_ds, foreign_col = parts

            targets = [dic for dic in dictionaries if dic['ds'] == foreign_ds]
            if not targets:
                raise Error_dataframe(
                    "No se encontró el dataset {}. Verifique que el dataset exista.".format(foreign_ds)
                )

            # List the columns of the referenced dataset itself; the message
            # used to show the columns of whichever dataset was visited last.
            available = [col['name'] for dic in targets for col in dic['columns']]
            if foreign_col not in available:
                raise Error_dataframe(
                    "No se encontró la columna {} en el dataset {}. "
                    "Las posibles columnas son: {}".format(
                        foreign_col, foreign_ds, available
                    )
                )

            relations[foreign_ds + "." + foreign_col] = dictionary['ds']
    return relations

In [7]:
class Error_dataframe(Exception):
    """Error raised when a dataset configuration is invalid."""

    def __init__(self, message):
        # Forward the message unchanged to the base Exception.
        Exception.__init__(self, message)

In [8]:
def build_dataframes(conf_list):
    """Create every configured DataFrame, resolving foreign keys in order.

    Datasets without foreign columns are created first, in list order.
    Datasets with foreign columns are then created in passes: each one is
    built as soon as every dataset it references exists, so the result no
    longer depends on the order of ``conf_list``. The dependencies are read
    from each dataset's own foreign columns, so several datasets may
    reference the same column.

    Args:
        conf_list: list of dataset configurations (see ``create_dataframe``).

    Returns:
        list of DataFrames (independent datasets first, then dependent
        ones), or ``None`` if an error occurs (the error is printed, not
        raised). An entry is ``None`` when ``create_dataframe`` failed for
        that dataset.
    """
    dataframe_list = []
    try:
        # Called for validation: raises when a foreign reference is invalid.
        search_foreign(conf_list)

        built = {}      # dataset name -> DataFrame created so far
        pending = []    # (configuration, foreign references) still to build

        for element in conf_list:
            # dict.fromkeys removes repeated references, keeping their order.
            foreign_keys = list(dict.fromkeys(
                column["values"]
                for column in element["columns"]
                if column["type"] == "foreign"
            ))
            if foreign_keys:
                pending.append((element, foreign_keys))
                continue

            new = create_dataframe(element)
            dataframe_list.append(new)
            if new is not None:
                built[element["ds"]] = new

        while pending:
            unresolved = []
            for element, foreign_keys in pending:
                # Wait until every referenced dataset has been created.
                if any(key.split(".")[0] not in built for key in foreign_keys):
                    unresolved.append((element, foreign_keys))
                    continue

                dependency = []
                for key in foreign_keys:
                    df, column = key.split(".")
                    dependency.append(
                        {
                            "name_df": key,
                            "data": list(built[df][column])
                        }
                    )

                new = create_dataframe(element, dependency)
                dataframe_list.append(new)
                if new is not None:
                    built[element["ds"]] = new

            # No progress in a whole pass: circular reference, or a
            # referenced dataset could not be created.
            if len(unresolved) == len(pending):
                raise ValueError(
                    "No se pudieron resolver las dependencias de: {}".format(
                        [element["ds"] for element, _ in unresolved]
                    )
                )
            pending = unresolved

        return dataframe_list
    except Exception as e:
        print(f"Error en la build_dataframes: {e}")
        return None

In [190]:
def get_categorical_dataset_simulated(simulation_extended, category_cols, n):
    """Simulate about ``n`` rows keeping the observed category proportions.

    Each observed combination of ``category_cols`` is repeated in proportion
    to its relative frequency in ``simulation_extended``.

    Args:
        simulation_extended: source DataFrame.
        category_cols: column name or list of column names that define the
            combinations.
        n: target number of simulated rows. Each combination is rounded
            independently, so the total can differ slightly from ``n``.

    Returns:
        DataFrame with only the category columns, the original dtypes, and
        a fresh 0..k-1 index.
    """
    cols = [category_cols] if isinstance(category_cols, str) else list(category_cols)

    combos = (
        simulation_extended
        .groupby(cols, observed=True)
        .size()
        .reset_index(name='counts')
    )
    combos['prob'] = combos['counts'] / len(simulation_extended)
    combos['num_samples'] = (combos['prob'] * n).round().astype(int)

    # Repeat each combination by label instead of iterating rows: iterrows
    # upcasts the row (an all-numeric row turns num_samples into a float,
    # which cannot repeat a list) and concat in a loop is quadratic.
    repeated = combos.index.repeat(combos['num_samples'].to_numpy())
    simulated = combos.loc[repeated, cols].reset_index(drop=True)

    return simulated

In [None]:
def get_numeric_column_simulated( simulated , df_origin , categories , column_name ):
    """Add a simulated numeric column to ``simulated``, group by group.

    For every combination of ``categories`` found in ``df_origin`` the min,
    max, mean and standard deviation of ``column_name`` are computed. The
    rows of ``simulated`` with that same combination then receive values
    drawn from a truncated normal with those parameters.

    Args:
        simulated: DataFrame containing the ``categories`` columns (for
            instance the output of ``get_categorical_dataset_simulated``).
        df_origin: original DataFrame the statistics are taken from.
        categories: list of category column names.
        column_name: numeric column of ``df_origin`` to simulate.

    Returns:
        DataFrame with the rows of ``simulated`` that match a combination
        of ``df_origin``, ordered by group, plus ``column_name``. The index
        is reset.
    """
    stats = df_origin.groupby(
        categories
        , as_index = False
        , observed = True
    ).agg(
        {
            column_name: ["min","max","mean","std"]
        }
    )
    stat_columns = [ c for c in categories ]
    stat_columns.extend( ["Min" , "Max" , "Mean" , "Std"] )
    stats.columns = stat_columns

    pieces = []
    for i in stats.index:
        rs = stats.loc[i]

        # Select the simulated rows of this group by category value. The
        # previous simulated.loc[i] picked a single row by index label.
        mask = np.ones(len(simulated), dtype=bool)
        for category in categories:
            mask &= (simulated[category] == rs[category]).fillna(False).to_numpy(dtype=bool)
        segment = simulated.loc[mask].copy()
        if segment.shape[0] == 0:
            continue

        std, low, high = rs["Std"], rs["Min"], rs["Max"]
        # A constant group (std == 0) or a single-row group (std is NaN)
        # cannot parameterise the distribution: use std = 1 and widen the
        # bounds by one unit on each side.
        if not std > 0:
            std, low, high = 1, low - 1, high + 1

        segment[column_name] = generate_truncated_normal_data(
            rs["Mean"], std, low, high, segment.shape[0]
        )
        pieces.append(segment)

    if not pieces:
        empty = simulated.iloc[0:0].copy()
        empty[column_name] = []
        return empty.reset_index(drop=True)

    return pd.concat(pieces).reset_index(drop=True)


In [9]:
# Dataset "pacientes". With random = False the row count comes from the
# widest column domain (see create_dataframe).
d1 = {
    "ds": "pacientes",
    "columns": [
        {"name": "id", "type": "unique"},
        {
            "name": "edad",
            "type": "numeric",
            # "std" is present, so values follow a truncated normal.
            "values": {"min": 1, "max": 100, "std": 3.48, "mean": 46.5},
        },
        {"name": "genero", "type": "category", "values": ["Femenino", "Masculino"]},
        {
            "name": "primera_cita",
            "type": "date",
            "values": {"min": "1930-02-23", "max": "2024-06-04"},
        },
    ],
    "random": False,
}

In [22]:
# Dataset "doctores". The "hospital" column references hospitales.id.
d2 = {
    "ds": "doctores",
    "columns": [
        {"name": "id", "type": "unique"},
        {
            "name": "fecha_ingreso",
            "type": "date",
            "values": {"min": "2000-02-23", "max": "2024-03-04"},
        },
        {
            "name": "departamento",
            "type": "category",
            "values": ["MGeneral", "Cirugía", "Oncología", "Pediatría"],
        },
        {"name": "hospital", "type": "foreign", "values": "hospitales.id"},
    ],
    "random": False,
}

In [23]:
# Dataset "hospitales": fully random, with a fixed number of rows.
d3 = {
    "ds": "hospitales",
    "columns": [
        {"name": "id", "type": "unique"},
        {
            "name": "ciudad",
            "type": "category",
            "values": ["SP", "T", "C", "M", "SB", "PC"],
        },
        {
            "name": "nombre",
            "type": "category",
            "values": ["nombre E", "nombre T", "nombre B", "nombre Z", "nombre C", "nombre P"],
        },
        {
            "name": "cantidad_empleados",
            "type": "numeric",
            # No "std": integers are drawn uniformly between min and max.
            "values": {"min": 800, "max": 10000},
        },
    ],
    "random": True,
    "random_rows": 1500,
}

In [12]:
# Dataset "citas": references doctores, pacientes and enfermedades.
# Note the size: ten million random rows.
d4 = {
    "ds": "citas",
    "columns": [
        {"name": "doctor", "type": "foreign", "values": "doctores.id"},
        {"name": "paciente", "type": "foreign", "values": "pacientes.id"},
        {
            "name": "fecha_cita",
            "type": "date",
            "values": {"min": "1930-02-23", "max": "2024-06-04"},
        },
        {"name": "enfermedad", "type": "foreign", "values": "enfermedades.id"},
    ],
    "random": True,
    "random_rows": 10000000,
}

In [13]:
# Dataset "enfermedades". With random = False the row count comes from the
# widest column domain (see create_dataframe).
d5 = {
    "ds": "enfermedades",
    "columns": [
        {"name": "id", "type": "unique"},
        {"name": "estado", "type": "category", "values": ["leve", "moderado", "grave"]},
        {
            "name": "porcentaje_recuperacion",
            "type": "numeric",
            # "std" is present, so values follow a truncated normal.
            "values": {"min": 1, "max": 80, "std": 3.48, "mean": 75.9},
        },
    ],
    "random": False,
}

In [24]:
# Build every configured dataset. build_dataframes prints the error and
# returns None if anything fails, so dataframe_list may be None here.
conf_list = [d5, d2, d4, d1, d3]
dataframe_list = build_dataframes(conf_list)

In [130]:
# Select the "hospitales" dataset by name rather than by position: index 2
# only pointed at it because of the order in which the datasets were built.
simulation_extended = next(
    frame for frame in dataframe_list
    if getattr(frame, "name", None) == "hospitales"
)

Análisis de las variables categóricas

In [131]:
# Work on a new frame without the identifier column; drop returns a new
# DataFrame, so simulation_extended is left untouched.
new_simulation = simulation_extended.drop(columns=["id"])

In [155]:
# Count the rows of each (nombre, ciudad) combination and its share of the
# whole dataset.
group = (
    new_simulation
    .groupby(["nombre", "ciudad"], as_index=False)
    .agg(Cantidad=("cantidad_empleados", "count"))
)
group["probs"] = group["Cantidad"] / len(simulation_extended)

In [192]:
# Split the columns by dtype. Object columns are converted to "category"
# in place (this modifies simulation_extended) and recorded as categorical;
# float64/int64 columns are recorded as numeric.
category_cols, numeric_cols = [], []
for column in simulation_extended.columns:
    if simulation_extended[column].dtype == "object":
        category_cols.append(column)
        simulation_extended[column] = simulation_extended[column].astype('category')
    elif (simulation_extended[column].dtype == "float64"
            or simulation_extended[column].dtype == "int64"):
        numeric_cols.append(column)

