In [1]:
import pandas as pd
import random
import numpy as np
import datetime

In [2]:
def generate_dataframe(num_rows: int = 10, seed=None) -> pd.DataFrame:
    """Build a synthetic table of insurance-style user records.

    Every row gets a random 6-letter uppercase code, today's date, three
    categorical attributes (``smoker``, ``region``, ``sex``) drawn from fixed
    class weights, and three numeric attributes (``age``, ``bmi``,
    ``children``) drawn from a normal distribution and clipped to their
    configured ``[min, max]`` range.

    Parameters
    ----------
    num_rows : int, default 10
        Number of rows to generate. Must be non-negative.
    seed : int or None, default None
        Seed for the random generator. ``None`` draws fresh entropy on every
        call; passing a fixed value makes the random columns reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``user_cod`` (str), ``date`` (``datetime.date``),
        ``age`` (int), ``sex`` (str), ``bmi`` (float, 2 decimals),
        ``children`` (int), ``smoker`` (str), ``region`` (str).

    Raises
    ------
    ValueError
        If ``num_rows`` is negative.
    """
    if num_rows < 0:
        raise ValueError(f"num_rows must be non-negative, got {num_rows}")

    # Variable configuration
    numeric_variables = {
        'age': {'min': 20, 'mean': 35, 'max': 70},
        'bmi': {'min': 15.96, 'mean': 30.4, 'max': 53.3},
        'children': {'min': 0, 'mean': 1, 'max': 5}}

    categorical_variables = {
        'smoker': {'classes': ['no', 'yes'], 'distribution': [80, 20]},
        'region': {'classes': ['southeast', 'southwest', 'northwest', 'northeast'],
                   'distribution': [25, 25, 25, 25]},
        'sex': {'classes': ['male', 'female'], 'distribution': [50, 50]}}

    # A single local generator drives every random column, so one seed is
    # enough to reproduce the whole frame and no global RNG state is touched.
    rng = np.random.default_rng(seed)

    # Data generation
    data = {}

    # One row of 6 random letters per user, joined into a code string.
    # Codes are random, not guaranteed unique.
    alphabet = np.array(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
    data['user_cod'] = [''.join(letters)
                        for letters in rng.choice(alphabet, size=(num_rows, 6))]

    data['date'] = [datetime.date.today()] * num_rows

    for variable, info in categorical_variables.items():
        # Weights are given as percentages; normalise them to probabilities.
        weights = np.asarray(info['distribution'], dtype=float)
        data[variable] = rng.choice(info['classes'], size=num_rows,
                                    p=weights / weights.sum())

    for variable, info in numeric_variables.items():
        low, high = info['min'], info['max']
        # One sixth of the min-max span is used as the standard deviation.
        draws = rng.normal(loc=info['mean'], scale=(high - low) / 6, size=num_rows)

        if variable == 'bmi':
            draws = draws.round(2)
        else:
            # Integer-valued variables: the fractional part is truncated.
            draws = draws.astype(int)

        # Clip every numeric variable (bmi included) so that no value leaves
        # its configured range.
        data[variable] = np.clip(draws, low, high)

    column_order = ['user_cod', 'date', 'age', 'sex', 'bmi', 'children', 'smoker', 'region']
    df = pd.DataFrame(data, columns=column_order).astype({
        'user_cod': str,
        'age': int,
        'sex': str,
        'bmi': float,
        'children': int,
        'smoker': str,
        'region': str,
    })
    # Normalise the date column to plain datetime.date objects.
    df['date'] = pd.to_datetime(df['date']).dt.date

    return df


In [3]:
# Build one synthetic sample frame; the values differ on every run because
# generate_dataframe() draws them at random.
df = generate_dataframe()

In [4]:
# Bare last expression: the notebook renders the generated frame as a table.
df

Unnamed: 0,user_cod,date,age,sex,bmi,children,smoker,region
0,QBCMYY,2023-07-11,36,male,23.84,1,no,southwest
1,XJQAMU,2023-07-11,20,male,35.71,1,yes,northeast
2,XMVLON,2023-07-11,24,female,42.66,0,no,northwest
3,YGFPHC,2023-07-11,44,male,34.57,2,yes,northeast
4,BLOTVZ,2023-07-11,33,male,27.04,0,no,southwest
5,EFKOFW,2023-07-11,24,male,40.13,1,no,northeast
6,MIQCTW,2023-07-11,44,male,30.54,1,no,southeast
7,VKAGWG,2023-07-11,28,female,39.33,0,yes,southeast
8,IUKXPD,2023-07-11,48,male,41.02,1,no,southwest
9,KNYOEL,2023-07-11,35,female,38.13,0,no,northeast
