In [1]:
# Importar libs necessárias
import pandas as pd
import numpy as np
import random
import requests
import json
from itertools import cycle
from random import randint
from random import choice

#### Função de gerar fake dataframes
Reference: https://towardsdatascience.com/generating-fake-data-with-pandas-very-quickly-b99467d4c618

In [2]:
def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None):
    """Build a DataFrame of random data with one column per entry of ``cols``.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    cols : str or sequence of str
        One type code per column: "i" (integer), "f" (float), "c" (category)
        or "d" (date). A plain string such as "ifcd" is read one character per column.
    col_names : list of str, optional
        Column names, one per entry of ``cols``. When None, names of the form
        ``column_<position>_<int|float|cat|date>`` are generated.
    intervals : list or dict, optional
        * list: one entry per column; a None entry falls back to the default
          for that column's type.
        * dict: overrides for the per-type defaults, keyed by type code.
        * None: the per-type defaults are used for every column.
        For "i", "f" and "d" an entry is a ``(start, end)`` tuple. For "c" it is
        either a ``(family, n_categories)`` tuple, where ``family`` is one of the
        built-in families ("animals", "names", "cities", "colors"), or an explicit
        list of category values.
    seed : optional
        Forwarded to ``numpy.random.default_rng`` so results can be reproduced.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows, one column per entry of ``cols``. Integers are drawn with
        ``rng.integers(start, end)``, floats are uniform and rounded to 2 decimals,
        dates are sampled from ``pd.date_range(start, end)``.

    Raises
    ------
    AssertionError
        If ``col_names``/``intervals`` (as lists) do not match ``cols`` in length,
        if an ``intervals`` dict has unknown keys, if an interval has the wrong
        shape, or if a category family is not one of the built-in ones.
    """
    categories_dict = {'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
                       'names'  : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
                       'cities' : ['São Paulo', 'Belo Horizonte', 'Recife', 'Fortaleza', 'Salvador', 'Curitiba', 'Porto Alegre', 'Manaus', 'Palmas', 'Río de Janeiro', 'Macapá', 'Teresina', 'Goiânia', 'Brasília', 'Vitória', 'Florianópolis', 'Curitiba', 'João Pessoa', 'São Luís', 'Natal', 'Cuiabá', 'Campo Grande'],
                       'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white']
                      }
    default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("names", 5), "d" : ("2020-01-01","2020-12-31")}
    rng = np.random.default_rng(seed)

    # By default, successive "c" columns rotate through the category families,
    # starting with the family named in the default interval.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    if isinstance(col_names,list):
        assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"
    elif col_names is None:
        suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

    if isinstance(intervals,list):
        assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
    else:
        if isinstance(intervals,dict):
            assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, f"The intervals parameter has invalid keys"
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]

    df = pd.DataFrame()
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]
        assert (len(interval) == 2 and isinstance(interval, tuple)) or isinstance(interval, list), f"This interval {interval} is neither a tuple of two elements nor a list of strings."
        if col in ("i","f","d"):
            start, end = interval
        if col == "i":
            df[col_name] = rng.integers(start, end, size)
        elif col == "f":
            df[col_name] = np.round(rng.uniform(start, end, size), 2)
        elif col == "c":
            if isinstance(interval, list):
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    cat_family = next(cat_family)
                assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
                # Pick `length` *distinct* categories. The pool is de-duplicated first
                # (a built-in list may repeat a value) and sampled without replacement;
                # sampling with replacement would often return fewer distinct values
                # than requested. The count is capped at the pool size so an
                # oversized request still works and simply uses every value.
                pool = list(dict.fromkeys(categories_dict[cat_family]))
                categories = rng.choice(pool, min(length, len(pool)), replace = False, shuffle = True)
            df[col_name] = rng.choice(categories, size, shuffle = True)
        elif col == "d":
            df[col_name] = rng.choice(pd.date_range(start, end), size)
    return df

#### Função para gerar colunas de quantidade pretendida e tipo de consumidor (pessoa física ou jurídica)
Na coluna de quantidade pretendida, foi levado em conta se o consumidor é uma pessoa jurídica ou física, pois uma pessoa jurídica normalmente é um estabelecimento, ou seja, busca uma quantidade maior de café para compra.

In [3]:
def generate_person_type_and_desired_quantity(size):
    """Generate ``size`` random ``[person_type, desired_quantity]`` pairs.

    ``person_type`` is drawn uniformly from ``{1, 2}``. Type 2 always gets a
    desired quantity of 60; type 1 gets 0.5 or 1, chosen at random.
    NOTE(review): type 2 presumably stands for a legal entity (business) and
    type 1 for an individual -- confirm against the consumer of this data.

    Returns a list of two-element lists, in generation order.
    """
    small_quantities = [0.5, 1]
    pairs = []
    for _ in range(size):
        kind = choice([1, 2])
        # The second random draw only happens for type 1, as type 2 is fixed at 60.
        quantity = 60 if kind == 2 else choice(small_quantities)
        pairs.append([kind, quantity])
    return pairs

#### Função para gerar colunas de condição financeira do consumidor e se procura café especial
Possivelmente, quem tem interesse em café especial está disposto a gastar um pouco mais.

In [4]:
def generate_search_special_coffee_and_financial_condition(size):
    """Generate ``size`` shuffled ``[seeks_special_coffee, financial_condition]`` pairs.

    About 65% of the rows (``round(size * 0.65)``) are flagged ``False`` and get a
    financial condition drawn from a normal distribution with mean 100 and
    standard deviation 20; the remaining rows are flagged ``True`` and drawn with
    mean 300 and standard deviation 50. Values are rounded to integers.

    Returns a list of two-element lists in random order.
    """
    regular_share = 0.65
    regular_count = round(size * regular_share)
    special_count = size - regular_count

    pairs = [[False, round(random.gauss(100, 20))] for _ in range(regular_count)]
    pairs.extend([True, round(random.gauss(300, 50))] for _ in range(special_count))

    # Mix the two groups so the flag does not follow the row order.
    np.random.shuffle(pairs)
    return pairs

#### Função para gerar colunas de pontuação do café e residências
Pontuação do café gerada com base na altitude e qualidade médias dos cafés nos estados pesquisados

In [5]:
def generate_coffee_score_and_residences(size):
    """Generate ``size`` random ``[state_id, coffee_score]`` pairs.

    ``state_id`` is drawn uniformly from 1..6 and the score is drawn from a
    normal distribution whose mean and standard deviation depend on the state,
    then rounded to an integer.

    Returns a list of two-element lists, in generation order.
    """
    # state_id -> (mean, standard deviation) of the coffee score
    score_params = {
        1: (62, 7),  # Bahia
        2: (79, 3),  # Minas Gerais
        3: (62, 8),  # Espírito Santo
        4: (68, 7),  # Rio de Janeiro
        5: (76, 5),  # São Paulo
        6: (72, 7),  # Paraná
    }
    pairs = []
    for _ in range(size):
        state_id = choice([1, 2, 3, 4, 5, 6])
        mean, deviation = score_params[state_id]
        pairs.append([state_id, round(random.gauss(mean, deviation))])
    return pairs

#### Função para gerar colunas de quantidade produzida do café e se produz café especial
Normalmente, produtores de café especial produzem em quantidade menor do que quem não produz café especial.

In [6]:
def generate_produces_special_coffee_and_quantity_produced(size):
    """Generate ``size`` shuffled ``[produces_special_coffee, quantity_produced]`` pairs.

    About 65% of the rows (``round(size * 0.65)``) are flagged ``False`` and get a
    quantity drawn from a normal distribution with mean 900 and standard
    deviation 200; the remaining rows are flagged ``True`` and drawn with mean 700
    and standard deviation 200. Quantities are rounded to integers and never
    negative.

    Returns a list of two-element lists in random order.
    """
    percentage = 0.65
    size_not_special = round(size * percentage)
    size_special = size - size_not_special

    aux_array = []
    for _ in range(size_not_special):
        # Zero is only 3.5-4.5 standard deviations below these means, so a large
        # dataset will occasionally draw a negative value; a produced quantity
        # cannot be negative, so clamp at 0.
        quantity_produced = max(0, round(random.gauss(900, 200)))
        aux_array.append([False, quantity_produced])
    for _ in range(size_special):
        quantity_produced = max(0, round(random.gauss(700, 200)))
        aux_array.append([True, quantity_produced])

    # Mix the two groups so the flag does not follow the row order.
    np.random.shuffle(aux_array)
    return aux_array