# População e amostra

In [69]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import random
import numpy as np

### Leitura de dados

In [70]:
# Input data
path_file = r"../data/raw_data/census.csv"
dataset = pd.read_csv(path_file, sep=',', encoding='utf8')
print("Base de dados")
print(f'Dimensão: {dataset.shape[0]} linhas e {dataset.shape[1]} colunas')
dataset.sample(n=5)

Base de dados
Dimensão: 32561 linhas e 15 colunas


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
3121,21,Private,155066,Some-college,10,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
9891,30,Private,241583,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K
25999,37,Private,205339,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,49,United-States,<=50K
16704,19,Private,252862,Assoc-voc,11,Never-married,Prof-specialty,Own-child,White,Female,0,0,40,United-States,<=50K
10785,45,Private,242994,Some-college,10,Never-married,Sales,Not-in-family,White,Male,0,0,52,United-States,<=50K


## Amostragem simples

In [71]:
# Functions
def amostragem_simples(dataset, samples):
    """ Return a simple random sample """
    return dataset.sample(n = samples, random_state=42)

In [72]:
df_amostra_simples = amostragem_simples(dataset, 100)

print("Linhas e colunas")
print(df_amostra_simples.shape)
df_amostra_simples.head()

Linhas e colunas
(100, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
14160,27,Private,160178,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,38,United-States,<=50K
27048,45,State-gov,50567,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
28868,29,Private,185908,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,55,United-States,>50K
5667,30,Private,190040,Bachelors,13,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
7827,29,Self-emp-not-inc,189346,Some-college,10,Divorced,Craft-repair,Not-in-family,White,Male,2202,0,50,United-States,<=50K


## Amostragem sistemática

In [73]:
def amostragem_sistematica(dataset, amostras):
    """Returna uma amostra sistemática"""
    intervalo = len(dataset) // amostras
    random.seed(1)
    inicio = random.randint(0, intervalo)
    indices = np.arange(inicio, len(dataset), step=intervalo)
    amostra_sistematica = dataset.iloc[indices]
    return amostra_sistematica

In [74]:
# Amostra sistemática
df_amostra_sistematica = amostragem_sistematica(dataset, 100)

print("Linhas e colunas")
print(df_amostra_sistematica.shape)
df_amostra_sistematica.head()

Linhas e colunas
(100, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K
1043,44,Private,167005,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,60,United-States,>50K
1368,52,Private,152234,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,99999,0,40,Japan,>50K


## Amostragem por agrupamento 

In [75]:
def amostragem_agrupamento(dataset, numero_grupos):
    intervalo = len(dataset) // numero_grupos
    grupos = []
    id_grupo = 0
    contagem = 0

    for _ in dataset.iterrows():
        grupos.append(id_grupo)
        contagem += 1
        if contagem > intervalo:
            contagem = 0
            id_grupo += 1

    dataset['grupo'] = grupos
    random.seed(1)
    grupo_selecionado = random.randint(0, numero_grupos)
    return dataset[dataset['grupo'] == grupo_selecionado]

In [76]:
# Amostra por grupos
df_amostra_agrupamento = amostragem_agrupamento(dataset, 100)
print("\nAmostra agrupamento")
print(f'Dimensão: {df_amostra_agrupamento.shape[0]} linhas e {df_amostra_agrupamento.shape[1]} colunas')
df_amostra_agrupamento.head()


Amostra agrupamento
Dimensão: 326 linhas e 16 colunas


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
5542,40,Self-emp-inc,169878,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K,17
5543,44,Private,296728,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K,17
5544,33,Local-gov,342458,Assoc-acdm,12,Divorced,Protective-serv,Not-in-family,White,Male,0,0,56,United-States,<=50K,17
5545,21,Local-gov,38771,Some-college,10,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States,<=50K,17
5546,35,Self-emp-not-inc,269300,Bachelors,13,Never-married,Other-service,Not-in-family,Black,Female,0,0,60,United-States,<=50K,17


## Amostragem estratificada

O objetivo é obter uma amostra que se respeite a proporção entre os elementos. Por exemplo, numa população de 90 pessoas, com 54 mulheres e 36 homens, se queremos uma amostra de 10%, teremos que selecionar proporcionalmente os 9 elementos

In [77]:
from sklearn.model_selection import StratifiedShuffleSplit

def amostragem_estratificada(dataset, tamanho_amostra):
    # Calcula o tamanho da amostra relativo ao total do dataset
    test_size = tamanho_amostra / len(dataset)
    random_state = 1
    
    # Configura o split para realizar uma divisão estratificada
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    
    # Realiza o split
    for train_index, test_index in split.split(dataset, dataset['income']):
        df_amostra_estratificada = dataset.iloc[test_index]
    
    return df_amostra_estratificada

In [78]:
# Exemplo de uso
df_amostra_estratificada = amostragem_estratificada(dataset, 100)
print("\nAmostra estratificada")
print(f'Dimensão: {df_amostra_estratificada.shape[0]} linhas e {df_amostra_estratificada.shape[1]} colunas')
df_amostra_estratificada.head()


Amostra estratificada
Dimensão: 100 linhas e 16 colunas


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
15955,51,Private,392286,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,>50K,48
19059,53,Private,132304,HS-grad,9,Divorced,Machine-op-inspct,Not-in-family,White,Female,0,0,40,Scotland,<=50K,58
26608,60,Private,166330,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,<=50K,81
20737,56,Private,274475,9th,5,Widowed,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K,63
26493,51,Private,137815,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,7298,0,40,United-States,>50K,81


In [79]:
df_amostra_estratificada['income'].value_counts()

income
<=50K    76
>50K     24
Name: count, dtype: int64

## Amostragem de reservatório

Esta técnica é aplicada a _data stream_. Sortear itens num stream. Mas eles deve ter a mesma probabilidade.

In [80]:
def amostragem_reservatorio(dataset, amostras):
    stream = []
    tamanho = len(dataset)
    reservatorio = [0]*amostras
    
    for i in range(tamanho):
        stream.append(i)

    i = 0
    for i in range(amostras):
        reservatorio[i] = stream[i]

    
    while i < tamanho:
        j = random.randrange(i + 1)
        if j < amostras:
            reservatorio[j] = stream[i]
        i += 1
    
    return dataset.iloc[reservatorio]

In [81]:
df_amostragem_reservatorio = amostragem_reservatorio(dataset, 100)
df_amostragem_reservatorio.shape

(100, 16)

In [82]:
df_amostragem_reservatorio.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
29608,41,Self-emp-inc,114580,Prof-school,15,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,2415,55,United-States,>50K,90
21696,37,Federal-gov,329088,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,Black,Male,0,0,40,United-States,<=50K,66
30676,42,Private,355728,Assoc-voc,11,Never-married,Craft-repair,Not-in-family,White,Male,0,0,44,United-States,<=50K,94
28550,43,Private,110970,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,40,United-States,>50K,87
8768,52,Federal-gov,221532,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,45,United-States,>50K,26


## Comparativo de amostragem

In [83]:
dataset['age'].mean()

38.58164675532078

In [84]:
df_amostra_simples['age'].mean()

38.32

In [85]:
df_amostra_sistematica['age'].mean()

37.57

In [86]:
df_amostra_agrupamento['age'].mean()

39.23312883435583

In [87]:
df_amostra_estratificada['age'].mean()

39.37

In [88]:
df_amostragem_reservatorio['age'].mean()

37.85