# Dados

In [2]:
import pandas as pd
import numpy as np
import random

path = 'Bases de dados/'
df = pd.read_csv(path + 'census.csv')

# Amostragem Sistemática

In [3]:
def amostragem_sistematica(dataset, amostras):
    intervalo = len(dataset)//amostras
    random.seed(1)
    inicio = random.randint(0, intervalo)
    indices = np.arange(inicio, len(dataset), step = intervalo)
    amostra_sistematica = dataset.iloc[indices]
    return amostra_sistematica

In [4]:
amostragem_sistematica(df, 100)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K
1043,44,Private,167005,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,60,United-States,>50K
1368,52,Private,152234,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,99999,0,40,Japan,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30943,33,Private,48010,Some-college,10,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,<=50K
31268,43,Private,306440,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,66,France,<=50K
31593,37,Private,171968,Assoc-voc,11,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K
31918,26,Private,154571,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,Asian-Pac-Islander,Male,0,0,45,United-States,>50K


# Amostragem por grupos

In [5]:
def amostragem_agrupamento(dataset, numero_grupos):
    intervalo = len(dataset) / numero_grupos
    grupos = []
    id_grupo = 0
    contagem = 0
    for _ in dataset.iterrows():
        grupos.append(id_grupo)
        contagem += 1
        if contagem > intervalo:
            contagem = 0
            id_grupo += 1

    dataset['grupo'] = grupos
    random.seed(1)
    grupo_selecionado = random.randint(0, numero_grupos)
    return dataset[dataset['grupo']== grupo_selecionado]


In [6]:
df_amostra_agrupamento = amostragem_agrupamento(df, 100)
df_amostra_agrupamento.shape, df_amostra_agrupamento['grupo'].value_counts()

((326, 16),
 grupo
 17    326
 Name: count, dtype: int64)

# Amostragem Estratificada

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit


In [8]:
df['income'].value_counts(normalize=True)

income
<=50K    0.75919
>50K     0.24081
Name: proportion, dtype: float64

In [9]:
amostras = 100
split = StratifiedShuffleSplit(test_size=amostras / len(df))
for x,y in split.split(df,df['income']):
    df_x = df.iloc[x]
    df_y = df.iloc[y]


In [10]:
amostras / len(df)

0.0030711587481956942

In [11]:
df_x.shape, df_y.shape

((32461, 16), (100, 16))

In [12]:
df_y['income'].value_counts()

income
<=50K    76
>50K     24
Name: count, dtype: int64

In [14]:
def amostragem_estratificada(df, percentual):
    split = StratifiedShuffleSplit(test_size=percentual, random_state=1)
    for x,y in split.split(df,df['income']):
        df_x = df.iloc[x]
        df_y = df.iloc[y]

    return df_y

df_amostra_estratificada = amostragem_estratificada(df, 0.0030711587481956942)
df_amostra_estratificada

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
5611,47,Private,176319,HS-grad,9,Married-civ-spouse,Sales,Own-child,White,Female,0,0,38,United-States,>50K,17
30532,41,Private,242619,Assoc-acdm,12,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,80,United-States,<=50K,93
4586,21,Private,152540,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,35,United-States,<=50K,14
15764,33,Private,132705,9th,5,Separated,Adm-clerical,Not-in-family,White,Male,0,0,48,United-States,<=50K,48
4511,24,Private,140001,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,21,Private,185582,Some-college,10,Never-married,Sales,Own-child,White,Male,0,0,43,United-States,<=50K,43
23856,32,Private,72744,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K,73
2964,27,Private,193701,HS-grad,9,Never-married,Craft-repair,Own-child,White,Female,0,0,45,United-States,<=50K,9
15845,46,Private,207301,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,37,United-States,<=50K,48


# Amostragem de reservatório

In [None]:
def amostragem_reservatorio(dataset, amostras):
    stream = []
    for i in range(len(dataset)):
        stream.append(i)

    i = 0
    tamanho = len(dataset)

    reservatorio = [0] * amostras
    for i in range(amostras):
        reservatorio[i] = stream[i]

    while i < tamanho:
        j = random.randrange(i + 1)
        if j < amostras:
            reservatorio[j] = stream[i]
        i += 1

    return dataset.iloc[reservatorio]


In [None]:
df_amostra_reservatorio = amostragem_reservatorio(df, 100)
df_amostra_reservatorio.shape

(100, 16)

In [None]:
df_amostra_reservatorio.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
28106,22,Private,221480,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K,86
310,26,Private,59306,Bachelors,13,Never-married,Sales,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
2845,50,Private,121685,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,>50K,8
27135,61,Private,270056,HS-grad,9,Divorced,Adm-clerical,Not-in-family,Asian-Pac-Islander,Female,0,0,40,Japan,<=50K,83
26804,44,Private,33155,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,45,United-States,<=50K,82
