## 1 - Amostragem

In [11]:
# importando as bibliotecas
import numpy as np
import statsmodels.api as sm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy import stats
import statistics
import random

In [3]:
# Load the US census dataset from a CSV file
df = pd.read_csv('bases/census.csv')

In [4]:
# Number of rows and columns in the loaded dataset
df.shape

(32561, 15)

In [5]:
# Preview the first five rows of the dataset
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### 1.1 - Amostragem Aleatória Simples

In [7]:
# Draw a simple random sample of 100 distinct rows (sampling without replacement).
# A fixed random_state keeps the sample identical across kernel restarts, so the
# notebook is reproducible under Restart & Run All.
df_amostra_aleatoria_simples = df.sample(n = 100, random_state = 1)
df_amostra_aleatoria_simples.shape

(100, 15)

In [8]:
# Preview the first five rows of the simple random sample
df_amostra_aleatoria_simples.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
8416,25,Private,213412,Bachelors,13,Never-married,Tech-support,Unmarried,White,Male,0,0,40,United-States,<=50K
25160,38,Private,119177,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,55,United-States,>50K
2073,51,Local-gov,133050,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,40,United-States,>50K
23628,72,Self-emp-not-inc,139889,Some-college,10,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,74,United-States,<=50K
23598,42,Private,201466,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K


### 1.2 - Amostragem Sistemática

In [9]:
# We want a sample of 100 people,
# so we pick one person every 325 rows (dataset size // sample size)
df.shape[0] // 100

325

In [14]:
# Systematic sampling helper
def amostragem_sistematica(dataset, amostras, seed = 1):
    """Draw a systematic sample of exactly ``amostras`` rows from ``dataset``.

    The sampling interval is ``len(dataset) // amostras``. A random starting
    position is drawn inside the first interval and one row is then taken
    every ``intervalo`` positions, ``amostras`` times.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to sample from (rows are selected by position, not by label).
    amostras : int
        Number of rows to return; must satisfy ``1 <= amostras <= len(dataset)``.
    seed : int or None, default 1
        Value passed to ``random.seed`` before drawing the starting position.
        Note that this reseeds Python's global ``random`` generator. Pass
        ``None`` to seed from system entropy.

    Returns
    -------
    pandas.DataFrame
        The ``amostras`` selected rows, in their original order.

    Raises
    ------
    ValueError
        If ``amostras`` is smaller than 1 or larger than the dataset.
    """
    total = len(dataset)
    if not 1 <= amostras <= total:
        raise ValueError(
            f"amostras must be between 1 and {total} (dataset size), got {amostras}"
        )

    intervalo = total // amostras
    random.seed(seed)
    # randrange excludes the upper bound, so the start always falls inside the
    # first interval (randint would also allow inicio == intervalo).
    inicio = random.randrange(intervalo)
    # Build exactly `amostras` positions. The largest one is
    # inicio + intervalo * (amostras - 1) <= intervalo * amostras - 1 < total,
    # so it never runs past the end and never yields extra rows.
    indices = inicio + intervalo * np.arange(amostras)
    return dataset.iloc[indices]

In [15]:
# Apply the function to draw a systematic sample of 100 rows
df_amostragem_sistematica = amostragem_sistematica(df, 100)
df_amostragem_sistematica.shape

(100, 15)

In [16]:
# Preview the first three rows of the systematic sample
df_amostragem_sistematica.head(3)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K


### 1.3 - Amostragem por Grupos

In [19]:
# Cluster (group) sampling helper:
# splits the dataset into consecutive groups of equal size and randomly selects one group
def amostragem_agrupamento(dataset, numero_grupos, seed = 2):
    """Return one randomly chosen group of consecutive rows from ``dataset``.

    Rows are labelled, in order, with a group id so that every group has
    ``ceil(len(dataset) / numero_grupos)`` rows (the last group may be
    smaller). One of the groups that actually exist is then picked at random.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to sample from. It is not modified.
    numero_grupos : int
        Maximum number of groups to split the rows into; must be >= 1.
    seed : int or None, default 2
        Value passed to ``random.seed`` before picking the group. Note that
        this reseeds Python's global ``random`` generator. Pass ``None`` to
        seed from system entropy.

    Returns
    -------
    pandas.DataFrame
        The rows of the selected group, with an extra ``'grupo'`` column
        holding the group id.

    Raises
    ------
    ValueError
        If ``numero_grupos`` is smaller than 1.
    """
    if numero_grupos < 1:
        raise ValueError(f"numero_grupos must be >= 1, got {numero_grupos}")

    total = len(dataset)
    # Ceiling division; max(1, ...) avoids a zero group size on an empty dataset.
    tamanho_grupo = max(1, -(-total // numero_grupos))
    # Vectorized group labels: rows 0..size-1 -> 0, the next `size` rows -> 1, ...
    ids_grupo = np.arange(total) // tamanho_grupo

    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain a 'grupo' column.
    com_grupo = dataset.assign(grupo = ids_grupo)
    if total == 0:
        return com_grupo

    # Choose only among group ids that exist; randrange excludes the upper bound.
    grupos_existentes = int(ids_grupo[-1]) + 1
    random.seed(seed)
    grupo_selecionado = random.randrange(grupos_existentes)
    return com_grupo[com_grupo['grupo'] == grupo_selecionado]

In [21]:
# Apply the cluster-sampling function with 326 groups,
# which yields a 100-row group for this dataset
df_amostra_agrupamento = amostragem_agrupamento(df, 326)
df_amostra_agrupamento.shape

(100, 16)

In [22]:
# Preview the first five rows of the selected group
df_amostra_agrupamento.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
2800,47,Private,168283,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K,28
2801,17,Private,295488,11th,7,Never-married,Other-service,Own-child,Black,Female,0,0,25,United-States,<=50K,28
2802,35,Private,190895,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,<=50K,28
2803,33,Private,164190,Masters,14,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States,<=50K,28
2804,25,Private,216010,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,<=50K,28


### 1.4 - Amostragem Estratificada

In [23]:
# importar biblioteca que faz a extração estratificada
from sklearn.model_selection import StratifiedShuffleSplit

In [25]:
# Number of people per income class: at most 50K per year vs. more than 50K per year
# Roughly three quarters of the dataset (about 76%) falls in the <=50K class
df['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64