# EXERCÍCIO Amostras Vs População
---

### O objetivo deste exercício é utilizar outra base de dados para testar as amostragens e comparar os resultados
---

* Faça o download e carregue a base de dados `credit_data.csv`, que possui informações sobre empréstimos (se o cliente pagará ou não pagará o empréstimo)


* Teste cada uma das técnicas de amostragem, selecionando 1000 registros


* Para a amostragem estratificada, utilize o atributo `c#default` para separar as categorias


* No final, faça o comparativo da média utilizando os atributos `age`, `income` e `loan`

In [1]:
import pandas as pd
from Models.Sample import Sample

# Sampler object whose methods produce every sample drawn in this notebook.
sample = Sample()

### Population

In [2]:
# Load the full population; the first CSV column is used as the index (index_col=0).
df_pop = pd.read_csv('CSVs/credit_data.csv', index_col=0)
# Cell result: (rows, columns) of the population.
df_pop.shape

(2000, 4)

In [3]:
# Preview the first rows of the population.
df_pop.head()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,66155.925095,59.017015,8106.532131,0
2,34415.153966,48.117153,6564.745018,0
3,57317.170063,63.108049,8020.953296,0
4,42709.534201,45.751972,6103.64226,0
5,66952.688845,18.584336,8770.099235,1


In [4]:
# Preview the last rows of the population.
df_pop.tail()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996,59221.044874,48.518179,1926.729397,0
1997,69516.127573,23.162104,3503.176156,0
1998,44311.449262,28.017167,5522.786693,1
1999,43756.056605,63.971796,1622.722598,0
2000,69436.579552,56.152617,7378.833599,0


---
### Random Simple Sample

In [5]:
# Simple random sample of 1000 rows from the population.
# The third argument (42) is presumably the random seed — confirm against Sample.simple_sample.
df_sp_sample = sample.simple_sample(df_pop, 1000, 42)
df_sp_sample.shape

(1000, 4)

In [6]:
# NOTE(review): this reads the frame stored on `sample`, not the local `df_sp_sample`;
# presumably both refer to the same data — verify in Models.Sample.
sample.df_sp_sample.head()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1861,40240.727556,26.959005,7498.630447,1
354,46706.458861,18.830336,7084.263509,1
1334,51211.654039,45.628568,4093.360006,0
906,67675.804771,37.740396,4396.076877,0
1290,36965.742479,53.762359,6333.391588,0


In [7]:
# Last rows of the simple random sample (read from the attribute stored on `sample`).
sample.df_sp_sample.tail()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26,47451.630123,27.031741,5361.282716,0
1859,60672.145586,43.055909,6279.687007,0
715,50831.427535,24.351603,9572.586884,1
1010,35620.418626,40.824674,3611.295903,0
194,42522.922407,53.868651,6790.850263,0


In [8]:
# Systematic sample of the population with a requested size of 1000.
# The third argument (42) is presumably a seed — confirm against Sample.systematic_sample.
# NOTE(review): the recorded output shape is (999, 4), one row short of the requested 1000;
# check the start/step logic in Sample.systematic_sample.
df_sys_sample = sample.systematic_sample(df_pop, 1000, 42)
df_sys_sample.shape

(999, 4)

In [9]:
# First rows of the systematic sample (read from the attribute stored on `sample`).
sample.df_sys_sample.head()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,57317.170063,63.108049,8020.953296,0
5,66952.688845,18.584336,8770.099235,1
7,48430.359613,26.809132,5722.581981,0
9,40654.892537,55.496853,4755.82528,0
11,64131.415372,25.679575,4351.028971,0


In [10]:
# Last rows of the systematic sample (read from the attribute stored on `sample`).
sample.df_sys_sample.tail()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1991,34237.575419,34.101654,2658.090632,0
1993,30803.806165,23.250084,623.024153,0
1995,24254.700791,37.751622,2225.284643,0
1997,69516.127573,23.162104,3503.176156,0
1999,43756.056605,63.971796,1622.722598,0


---
### Group Sample

In [11]:
# Goal: a sample of 1000 individuals out of a population of 2000.
# The second argument (2) is presumably the number of groups: 2000 / 2 = 1000 rows per group,
# so one selected group yields the 1000-row sample — confirm against Sample.group_sample.
# The third argument (42) is presumably the random seed — TODO confirm.
# The returned frame carries an extra `group` column (recorded shape is (1000, 5)).
df_grp_sample = sample.group_sample(df_pop, 2, 42)
df_grp_sample.shape

(1000, 5)

In [12]:
# First rows of the group sample (read from the attribute stored on `sample`).
sample.df_grp_sample.head()

Unnamed: 0_level_0,income,age,loan,c#default,group
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,50793.35572,41.601886,421.64038,0,1
1002,62422.203789,32.145221,2841.633423,0,1
1003,63166.994955,56.51004,4058.789534,0,1
1004,23717.567853,49.32577,1530.090242,0,1
1005,66797.664673,21.380429,11921.199537,1,1


In [13]:
# Last rows of the group sample (read from the attribute stored on `sample`).
sample.df_grp_sample.tail()

Unnamed: 0_level_0,income,age,loan,c#default,group
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1996,59221.044874,48.518179,1926.729397,0,1
1997,69516.127573,23.162104,3503.176156,0,1
1998,44311.449262,28.017167,5522.786693,1,1
1999,43756.056605,63.971796,1622.722598,0,1
2000,69436.579552,56.152617,7378.833599,0,1


In [14]:
# Count the rows per group label in the group sample.
sample.df_grp_sample['group'].value_counts()

1    1000
Name: group, dtype: int64

---
### Stratified Sample

In [15]:
# Stratified sample of 1000 rows, using the `c#default` column to define the strata.
# The last argument (42) is presumably the random seed — confirm against Sample.stratify_sample.
df_stf_sample = sample.stratify_sample(df_pop, 'c#default', 1000, 42)
df_stf_sample.shape

(1000, 4)

In [16]:
# First rows of the stratified sample (read from the attribute stored on `sample`).
sample.df_stf_sample.head()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
705,57575.009791,33.800135,9857.22995,1
479,38665.033928,55.125922,6152.004833,0
903,48991.853679,18.621307,7453.264268,1
1206,50289.664748,24.074054,6127.381688,1
133,37049.386236,29.423019,6056.817214,1


In [17]:
# Last rows of the stratified sample (read from the attribute stored on `sample`).
sample.df_stf_sample.tail()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
747,45622.29071,41.632545,528.18125,0
881,67730.443696,26.303147,8881.583636,1
619,29668.32072,38.68374,2042.436463,0
1831,53812.226483,44.919152,3245.041667,0
1146,49032.662406,54.556072,1777.953131,0


In [18]:
# Expected rows per class in a 1000-row sample if the population's class proportions are preserved.
df_pop['c#default'].value_counts() / len(df_pop) * 1000

0    858.5
1    141.5
Name: c#default, dtype: float64

In [19]:
# Actual rows per class in the stratified sample.
df_stf_sample['c#default'].value_counts()

0    858
1    142
Name: c#default, dtype: int64

---
### Reservoir Sample

In [20]:
# Reservoir sample of 1000 rows. No seed argument is passed here, unlike the other sampling calls.
# NOTE(review): in the recorded outputs client 1 appears more than once in the sample and the
# reservoir means reported further down are far from the population means — worth checking
# Sample.reservoir_sample for a replacement/index bug.
df_res_sample = sample.reservoir_sample(df_pop, 1000)
df_res_sample.shape

(1000, 4)

In [21]:
# First rows of the reservoir sample (read from the attribute stored on `sample`).
sample.df_res_sample.head()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
547,53719.65111,42.890102,1670.737893,0
525,66505.775687,25.618241,6571.197021,0
1068,33274.050271,23.953435,2244.883109,0
14,27267.995458,61.576776,4759.787581,0
413,43509.757756,18.075336,7363.037639,1


In [22]:
# Last rows of the reservoir sample (read from the attribute stored on `sample`).
sample.df_res_sample.tail()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1834,28713.830519,54.700846,1936.813257,0
1579,29849.967144,39.928724,3678.899676,0
1,66155.925095,59.017015,8106.532131,0
1,66155.925095,59.017015,8106.532131,0
1166,67470.117023,52.232637,12715.294721,0


In [26]:
# NOTE(review): this repeats the earlier reservoir-sampling call and overwrites `df_res_sample`.
# The recorded rows differ between the two runs, so the result is not reproducible;
# consider dropping this rerun or making the sampler seedable.
df_res_sample = sample.reservoir_sample(df_pop, 1000)
df_res_sample.shape

(1000, 4)

In [27]:
# First rows of the re-drawn reservoir sample (read from the attribute stored on `sample`).
sample.df_res_sample.head()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1797,43052.968562,31.526854,488.937273,0
325,62040.889629,62.049801,7643.631046,0
1628,24877.684405,29.82362,1546.422886,0
126,36116.365092,22.535884,1494.984568,0
1051,69456.567771,48.053557,13190.365886,0


In [28]:
# Last rows of the re-drawn reservoir sample (read from the attribute stored on `sample`).
sample.df_res_sample.tail()

Unnamed: 0_level_0,income,age,loan,c#default
i#clientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1372,45435.267237,21.042198,2143.386972,0
1242,34051.528042,35.745652,6224.152886,0
1509,66274.072898,21.825604,11576.542237,1
1466,47704.380825,21.840361,2717.079485,0
1,66155.925095,59.017015,8106.532131,0


---
## Samples Comparison

### Age

In [45]:
# Mean age of the population and of each sample, one line per source.
# Every line is right-aligned to 40 columns so the values line up for visual comparison.
# A single loop replaces six copy-pasted print statements; the printed text is unchanged.
age_sources = [
    ('Population', df_pop),
    ('Random sample', df_sp_sample),
    ('Systematic sample', df_sys_sample),
    ('Group sample', df_grp_sample),
    ('Stratify sample', df_stf_sample),
    ('Reservoir sample', df_res_sample),
]
for label, frame in age_sources:
    print(f'{label} mean age = {round(frame["age"].mean(), 2)}'.rjust(40, " "))

             Population mean age = 40.81
          Random sample mean age = 40.55
      Systematic sample mean age = 40.89
           Group sample mean age = 40.57
        Stratify sample mean age = 40.25
       Reservoir sample mean age = 45.23


In [63]:
# Signed difference (population mean - sample mean) of `age` for each sampling technique.
# The population mean is computed once instead of once per sample.
pop_age_mean = df_pop['age'].mean()
sp_age_dif = pop_age_mean - df_sp_sample['age'].mean()
sys_age_dif = pop_age_mean - df_sys_sample['age'].mean()
grp_age_dif = pop_age_mean - df_grp_sample['age'].mean()
stf_age_dif = pop_age_mean - df_stf_sample['age'].mean()
res_age_dif = pop_age_mean - df_res_sample['age'].mean()

# Header line followed by one six-space-indented line per sample; labels are kept
# exactly as before so the printed report is unchanged.
print('Difference between the population age and:')
for label, dif in [
    ('Simple sample', sp_age_dif),
    ('Systematic sample', sys_age_dif),
    ('Group sample', grp_age_dif),
    ('stratify sample', stf_age_dif),
    ('Reservoir sample', res_age_dif),
]:
    print(f'      {label} {dif}')

Difference between the population age and:
      Simple sample 0.25680476639567473
      Systematic sample -0.08545411079079201
      Group sample 0.23439807910931876
      stratify sample 0.5590485436506114
      Reservoir sample -4.418495081946311


---
### Income

In [52]:
# Mean income of the population and of each sample, one line per source.
# Every line is right-aligned to 40 columns so the values line up for visual comparison.
# A single loop replaces six copy-pasted print statements; the printed text is unchanged.
income_sources = [
    ('Population', df_pop),
    ('Random sample', df_sp_sample),
    ('Systematic sample', df_sys_sample),
    ('Group sample', df_grp_sample),
    ('Stratify sample', df_stf_sample),
    ('Reservoir sample', df_res_sample),
]
for label, frame in income_sources:
    print(f'{label} mean income = {round(frame["income"].mean(), 2)}'.rjust(40, " "))

        Population mean income = 45331.6
    Random sample mean income = 45703.02
Systematic sample mean income = 45671.01
      Group sample mean income = 45822.4
  Stratify sample mean income = 45058.05
 Reservoir sample mean income = 50675.66


In [64]:
# Signed difference (population mean - sample mean) of `income` for each sampling technique.
# The population mean is computed once instead of once per sample.
pop_income_mean = df_pop['income'].mean()
sp_income_dif = pop_income_mean - df_sp_sample['income'].mean()
sys_income_dif = pop_income_mean - df_sys_sample['income'].mean()
grp_income_dif = pop_income_mean - df_grp_sample['income'].mean()
stf_income_dif = pop_income_mean - df_stf_sample['income'].mean()
res_income_dif = pop_income_mean - df_res_sample['income'].mean()

# Header line followed by one six-space-indented line per sample; labels are kept
# exactly as before so the printed report is unchanged.
print('Difference between the population income and:')
for label, dif in [
    ('Simple sample', sp_income_dif),
    ('Systematic sample', sys_income_dif),
    ('Group sample', grp_income_dif),
    ('stratify sample', stf_income_dif),
    ('Reservoir sample', res_income_dif),
]:
    print(f'      {label} {dif}')

Difference between the population income and:
      Simple sample -371.418907976491
      Systematic sample -339.4138216205174
      Group sample -490.79736439164117
      stratify sample 273.5465766338675
      Reservoir sample -5344.057714828428


---
### Loan

In [51]:
# Mean loan of the population and of each sample, one line per source.
# Every line is right-aligned to 40 columns so the values line up for visual comparison.
# A single loop replaces six copy-pasted print statements; the printed text is unchanged.
loan_sources = [
    ('Population', df_pop),
    ('Random sample', df_sp_sample),
    ('Systematic sample', df_sys_sample),
    ('Group sample', df_grp_sample),
    ('Stratify sample', df_stf_sample),
    ('Reservoir sample', df_res_sample),
]
for label, frame in loan_sources:
    print(f'{label} mean loan = {round(frame["loan"].mean(), 2)}'.rjust(40, " "))

          Population mean loan = 4444.37
       Random sample mean loan = 4498.39
   Systematic sample mean loan = 4503.18
        Group sample mean loan = 4494.61
     Stratify sample mean loan = 4426.27
    Reservoir sample mean loan = 5263.86


In [65]:
# Signed difference (population mean - sample mean) of `loan` for each sampling technique.
# The population mean is computed once instead of once per sample.
pop_loan_mean = df_pop['loan'].mean()
sp_loan_dif = pop_loan_mean - df_sp_sample['loan'].mean()
sys_loan_dif = pop_loan_mean - df_sys_sample['loan'].mean()
grp_loan_dif = pop_loan_mean - df_grp_sample['loan'].mean()
stf_loan_dif = pop_loan_mean - df_stf_sample['loan'].mean()
res_loan_dif = pop_loan_mean - df_res_sample['loan'].mean()

# Header line followed by one six-space-indented line per sample; labels are kept
# exactly as before so the printed report is unchanged.
print('Difference between the population loan and:')
for label, dif in [
    ('Simple sample', sp_loan_dif),
    ('Systematic sample', sys_loan_dif),
    ('Group sample', grp_loan_dif),
    ('stratify sample', stf_loan_dif),
    ('Reservoir sample', res_loan_dif),
]:
    print(f'      {label} {dif}')

Difference between the population loan and:
      Simple sample -54.01909530442845
      Systematic sample -58.81493423570464
      Group sample -50.239679829896886
      stratify sample 18.102045979438117
      Reservoir sample -819.4865086518721
