In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Dataset.csv')

In [3]:
print(df.shape)

(1460, 3)


In [4]:
df.describe()

Unnamed: 0,Id,LotArea,SalePrice
count,1460.0,1460.0,1460.0
mean,730.5,10516.828082,180921.19589
std,421.610009,9981.264932,79442.502883
min,1.0,1300.0,34900.0
25%,365.75,7553.5,129975.0
50%,730.5,9478.5,163000.0
75%,1095.25,11601.5,214000.0
max,1460.0,215245.0,755000.0


In [5]:
df.head()

Unnamed: 0,Id,LotArea,SalePrice
0,1,8450,208500
1,2,9600,181500
2,3,11250,223500
3,4,9550,140000
4,5,14260,250000


# simple random sampling

In [7]:
sample = df.sample(n=10).sort_values(by = 'Id')
print(sample)

        Id  LotArea  SalePrice
115    116     3230     176000
175    176    12615     243000
215    216    10011     134450
376    377     8846     148000
728    729    11475     110000
1043  1044    11839     262280
1270  1271    23595     260000
1335  1336     9650     167900
1359  1360    16737     315000
1429  1430    12546     182900


In [8]:
mean_sample = df['SalePrice'].mean()

In [9]:
print(mean_sample)

180921.19589041095


In [10]:
print(round(mean_sample,3))

180921.196


# systematic sampling

In [11]:
index = np.arange(0, len(df), step=5)

In [12]:
index

array([   0,    5,   10,   15,   20,   25,   30,   35,   40,   45,   50,
         55,   60,   65,   70,   75,   80,   85,   90,   95,  100,  105,
        110,  115,  120,  125,  130,  135,  140,  145,  150,  155,  160,
        165,  170,  175,  180,  185,  190,  195,  200,  205,  210,  215,
        220,  225,  230,  235,  240,  245,  250,  255,  260,  265,  270,
        275,  280,  285,  290,  295,  300,  305,  310,  315,  320,  325,
        330,  335,  340,  345,  350,  355,  360,  365,  370,  375,  380,
        385,  390,  395,  400,  405,  410,  415,  420,  425,  430,  435,
        440,  445,  450,  455,  460,  465,  470,  475,  480,  485,  490,
        495,  500,  505,  510,  515,  520,  525,  530,  535,  540,  545,
        550,  555,  560,  565,  570,  575,  580,  585,  590,  595,  600,
        605,  610,  615,  620,  625,  630,  635,  640,  645,  650,  655,
        660,  665,  670,  675,  680,  685,  690,  695,  700,  705,  710,
        715,  720,  725,  730,  735,  740,  745,  7

In [13]:
systematic_sample = df.iloc[index]
print(systematic_sample.head())

    Id  LotArea  SalePrice
0    1     8450     208500
5    6    14115     143000
10  11    11200     129500
15  16     6120     132000
20  21    14215     325300


In [14]:
print(round(systematic_sample['SalePrice'].mean(),3))

178146.13


# cluster sampling

In [15]:
# number of clusters
n = 5

np.repeat(array,repeat period)

In [16]:
df['cluster_id'] = np.repeat([range(1,n+1)], len(df)/n)

In [20]:
df.head(293)

Unnamed: 0,Id,LotArea,SalePrice,cluster_id
0,1,8450,208500,1
1,2,9600,181500,1
2,3,11250,223500,1
3,4,9550,140000,1
4,5,14260,250000,1
...,...,...,...,...
288,289,9819,122000,1
289,290,8730,153575,1
290,291,15611,233230,1
291,292,5687,135900,1


In [32]:
l = []
for i in range(len(df)):
    if df['cluster_id'].iloc[i]%2 == 0:
        l.append(i)
cluster_sample = df.iloc[l]
print(cluster_sample['cluster_id'].value_counts())

2    292
4    292
Name: cluster_id, dtype: int64


# stratified random sample

In [33]:
df['strata'] = np.repeat([1,2], len(df)/2)
p = []
for i in range(len(df)):
    p.append(i)
str_sample = df.iloc[p]
print(str_sample.head())
print(str_sample.shape)

   Id  LotArea  SalePrice  cluster_id  strata
0   1     8450     208500           1       1
1   2     9600     181500           1       1
2   3    11250     223500           1       1
3   4     9550     140000           1       1
4   5    14260     250000           1       1
(1460, 5)


In [38]:
from sklearn.model_selection import StratifiedShuffleSplit as sss
# set the split criteria

split = sss(n_splits = 3, test_size = 8)


# perform data frame split

for x,y in split.split(df, df['strata']):
    str_sample = df.iloc[y].sort_values(by = 'SalePrice')
print(str_sample)    

        Id  LotArea  SalePrice  cluster_id  strata
676    677     9600      87000           3       1
562    563    13907     108000           2       1
953    954    11075     172000           4       2
1191  1192     2645     174000           5       2
1309  1310     7153     179200           5       2
444    445     8750     210000           2       1
928    929    11838     236500           4       2
245    246    10400     241500           1       1
