## 표본 추출의 개요
`확률적 표본 추출`의 종류
 - 단순 임의 추출 (Random Sampling): 별도의 규칙이 존재하지 않는 보통의 임의 추출
 - 층화 표본 추출 (Stratified Sampling): 모집단을 층(계층)으로 나눈 뒤 층별로 지정한 비율만큼의 데이터를 임의 추출 (코호트 분석에 활용됨)
 - 계통 추출 (Systematic Sampling): 첫 표본을 무작위로 추출하고, 표집 간격 k 만큼 떨어진 곳의 데이터를 추출
 - 군집 추출 (Cluster Sampling): 소수의 군집으로 분할하고 일정 수의 소집단을 임의 표본 추출

1. Pandas: sample()
 - 단순임의 추출을 시행하는 메서드
 - n: 표본의 개수, frac: 비율, random_state: 표본 추출 결과를 고정
 - groupby() 메서드를 앞에 추가하면 층화 표본 추출 가능<br>
<br>
2. Sklearn: train_test_split()
 - 입력 데이터프레임이나 배열을 2개의 Train, Test Set로 나누는 함수
 - train_size 또는 test_size에 개수 또는 비율을 입력하여 표본 개수 조절
 - random_state는 표본 추출 결과를 고정

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Pandas 활용

In [2]:
# Load bike.csv into a DataFrame. The path is a hard-coded absolute Windows
# location; change it to wherever the file lives on your machine.
df = pd.read_csv("C:/Users/Python/Data/bike.csv")

In [3]:
# Preview the first rows of the loaded data (head() shows 5 rows by default).
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# Simple random sample of rows. No random_state is given, so a different
# pair of rows can come back on every run.
df.sample(2) # the positional argument is n, i.e. n = 2

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
3439,2011-08-13 05:00:00,3,0,0,1,26.24,30.305,73,7.0015,3,15,18
7917,2012-06-10 06:00:00,2,0,0,1,25.42,29.545,73,0.0,3,27,30


In [7]:
# Same simple random sampling, with the sample size passed by keyword
# (3 rows). Still unseeded, so the result is not reproducible.
df.sample(n = 3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4119,2011-10-03 16:00:00,4,0,1,2,16.4,20.455,76,7.0015,16,202,218
684,2011-02-11 20:00:00,1,0,1,1,9.02,11.365,55,8.9981,2,61,63
3525,2011-08-16 19:00:00,3,0,1,1,30.34,33.335,48,12.998,56,320,376


In [14]:
# random_state fixes the seed, so these 2 rows are the same on every run.
df.sample(n = 2, random_state = 34)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4219,2011-10-07 20:00:00,4,0,1,1,22.14,25.76,49,0.0,30,167,197
1409,2011-04-04 14:00:00,2,0,1,2,30.34,32.575,27,32.9975,47,76,123


In [15]:
# List the distinct values present in the season column.
df['season'].unique()

array([1, 2, 3, 4], dtype=int64)

In [17]:
# Count the distinct season values by taking the length of the unique() array.
print(len(df['season'].unique()))

4


In [21]:
# Selecting a single column with [] yields a pandas Series, not a DataFrame.
type(df['season'])

pandas.core.series.Series

In [23]:
# nunique() returns the distinct-value count directly; here it gives the same
# result as len(df['season'].unique()).
print(df['season'].nunique())

4


In [25]:
# frac samples a proportion of the rows instead of a fixed count (0.5% here).
# In the recorded output, the first two rows match the n = 2 draw made with
# the same random_state.
df.sample(frac = 0.005, random_state = 34)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4219,2011-10-07 20:00:00,4,0,1,1,22.14,25.76,49,0.0,30,167,197
1409,2011-04-04 14:00:00,2,0,1,2,30.34,32.575,27,32.9975,47,76,123
6289,2012-02-18 07:00:00,1,0,0,1,9.84,14.395,70,0.0,8,33,41
7506,2012-05-12 03:00:00,2,0,0,1,19.68,23.485,59,0.0,14,20,34
7509,2012-05-12 06:00:00,2,0,0,1,17.22,21.21,67,6.0032,10,23,33
2717,2011-07-02 03:00:00,3,0,0,1,26.24,31.06,53,0.0,5,21,26
4094,2011-10-02 15:00:00,4,0,0,3,14.76,16.665,81,16.9979,29,144,173
1526,2011-04-09 11:00:00,2,0,0,2,14.76,18.18,81,0.0,51,91,142
9325,2012-09-11 22:00:00,3,0,1,1,22.96,26.515,64,7.0015,27,189,216
5508,2012-01-04 15:00:00,1,0,1,2,7.38,7.575,37,22.0028,9,81,90


In [26]:
df.sample(frac = 0.005, random_state = 34).shape # (54, 12): (rows, columns)

(54, 12)

In [27]:
len(df.sample(frac = 0.005, random_state = 34)) # 54: len() gives only the number of sampled rows

54

In [28]:
# Stratified sampling: group by season first, then draw 0.5% of the rows
# within each season group (seeded for reproducibility).
df.groupby('season').sample(frac = 0.005, random_state = 34)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
5779,2012-01-15 23:00:00,1,0,0,1,6.56,9.85,43,6.0032,3,26,29
6133,2012-02-11 19:00:00,1,0,0,2,8.2,7.575,40,36.9974,2,85,87
1231,2011-03-16 03:00:00,1,0,1,2,11.48,13.635,100,15.0013,1,2,3
5748,2012-01-14 16:00:00,1,0,0,1,9.84,11.365,38,15.0013,28,178,206
1072,2011-03-09 05:00:00,1,0,1,2,9.84,12.12,81,8.9981,1,7,8
6328,2012-02-19 22:00:00,1,0,0,3,9.84,12.88,75,6.0032,8,47,55
86,2011-01-04 18:00:00,1,0,1,1,10.66,12.88,48,12.998,3,179,182
522,2011-02-04 22:00:00,1,0,1,2,9.84,12.12,65,11.0014,1,45,46
5977,2012-02-05 06:00:00,1,0,0,2,10.66,12.12,70,16.9979,0,4,4
5570,2012-01-07 05:00:00,1,0,0,1,10.66,13.635,75,7.0015,2,7,9


In [31]:
len(df.sample(frac = 0.0123)) # number of sampled rows when drawing 1.23% of the data

134

In [32]:
# sample() also works on a single column (Series); this counts the values
# drawn when sampling 5% of the season column.
len(df['season'].sample(frac = 0.05))

544

### sklearn.model_selection의 train_test_split() 활용

In [29]:
# Split df into a train partition holding 70% of the rows and a test
# partition holding the rest; random_state fixes the split.
df_train, df_test = train_test_split(df, train_size = 0.7, random_state = 123)
# Show the first 2 rows of the train partition.
df_train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4046,2011-09-19 15:00:00,3,0,1,2,24.6,30.305,60,15.0013,44,143,187
9262,2012-09-09 07:00:00,3,0,0,1,22.14,25.76,73,11.0014,20,50,70


In [30]:
# Report the row count of each partition: train first, then test.
for subset in (df_train, df_test):
    print(len(subset))

7620
3266


In [34]:
# Re-split with 80% of the rows in train. This rebinds df_train and df_test,
# and no random_state is passed, so the partition is not fixed between runs.
df_train, df_test = train_test_split(df, train_size = 0.8)
# Largest temp value in the train partition (depends on the split drawn).
df_train['temp'].max()

39.36