## Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Import Dataset

In [2]:
# Load the A/B-test records from the CSV file in the working directory.
df = pd.read_csv("ab_data.csv")

## Quick Look

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(294478, 5)

In [4]:
# Preview the first five rows to see which columns are available.
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [5]:
# Row/column count again; it matches the earlier check because nothing
# has modified the frame in between.
df.shape

(294478, 5)

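### Sample Size using Slovin's Formula

Slovin's formula estimates the required sample size as $n = \dfrac{N}{1 + N e^2}$, where $N$ is the population size and $e$ is the margin of error.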

In [6]:
# Population size: taken from the loaded frame instead of being hard-coded,
# so it stays correct if the CSV changes (294478 rows for this dataset).
N = len(df)
e = 0.05  # margin of error (5%)

In [7]:
# Slovin's formula: n = N / (1 + N * e^2)
slovin = N / (1 + N * (e ** 2))

In [8]:
# Show the computed sample size (about 399.46; the next section rounds it up to 400).
slovin

399.45740272248173

### Sample 400 observations

In [42]:
# Draw a simple random sample whose size is Slovin's estimate rounded up to
# the next whole observation (ceil(399.46) == 400), instead of repeating the
# number by hand. No random_state is passed, so each run draws different rows.
df_sampel1 = df.sample(n=int(np.ceil(slovin)))

In [43]:
# How the unstratified sample splits between the two experiment groups.
df_sampel1["group"].value_counts()

control      212
treatment    188
Name: group, dtype: int64

In [44]:
# Peek at the sampled user ids; the index keeps the original row labels.
df_sampel1["user_id"].head()

281580    727221
17913     917330
269172    889987
179000    853657
10168     712474
Name: user_id, dtype: int64

### Reproducible Sampling with a Fixed Random Seed

In [71]:
# A fixed random_state makes the draw repeatable: the same 400 rows every run.
df.sample(n=400, random_state=123).head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
260162,847857,2017-01-18 15:15:02.551941,control,old_page,0
37456,920163,2017-01-10 20:42:09.498343,control,old_page,0
80272,806917,2017-01-06 12:05:56.049996,control,old_page,0
278131,662772,2017-01-08 04:36:26.397852,control,old_page,0
270691,911648,2017-01-08 17:21:28.253382,control,old_page,0


In [60]:
# Group split of the seeded 400-row sample.
df.sample(n=400, random_state=123)["group"].value_counts()

treatment    215
control      185
Name: group, dtype: int64

### Sample 30% of the Dataset

In [61]:
# Draw a random 30% of the rows (frac is a proportion, not a row count).
# No random_state is given, so the selected rows differ between runs.
df.sample(frac=0.3)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
204714,855012,2017-01-21 20:24:32.253274,control,old_page,0
164473,759576,2017-01-09 01:00:17.392863,control,old_page,0
229484,817095,2017-01-18 01:18:46.353162,treatment,new_page,0
276631,882510,2017-01-20 14:34:26.545266,treatment,new_page,0
96578,913199,2017-01-05 09:54:22.353330,treatment,new_page,0
...,...,...,...,...,...
121470,710989,2017-01-12 07:10:09.272527,control,old_page,0
53463,945275,2017-01-08 18:48:07.501897,control,old_page,0
33894,904962,2017-01-10 04:59:51.626687,control,old_page,0
61656,936168,2017-01-15 02:17:34.225701,control,old_page,0


## Stratified Random Sampling (Sampling Within Each Group)

In [62]:
# Non-null row counts per experiment group, with `group` kept as a column.
df.groupby("group", as_index=False).count()

Unnamed: 0,group,user_id,timestamp,landing_page,converted
0,control,147202,147202,147202,147202
1,treatment,147276,147276,147276,147276


In [67]:
# Stratified sample: 200 rows from each experiment group, seeded for
# reproducibility.
#
# Each group is sampled explicitly and the pieces are stacked with pd.concat
# instead of going through groupby(...).apply(...). apply() on a frame that
# still contains the grouping column is deprecated in recent pandas and will
# drop that column, which would break the later `df_sample2['group']` lookup.
# The result keeps the same shape as before: a (group, original row label)
# MultiIndex, with `group` also retained as a regular column.
df_sample2 = pd.concat(
    {
        group_name: group_rows.sample(n=200, random_state=123)
        for group_name, group_rows in df.groupby('group')
    },
    names=['group'],
)

In [68]:
# Display the stratified sample (indexed by group, then original row label).
df_sample2

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,group,landing_page,converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,95574,704344,2017-01-08 06:33:15.620318,control,old_page,0
control,282637,903218,2017-01-07 16:40:31.904242,control,old_page,0
control,201262,724634,2017-01-05 18:38:31.257679,control,old_page,0
control,93315,750623,2017-01-21 19:20:32.814948,control,old_page,0
control,16163,651056,2017-01-04 03:17:39.846424,control,old_page,0
...,...,...,...,...,...,...
treatment,16034,665227,2017-01-18 06:10:37.832101,treatment,new_page,1
treatment,241972,818984,2017-01-23 01:45:24.506789,treatment,new_page,0
treatment,135298,843757,2017-01-04 03:10:19.433517,treatment,new_page,0
treatment,200501,659763,2017-01-24 13:21:56.026713,treatment,new_page,0


In [69]:
# Confirm the stratified sample is balanced: 200 rows per group.
df_sample2["group"].value_counts()

treatment    200
control      200
Name: group, dtype: int64