## Sampling

You can draw a random sample of rows from a dataset. This is very useful when training machine learning models.
We will use the dataset of movie reviewers available [here](http://grouplens.org/datasets/movielens/100k/).

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# read the movie-reviewer table (pipe-delimited, no header row) into a DataFrame
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(
    './dataset/u.user',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
    # zip codes are labels, not quantities: read them as text so a value
    # such as '08610' keeps its leading zero instead of turning into 8610
    dtype={'zip_code': str},
)
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [8]:
# draw three random rows, each at most once (DataFrame.sample, new in pandas 0.16.1)
users.sample(3)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
175,26,F,scientist,21911
415,39,M,educator,85711
208,43,M,engineer,1720


In [12]:
# fixing 'random_state' makes the draw repeatable: the same seed returns the same rows
users.sample(3, random_state=42)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
97,43,M,artist,98006
266,62,F,administrator,78756
811,40,F,educator,73013


In [13]:
# randomly pick three quarters of the rows (no row is drawn twice) as the training set
train = users.sample(random_state=99, frac=0.75)

In [15]:
# every row that was not drawn into 'train' (the other 25%) goes into the test set
test = users.drop(train.index)

In [16]:
# preview the first five rows of the training sample
train.head(5)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
795,30,M,programmer,8610
704,51,F,librarian,91105
615,38,M,educator,27705
520,62,M,healthcare,12603
137,50,M,educator,84408


In [17]:
# preview the first five rows of the held-out test rows
test.head(5)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
12,28,F,other,6405
14,45,M,scientist,55106
21,26,M,writer,30068


In [18]:
# flag each zip code that already appeared in an earlier row (True = repeat of a previous value)
users['zip_code'].duplicated().tail()

user_id
939    False
940     True
941    False
942    False
943    False
Name: zip_code, dtype: bool

In [19]:
# number of repeated zip codes: summing booleans counts the True values
users['zip_code'].duplicated().sum()

148

In [20]:
# flag rows whose values in every column match those of an earlier row
users.duplicated(keep='first').tail()

user_id
939    False
940    False
941    False
942    False
943    False
dtype: bool

### Logic for duplicated:

+ keep='first' (default): Mark duplicates as True except for the first occurrence.
+ keep='last': Mark duplicates as True except for the last occurrence.
+ keep=False: Mark all duplicates as True.

In [21]:
# show the repeated rows; the first occurrence of each is not flagged, so it is left out
users.loc[users.duplicated(keep='first')]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [22]:
# show the repeated rows; the last occurrence of each is not flagged, so it is left out
users.loc[users.duplicated(keep='last')]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630


In [23]:
# show every row that has at least one identical counterpart (all occurrences are flagged)
users.loc[users.duplicated(keep=False)]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


In [24]:
# compare rows on 'age' and 'zip_code' only, then count the repeats
users.duplicated(['age', 'zip_code']).sum()

16

In [25]:
# drop the duplicate rows, keeping the first occurrence of each
# (inplace=False by default, so a new DataFrame is returned and 'users' is not modified)
users.drop_duplicates(keep='first').shape

(936, 4)

In [26]:
# drop the duplicate rows, keeping the last occurrence of each instead
users.drop_duplicates(keep='last').shape

(936, 4)

In [27]:
# drop every row that has a duplicate, keeping none of the occurrences
users.drop_duplicates(keep=False).shape

(929, 4)