In [11]:
import pandas as pd

# Toy gradebook used by the sampling examples: one row per (class, student)
# pair, built from row records instead of column lists.
df = pd.DataFrame(
    [
        ('English', 'Song', 100),
        ('English', 'Trump', 10),
        ('Math', 'Song', 20),
        ('Math', 'Trump', 90),
    ],
    columns=['class', 'student', 'score'],
)

df

Unnamed: 0,class,student,score
0,English,Song,100
1,English,Trump,10
2,Math,Song,20
3,Math,Trump,90


<h1>Basics</h1>
<h4>1. sample() randomly selects a fraction of the rows from the DataFrame</h4>
<h4>2. If the replace parameter (default = False) is True, each selected item is put back into the pool, so the same item can be selected multiple times</h4>
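<h4>3. replace=True is rarely needed; use it only for sampling with replacement (e.g. bootstrapping) or when drawing more rows than the DataFrame contains</h4>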
<h4>4. If frac=1.0, sample() returns every row in random order, i.e. it shuffles the DataFrame</h4>
<h4>5. sample() can result in a new DataFrame with a non-sequential index, preserving the index from the original DataFrame.
    To reset this index to a default integer-based index (0, 1, 2, ...), the reset_index(drop=True) method is used
<ol>
    <li>drop=True makes sure the old index is dropped, otherwise it becomes a column in the new dataframe</li>
</ol>
</h4>
<h4>If the index is never used later, there is no need to reset it</h4>

In [3]:
# Draw 30% of the rows at random, then drop the sampled rows' original index
# labels so the result is renumbered from 0.
(
    df
    .sample(frac=0.3)
    .reset_index(drop=True)
)

Unnamed: 0,class,student,score
0,English,Trump,10


In [4]:
# frac=1.0 keeps every row but in random order (a shuffle); the old index
# labels are then dropped and replaced by 0, 1, 2, ...
(
    df
    .sample(frac=1.0)
    .reset_index(drop=True)
)

Unnamed: 0,class,student,score
0,English,Song,100
1,English,Trump,10
2,Math,Trump,90
3,Math,Song,20


<h1>The default behavior is to keep the original index</h1>

In [15]:
# Shuffle without reset_index(): each row keeps the index label it had in df,
# so the displayed index is out of order.
df.sample(
    frac=1.0,
)

Unnamed: 0,class,student,score
3,Math,Trump,90
0,English,Song,100
1,English,Trump,10
2,Math,Song,20


<h1>Sampling can be made repeatable with the "random_state" parameter</h1>

In [16]:
# Rebuild the gradebook with one missing score (None). The column is shown
# as floats because the missing entry is stored as NaN.
df = pd.DataFrame(dict([
    ('class', ['English', 'English', 'Math', 'Math']),
    ('student', ['Song', 'Trump', 'Song', 'Trump']),
    ('score', [None, 10, 20, 30]),
]))
df

Unnamed: 0,class,student,score
0,English,Song,
1,English,Trump,10.0
2,Math,Song,20.0
3,Math,Trump,30.0


In [17]:
# A fixed random_state seeds the draw, so re-running this cell picks the
# same row every time.
df.sample(
    frac=0.25,
    random_state=42,
)

Unnamed: 0,class,student,score
1,English,Trump,10.0


<h1>sample() creates a copy (not a view) of the original</h1>
<h4>Modifying the sample does not change the original DataFrame</h4>

In [18]:
# Fresh gradebook for the copy-vs-original demo: every known score is 0 and
# one score is missing, so any later overwrite is easy to spot.
df = pd.DataFrame(
    [
        ('English', 'Song', None),
        ('English', 'Trump', 0),
        ('Math', 'Song', 0),
        ('Math', 'Trump', 0),
    ],
    columns=['class', 'student', 'score'],
)
df

Unnamed: 0,class,student,score
0,English,Song,
1,English,Trump,0.0
2,Math,Song,0.0
3,Math,Trump,0.0


In [19]:
# Let pandas pick its best-fitting dtypes for each column; the scores are
# displayed as integers afterwards.
df = df.convert_dtypes()

# A full-size sample is a shuffled copy of df. Overwrite Song's scores in
# that copy only; df itself is displayed again in the next cell to compare.
df_new = df.sample(frac=1.0)
df_new.loc[df_new['student'].eq('Song'), 'score'] = -100

df_new

Unnamed: 0,class,student,score
3,Math,Trump,0
1,English,Trump,0
2,Math,Song,-100
0,English,Song,-100


In [20]:
# Display the frame that was sampled from: Song's scores here were not
# overwritten with -100, so editing df_new left df untouched.
df

Unnamed: 0,class,student,score
0,English,Song,
1,English,Trump,0.0
2,Math,Song,0.0
3,Math,Trump,0.0
