In [2]:
import pandas as pd
import numpy as np
import random

# Toy gradebook used throughout the notebook: one (class, student, score)
# record per row, i.e. two students with one score each in two classes.
rows = [
    ('English', 'Song', 100),
    ('English', 'Trump', 10),
    ('Math', 'Song', 20),
    ('Math', 'Trump', 90),
]

# Transpose the records into the column-oriented dict the DataFrame is built from.
classes, students, scores = (list(column) for column in zip(*rows))
D = {'class': classes, 'student': students, 'score': scores}
df = pd.DataFrame(D)

<h1>np.array_split can be used to split a dataframe</h1>

<h4>1. np.array_split() can split a dataframe into smaller dataframes of roughly equal size</h4>
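<h4>2. If necessary, copy() should be used to create an independent copy of each sub-dataframe; the default behavior of "copy()" is deep=True. This is because loc[] may return either a view or a copy of the original data, so modifying the result without an explicit copy() can cause confusion</h4>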
<h4>3. If the length of the dataframe is smaller than no_of_chunks, 
    <b>"empty"</b> chunks will be created</h4>

In [3]:
no_of_chunks = 3

# Split the index labels (not the frame itself) into `no_of_chunks` pieces of
# roughly equal length; each piece then selects its rows from df.
indices = np.array_split(df.index, no_of_chunks)

# .copy() gives every chunk its own data, so editing a chunk never touches df.
# A piece with no labels is kept as an empty DataFrame (columns preserved)
# instead of None, so every element of dfs has the same type and supports the
# same DataFrame operations.
dfs = [df.loc[chunk_idx].copy() for chunk_idx in indices]

for df_split in dfs:
    print(df_split); print()
    

     class student  score
0  English    Song    100
1  English   Trump     10

  class student  score
2  Math    Song     20

  class student  score
3  Math   Trump     90



<h1>Empty chunks are possible if there are not enough rows in the original dataframe</h1>

In [10]:
# Asking for 6 chunks from a 4-row dataframe: np.array_split still returns
# 6 pieces, so 2 of them carry no index labels and become empty dataframes.
no_of_chunks = 6

indices = np.array_split(df.index, no_of_chunks)

# Materialise every piece (empty ones included) as an independent copy.
dfs = [df.loc[labels].copy() for labels in indices]

# One blank line after each chunk keeps the printed frames visually separate.
for df_split in dfs:
    print(df_split, end='\n\n')

     class student  score
0  English    Song    100

     class student  score
1  English   Trump     10

  class student  score
2  Math    Song     20

  class student  score
3  Math   Trump     90

Empty DataFrame
Columns: [class, student, score]
Index: []

Empty DataFrame
Columns: [class, student, score]
Index: []



<h1>We can further shuffle the chunks using np.random.shuffle()</h1>

In [22]:
# Shuffle a shallow copy: a new list holding the same DataFrame objects, so
# the order of the original `dfs` list is left untouched.
dfs_copy = list(dfs)
np.random.shuffle(dfs_copy)  # in-place, unseeded: order differs on every run

for shuffled_chunk in dfs_copy:
    print(shuffled_chunk, end='\n\n')

Empty DataFrame
Columns: [class, student, score]
Index: []

  class student  score
3  Math   Trump     90

Empty DataFrame
Columns: [class, student, score]
Index: []

     class student  score
1  English   Trump     10

  class student  score
2  Math    Song     20

     class student  score
0  English    Song    100



<h1>We can also shuffle the chunks using python's native random.shuffle()</h1>

In [21]:
# Shuffle a shallow copy: a new list holding the same DataFrame objects, so
# the order of the original `dfs` list is left untouched.
dfs_copy = dfs.copy()
random.shuffle(dfs_copy)  # in-place, unseeded: order differs on every run

for shuffled_chunk in dfs_copy:
    print(shuffled_chunk, end='\n\n')

Empty DataFrame
Columns: [class, student, score]
Index: []

  class student  score
2  Math    Song     20

Empty DataFrame
Columns: [class, student, score]
Index: []

     class student  score
0  English    Song    100

     class student  score
1  English   Trump     10

  class student  score
3  Math   Trump     90

