In [2]:
import pandas as pd
import numpy as np

from pydataset import data

## Disaggregating Data

- This is a breakdown of using `Index.repeat` and `.loc` to disaggregate data.

In [5]:
# HairEyeColor ships already aggregated: the `Freq` column stores how many
# observations each row stands for.
df = data("HairEyeColor")

df.head(5)

Unnamed: 0,Hair,Eye,Sex,Freq
1,Black,Brown,Male,32
2,Brown,Brown,Male,53
3,Red,Brown,Male,10
4,Blond,Brown,Male,3
5,Black,Blue,Male,11


- If we disaggregated the data, we would have one row per observation, so the sum of the `Freq` column tells us how many rows to expect.

In [10]:
# Total number of observations = expected row count after disaggregation.
df["Freq"].sum()

592

- Use `index.repeat` with the values in the `Freq` column to create a new index in which each row label is repeated `Freq` times.

In [24]:
# Each index label appears as many times as that row's `Freq` value.
df.index.repeat(df["Freq"])

Int64Index([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
            ...
            31, 31, 32, 32, 32, 32, 32, 32, 32, 32],
           dtype='int64', length=592)

- Pass this new index to `.loc` to grab rows by index label.

In [21]:
# Label-based lookup with the repeated index returns each row `Freq` times.
df.loc[df.index.repeat(df["Freq"])]

Unnamed: 0,Hair,Eye,Sex,Freq
1,Black,Brown,Male,32
1,Black,Brown,Male,32
1,Black,Brown,Male,32
1,Black,Brown,Male,32
1,Black,Brown,Male,32
...,...,...,...,...
32,Blond,Green,Female,8
32,Blond,Green,Female,8
32,Blond,Green,Female,8
32,Blond,Green,Female,8


- Reset the index, passing `drop=True` so that the old (repeated) index is discarded instead of being added back as a new column by `reset_index`.

In [22]:
# Expand the rows, then replace the repeated labels with a fresh 0..n-1 index.
(
    df.loc[df.index.repeat(df["Freq"])]
    .reset_index(drop=True)
)

Unnamed: 0,Hair,Eye,Sex,Freq
0,Black,Brown,Male,32
1,Black,Brown,Male,32
2,Black,Brown,Male,32
3,Black,Brown,Male,32
4,Black,Brown,Male,32
...,...,...,...,...
587,Blond,Green,Female,8
588,Blond,Green,Female,8
589,Blond,Green,Female,8
590,Blond,Green,Female,8


- Validate that our strategy is working by looking at a sample.

In [19]:
# Materialise the disaggregated frame: every row of `df` repeated `Freq` times,
# with the repeated labels replaced by a fresh 0..n-1 index.
df_dis = df.loc[df.index.repeat(df.Freq)].reset_index(drop=True)

# Check the expansion programmatically rather than relying only on the rows
# displayed below: the row count must equal the total of the frequencies.
assert len(df_dis) == df.Freq.sum(), "disaggregated row count does not match sum of Freq"

df_dis.tail(15)

Unnamed: 0,Hair,Eye,Sex,Freq
577,Red,Green,Female,7
578,Red,Green,Female,7
579,Red,Green,Female,7
580,Red,Green,Female,7
581,Red,Green,Female,7
582,Red,Green,Female,7
583,Red,Green,Female,7
584,Blond,Green,Female,8
585,Blond,Green,Female,8
586,Blond,Green,Female,8


- Drop the `Freq` column if we no longer need it.

In [12]:
# `Freq` is redundant once every observation has its own row.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because `Freq` has already been removed from `df_dis`.
df_dis = df_dis.drop(columns=['Freq'], errors='ignore')

In [13]:
# Preview the first five rows of the disaggregated frame.
df_dis.head(5)

Unnamed: 0,Hair,Eye,Sex
0,Black,Brown,Male
1,Black,Brown,Male
2,Black,Brown,Male
3,Black,Brown,Male
4,Black,Brown,Male


In [15]:
# (rows, columns) of the disaggregated frame.
df_dis.shape

(592, 3)

## Takeaway - Pandas Index == Awesome!