groupby practice
June 27th, 2024

In [3]:
import pandas as pd

# Sample data: a key column 'A' holding two categories and a numeric column 'B'.
data = {
    'A': ['foo', 'bar'] * 3,   # foo, bar, foo, bar, foo, bar
    'B': list(range(1, 7)),    # 1, 2, 3, 4, 5, 6
}
df = pd.DataFrame(data)

# Group the rows by the distinct values found in column 'A' (here: 'foo' and 'bar').
grouped = df.groupby('A')

# `grouped` is a GroupBy object, not a DataFrame. Printing it only shows the
# object's repr, not the grouped rows; aggregate it or iterate over it to see data.
# print(type(grouped))

# Display the original, ungrouped DataFrame as the cell output.
df


Unnamed: 0,A,B
0,foo,1
1,bar,2
2,foo,3
3,bar,4
4,foo,5
5,bar,6


In [8]:
# Aggregation
# Aggregation: apply an aggregate function such as sum(), mean(), count(), etc. to every group.
# `grouped` is the GroupBy object created by df.groupby('A') (it is not a DataFrame itself).
# Calling sum() on it adds up, for each distinct value in column 'A', the corresponding numbers
# in column 'B'.
# The result is a DataFrame called grouped_sum, indexed by the group keys ('bar' and 'foo').
grouped_sum = grouped.sum()
# print(type(grouped_sum))

      B
A      
bar  12
foo   9
<class 'NoneType'>


In [9]:
# Iteration: looping over a GroupBy object yields one (group key, sub-DataFrame) pair per group.
# For every group, print its key on one line followed by all of the rows that belong to it.
for name, group in grouped:
    print(name, group, sep="\n")

bar
     A  B
1  bar  2
3  bar  4
5  bar  6
foo
     A  B
0  foo  1
2  foo  3
4  foo  5


In [12]:
# Transformation: transform() applies a function to each group separately and returns an
# object with the same shape as the original, so every input row gets exactly one output row.
#
# Here the function centers each group on its own mean: the group's mean is computed and then
# subtracted from every value in that group. The results are therefore deviations from the
# mean of the group each row belongs to.
#
# Worked example for column 'B':
#   group 'foo' (indices 0, 2, 4): mean = (1 + 3 + 5) / 3 = 3  ->  [1-3, 3-3, 5-3] = [-2, 0, 2]
#   group 'bar' (indices 1, 3, 5): mean = (2 + 4 + 6) / 3 = 4  ->  [2-4, 4-4, 6-4] = [-2, 0, 2]
def _deviation_from_group_mean(values):
    """Return the given group values minus their own mean."""
    return values - values.mean()


grouped_transform = grouped.transform(_deviation_from_group_mean)

# Display the transformed DataFrame as the cell output.
# print(grouped_transform)
grouped_transform

Unnamed: 0,B
0,-2.0
1,-2.0
2,0.0
3,0.0
4,2.0
5,2.0
