# Iris Species Analysis with Grouping & Aggregation in Pandas

### Grouping & Aggregation

In [2]:
import pandas as pd

In [3]:
# Load the Iris dataset.
df=pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
df.shape

(150, 5)

In [5]:
# Group by species and find the mean of each numeric column.

df.groupby('species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [6]:
# Group by species and find the max of each numeric column.
df.groupby('species').max()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [7]:
# Group by species and get the count of rows.

df.groupby('species').size()

species
setosa        50
versicolor    50
virginica     50
dtype: int64

In [10]:
# Get the standard deviation of petal_length for each species.

df.groupby('species')['petal_length'].std()

species
setosa        0.173664
versicolor    0.469911
virginica     0.551895
Name: petal_length, dtype: float64

In [14]:
# Create a DataFrame showing only the mean sepal_length by species.
df.groupby('species')[['sepal_length']].mean()

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,5.006
versicolor,5.936
virginica,6.588


In [16]:
# Create a summary table using .agg() to show mean, min, max for all columns grouped by species.

df.groupby('species').agg(['mean','min','max'])

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max,mean,min,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
setosa,5.006,4.3,5.8,3.428,2.3,4.4,1.462,1.0,1.9,0.246,0.1,0.6
versicolor,5.936,4.9,7.0,2.77,2.0,3.4,4.26,3.0,5.1,1.326,1.0,1.8
virginica,6.588,4.9,7.9,2.974,2.2,3.8,5.552,4.5,6.9,2.026,1.4,2.5


In [18]:
# Use describe() after grouping by species.

df.groupby('species').describe()

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,...,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.428,...,1.575,1.9,50.0,0.246,0.105386,0.1,0.2,0.2,0.3,0.6
versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


In [24]:
# Sort the groups by average sepal_width.
df.groupby('species')['sepal_width'].mean().sort_values()



species
versicolor    2.770
virginica     2.974
setosa        3.428
Name: sepal_width, dtype: float64

In [28]:
# Get the species with the highest average petal_width.

df.groupby('species')['petal_width'].mean().idxmax()

'virginica'

In [None]:


🧩 MultiIndex & Reshaping
Create a pivot table with species as index, and mean of each measurement as values.

Stack the pivoted DataFrame.

Unstack the stacked DataFrame.

Use .melt() to unpivot the DataFrame.

Rename columns during melt: make 'variable' to 'Feature', and 'value' to 'Measurement'.

Create a MultiIndex from species and bin of sepal_length.

Use .reset_index() to flatten a MultiIndex.

Use .set_index() to set species and sepal_width as the index.

Swap the levels of a MultiIndex.

Sort the MultiIndex.

SyntaxError: invalid character '🧩' (U+1F9E9) (3660326297.py, line 13)