## Data Aggregation and Descriptive Statistics

### Descriptive Statistics

In [8]:
###################################################################################################
# DataFrames and Series give you convenient access to descriptive statistics
# via methods like sum, mean, and count.
# By default, they aggregate along axis=0, i.e., a DataFrame returns a Series
# with one statistic per column.
# The full list is available in the pandas documentation:
# https://pandas.pydata.org/pandas-docs/stable/reference/frame.html#computations-descriptive-stats
###################################################################################################
import pandas as pd

In [9]:
# sample data: one column per city, one row per observation
rainfall = pd.DataFrame(
    [[300.1, 400.3, 1000.5],
     [100.2, 300.4, 1100.6]],
    columns=["City 1", "City 2", "City 3"],
)
rainfall

Unnamed: 0,City 1,City 2,City 3
0,300.1,400.3,1000.5
1,100.2,300.4,1100.6


In [10]:
# axis=0 is the default, spelled out here: one mean per column
rainfall.mean(axis=0)

City 1     200.15
City 2     350.35
City 3    1050.55
dtype: float64

In [11]:
# to get the statistic per row instead, aggregate across the columns
# (axis="columns" is the named equivalent of axis=1)
rainfall.mean(axis="columns")

0    566.966667
1    500.400000
dtype: float64

### Grouping

In [12]:
# load the course participants workbook; the path is relative to the
# directory the notebook is run from
df = pd.read_excel(io="../../data/course_participants.xlsx")
df

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


In [23]:
# average score per continent:
# split the rows into one group per continent, select the score column,
# and then take the mean of each group
df.groupby(by="continent")["score"].mean()

continent
America    5.30
Europe     6.75
Name: score, dtype: float64

In [26]:
# use agg to apply your own function to each group:
# here, the difference between the highest and the lowest score per continent
df.groupby(by="continent")["score"].agg(lambda scores: scores.max() - scores.min())

continent
America    2.8
Europe     4.5
Name: score, dtype: float64