# Summarizing and Computing Descriptive Statistics

In [83]:
import pandas as pd
import numpy as np
import seaborn as sns
import statistics as stats

### Just a quick word on displaying DataFrames

In [84]:
#All display options available
pd.describe_option()

compute.use_bottleneck : bool
    Use the bottleneck library to accelerate if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]
compute.use_numexpr : bool
    Use the numexpr library to accelerate computation if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]
display.chop_threshold : float or None
    if set to a float value, all float values smaller then the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]
display.colheader_justify : 'left'/'right'
    Controls the justification of column headers. used by DataFrameFormatter.
    [default: right] [currently: right]
display.column_space No description available.
    [default: 12] [currently: 12]
display.date_dayfirst : boolean
    When True, prints and parses dates with the day first, eg 20/01/2005
    [default: False] [currently: False]
display.date_yearfirst 

In [85]:
#Returns the number of rows that will be displayed 
pd.get_option('display.max_rows')

60

In [86]:
#Change the number of rows to be displayed. None prints all the rows in the data frame
pd.set_option('display.max_rows', None)

In [87]:
#To reset any changes :
pd.reset_option('display.max_rows')

## Summarize data

In [88]:
df = pd.DataFrame(np.arange(8).reshape(4,2),
                     index=['a', 'b', 'c', 'd'],
                     columns=['one', 'two'])
df

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5
d,6,7


In [89]:
#Let's get the info for this DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   one     4 non-null      int64
 1   two     4 non-null      int64
dtypes: int64(2)
memory usage: 96.0+ bytes


In [90]:
#Should always check the dimensions of our DataFrame
df.shape

(4, 2)

In [91]:
#Let's look at the number of values in the dataframe
df.count()

one    4
two    4
dtype: int64

In [115]:
#If we want to know how many unique values there are in a given column
# print(df.two.nunique())
# df

#Sanity check, let's change a value and check again :
# df['two'].b = 5
# print(df.count())
# df.info()
# df.two.nunique()

#Or get the number of unique values for every column
df.nunique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


one     4
two     3
name    3
dtype: int64

In [113]:
#What are those unique values ?
df.two.unique()

array([ 1., nan,  5.,  7.])

In [116]:
#Let's add a new column with names 
df['name'] = ['John', 'Tim', 'John', 'Chris']
df

Unnamed: 0,one,two,name
a,0,1.0,John
b,2,5.0,Tim
c,4,5.0,John
d,6,7.0,Chris


In [117]:
#Let's now get the number of times each name appears:
df['name'].value_counts()

John     2
Chris    1
Tim      1
Name: name, dtype: int64

### There are two way of going about descriptive statistics : 
### - describing the values of observations in a variable : sum, mean, median, min/max
### - describing the variable spread : sd, variance, counts, quartiles

### Pivot tables are making a come back

In [118]:
df

Unnamed: 0,one,two,name
a,0,1.0,John
b,2,5.0,Tim
c,4,5.0,John
d,6,7.0,Chris


In [119]:
#Remember pivot tables will group your data and provide helpful summaries
pd.pivot_table(df, values = ['one', 'two'], index = ['name'], aggfunc=np.mean)

Unnamed: 0_level_0,one,two
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chris,6,7.0
John,2,3.0
Tim,2,5.0


In [122]:
#We can calculate mutliple statistics at once :
data = pd.pivot_table(df, values = ['one', 'two'], index = ['name'], aggfunc=[np.mean, np.sum])
data

Unnamed: 0_level_0,mean,mean,sum,sum
Unnamed: 0_level_1,one,two,one,two
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Chris,6,7.0,6,7.0
John,2,3.0,4,6.0
Tim,2,5.0,2,5.0


In [125]:
#We can use the unstack() function to change how we display the results based on the function instead of index
# pd.DataFrame(pd.pivot_table(df, values = ['one', 'two'], 
#                             index = ['name'], aggfunc=[np.mean, np.sum]).unstack())

#Let's access the data for John only. 
data.loc['John']

#Now lets access John's mean values
data.loc['John', 'mean']

one    2.0
two    3.0
Name: John, dtype: float64

### Notice that for the parameter aggfunc we can pass in numpy functions. A list of those functions can be found [here](https://docs.scipy.org/doc/numpy/reference/routines.statistics.html)

In [126]:
#The argument aggfunc can be passed in a variety of functions to calculate different statistics on different 
#columns
pd.pivot_table(df, values = ['one', 'two'], index = ['name'], aggfunc={'one':[np.mean, np.max], 'two':np.sum})

Unnamed: 0_level_0,one,one,two
Unnamed: 0_level_1,amax,mean,sum
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Chris,6,6,7.0
John,4,2,6.0
Tim,2,2,5.0


### We can also use the aggregate function on the dataframes and pass in numpy functions

In [127]:
#Aggregate takes in numpy functions
df['one'].aggregate([np.sum, np.min])

sum     12
amin     0
Name: one, dtype: int64

### Let's now look at some of the pandas built-in stats functions

In [132]:
#Get total values for each column (row wise)
# df.sum(numeric_only=True)
# df
df.sum()

#We have some interesting results....

one                   12
two                   18
name    JohnTimJohnChris
dtype: object

In [137]:
#Get the mean value for each column (row wise)
df.mean(axis=0) #row
# df.mean(axis=1) #column

one    3.0
two    4.5
dtype: float64

In [135]:
#Get the maximum value for each column (row wise)
df.max()

one       6
two       7
name    Tim
dtype: object

In [136]:
#Get the maximum value for each column (row wise)
df.min()

one         0
two         1
name    Chris
dtype: object

In [139]:
#Say we want to get the location of where the maximum/minimum value for column two is...
df.two.idxmax()

'd'

In [142]:
# Can u guess a method to get a summary stats?
df.describe()

Unnamed: 0,one,two
count,4.0,4.0
mean,3.0,4.5
std,2.581989,2.516611
min,0.0,1.0
25%,1.5,4.0
50%,3.0,5.0
75%,4.5,5.5
max,6.0,7.0


In [150]:
#We get a neat litte table with all the stats that we calculated before
df.describe(include='all')


# pd.pivot_table(df, values = ['one', 'two'], index = ['name'], 
#                aggfunc={'one':[np.mean, np.max], 
#                         'two':[np.sum, np.min],
#                        'name':np.unique})

Unnamed: 0,one,two,name
count,4.0,4.0,4
unique,,,3
top,,,John
freq,,,2
mean,3.0,4.5,
std,2.581989,2.516611,
min,0.0,1.0,
25%,1.5,4.0,
50%,3.0,5.0,
75%,4.5,5.5,


In [151]:
#Only include object types
df.describe(include=['object'])

Unnamed: 0,name
count,4
unique,3
top,John
freq,2


In [152]:
#Include object and float types
df.describe(include=['object', 'float'])

Unnamed: 0,two,name
count,4.0,4
unique,,3
top,,John
freq,,2
mean,4.5,
std,2.516611,
min,1.0,
25%,4.0,
50%,5.0,
75%,5.5,


In [153]:
#We can filter the data first and then describe it. 
df[df['name'] == 'John'].describe()

Unnamed: 0,one,two
count,2.0,2.0
mean,2.0,3.0
std,2.828427,2.828427
min,0.0,1.0
25%,1.0,2.0
50%,2.0,3.0
75%,3.0,4.0
max,4.0,5.0


### A quick look at a dataframe with NaN values

In [154]:
df2 = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                       [np.nan, np.nan], [0.75, -1.3]],
                     index=['a', 'b', 'c', 'd'],
                     columns=['one', 'two'])
df2

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [155]:
#By default the sum() function skips NaN values
df2.sum()

one    9.25
two   -5.80
dtype: float64

In [156]:
#If we do not, then we get NaN whenever there are NaN values
df2.sum(skipna=False)

one   NaN
two   NaN
dtype: float64

In [157]:
#These functions will by default ignore the NaN values, it is not a reason to not deal with them though...
df2.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## There is a built-in statistics package called **drumroll* * statistics....

All documentation for this package can be found [here](https://docs.python.org/3/library/statistics.html#module-statistics)

This package includes most of the functions we've just seen like : mean, mode, median, quantiles and so on.

### Demo with a dataframe that has more than a few values in it

In [158]:
cars = sns.load_dataset('car_crashes')
cars.head()

#Let's focus on insurance premiums

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA


In [166]:
#Let's use some the statistics we've seen but using the statistics package :

#Let's check out the mean :
# stats.mean(cars['ins_premium'])

#Let's get the median insurance premium :
# stats.median(cars['ins_premium'])

# #Is the median an actual value in the data or was it interpolated ?
# cars[cars['ins_premium'] == 858.97]

# #Assuming that the value is not in the data set, let's use the median_low, median_high
stats.median_low(cars['ins_premium'])
stats.median_high(cars['ins_premium'])  #Why are we getting the same value ?....

858.97

In [169]:
#Some other statistics in the package :

# stats.mode(cars['ins_premium'])

#Add a few values to the series
cars_prem = cars['ins_premium']
cars_prem_concat = cars_prem.append(pd.Series([2,2,2,2,1,1,1]))

#Let's look at the mode now :
stats.mode(cars_prem_concat)

2.0

In [173]:
#If we look at the spread of the variable ins_prem :

# Population standard deviation of data.
# stats.pstdev(cars['ins_premium'])

# Population variance of data.
# stats.pvariance(cars['ins_premium'])

# Sample standard deviation of data.
# stats.stdev(cars['ins_premium'])

# Sample variance of data.
# stats.variance(cars['ins_premium'])

31789.56517035294

## Let's look at another data set

In [174]:
#Let's read in some baby names :
df_names = pd.read_csv('https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US_Baby_Names/US_Baby_Names_right.csv')
df_names

Unnamed: 0.1,Unnamed: 0,Id,Name,Year,Gender,State,Count
0,11349,11350,Emma,2004,F,AK,62
1,11350,11351,Madison,2004,F,AK,48
2,11351,11352,Hannah,2004,F,AK,46
3,11352,11353,Grace,2004,F,AK,44
4,11353,11354,Emily,2004,F,AK,41
...,...,...,...,...,...,...,...
1016390,5647421,5647422,Seth,2014,M,WY,5
1016391,5647422,5647423,Spencer,2014,M,WY,5
1016392,5647423,5647424,Tyce,2014,M,WY,5
1016393,5647424,5647425,Victor,2014,M,WY,5


In [175]:
df_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016395 entries, 0 to 1016394
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1016395 non-null  int64 
 1   Id          1016395 non-null  int64 
 2   Name        1016395 non-null  object
 3   Year        1016395 non-null  int64 
 4   Gender      1016395 non-null  object
 5   State       1016395 non-null  object
 6   Count       1016395 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 54.3+ MB


In [176]:
df_names.head()

Unnamed: 0.1,Unnamed: 0,Id,Name,Year,Gender,State,Count
0,11349,11350,Emma,2004,F,AK,62
1,11350,11351,Madison,2004,F,AK,48
2,11351,11352,Hannah,2004,F,AK,46
3,11352,11353,Grace,2004,F,AK,44
4,11353,11354,Emily,2004,F,AK,41


In [177]:
#Drop useless columns like ID and Unnamed:0
df_names.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)
df_names

Unnamed: 0,Name,Year,Gender,State,Count
0,Emma,2004,F,AK,62
1,Madison,2004,F,AK,48
2,Hannah,2004,F,AK,46
3,Grace,2004,F,AK,44
4,Emily,2004,F,AK,41
...,...,...,...,...,...
1016390,Seth,2014,M,WY,5
1016391,Spencer,2014,M,WY,5
1016392,Tyce,2014,M,WY,5
1016393,Victor,2014,M,WY,5


In [185]:
#Let's get the number of names per gender :
df_names['Gender'].value_counts()

# df_names.groupby('Gender').sum()

F    558846
M    457549
Name: Gender, dtype: int64

In [187]:
#Number of unique names regardless of gender :
df_names['Name'].nunique()

17632

In [190]:
#What is the most popular name there is ?....
df_names.groupby('Name').Count.sum().idxmax()

'Jacob'

In [193]:
#Let's look at the 5 most popular names 
df_names.groupby('Name').Count.sum().nlargest(10)

Name
Jacob       242874
Emma        214852
Michael     214405
Ethan       209277
Isabella    204798
William     197894
Joshua      191551
Sophia      191446
Daniel      191440
Emily       190318
Name: Count, dtype: int64

In [198]:
#We can also sort the names based on their count
df_names.sort_values("Count", ascending = 0).head(10)

Unnamed: 0,Name,Year,Gender,State,Count
107416,Daniel,2004,M,CA,4167
110097,Daniel,2005,M,CA,3914
115739,Daniel,2007,M,CA,3865
112872,Daniel,2006,M,CA,3826
107417,Anthony,2004,M,CA,3805
115740,Anthony,2007,M,CA,3782
112873,Anthony,2006,M,CA,3774
118707,Daniel,2008,M,CA,3749
110098,Anthony,2005,M,CA,3745
112874,Angel,2006,M,CA,3687


In [199]:
#Let's look at the summary of all of these statistics :
pd.DataFrame(df_names['Count'].describe())

Unnamed: 0,Count
count,1016395.0
mean,34.85012
std,97.39735
min,5.0
25%,7.0
50%,11.0
75%,26.0
max,4167.0


In [200]:
#Let's take a look at the total count for each name :
df_names.groupby('Name').Count.sum()

Name
Aaban        12
Aadan        23
Aadarsh       5
Aaden      3426
Aadhav        6
           ... 
Zyra         42
Zyrah        11
Zyren         6
Zyria        59
Zyriah       58
Name: Count, Length: 17632, dtype: int64

In [204]:
# and the highest count :
df_names.groupby('Name').Count.sum().max()

# which is...
df_names.groupby('Name').Count.sum().idxmax()

'Jacob'

In [205]:
#Let's look at the pandas built-in stats function again :
df_names.describe()

Unnamed: 0,Year,Count
count,1016395.0,1016395.0
mean,2009.053,34.85012
std,3.138293,97.39735
min,2004.0,5.0
25%,2006.0,7.0
50%,2009.0,11.0
75%,2012.0,26.0
max,2014.0,4167.0


In [213]:
#Let's filter out some data using multiple conditions :
# df_names[(df_names.Count >= 2000) & (df_names.Year >= 2012)]

# We can filter based on other criteria :
# df_names[df_names.Name.isin(['Sophia','Emma'])]

# and again :
df_names[df_names.Name != 'Sophia']

Unnamed: 0,Name,Year,Gender,State,Count
0,Emma,2004,F,AK,62
1,Madison,2004,F,AK,48
2,Hannah,2004,F,AK,46
3,Grace,2004,F,AK,44
4,Emily,2004,F,AK,41
...,...,...,...,...,...
1016390,Seth,2014,M,WY,5
1016391,Spencer,2014,M,WY,5
1016392,Tyce,2014,M,WY,5
1016393,Victor,2014,M,WY,5


In [220]:
# We might need to do some preprocessing on strings before working with them :
# df_names['upper_names'] = df_names['Name'].str.upper()
# df_names

# we also have lower(), title()

# Other preprocessing :
# Get length of string :
# df_names['len'] = df_names['Name'].str.len()

# Contains part of a word/term :
# df_names[df_names['Name'].str.contains('m', case=False)]

# pd.Series.str.contains?

# We might need to replace part of the string :
df_names['Name'].str.replace('original_word', 'replacement_word')

In [222]:
#We want control over the stats we get :
# df_names.groupby(['Name']).Count.agg(['sum','mean','max','min'])


# We can create a multi-index table with the stats we want :
df_names.groupby(['Year','Gender']).Count.agg(['sum', 'mean', 'min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,mean,min,max
Year,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004,F,1523496,31.684712,5,3416
2004,M,1770488,46.182226,5,4167
2005,F,1528165,31.06279,5,3282
2005,M,1777150,45.222403,5,3914
2006,F,1564886,30.773947,5,3101
2006,M,1822851,44.511892,5,3826
2007,F,1576648,30.184324,5,2961
2007,M,1835769,43.607036,5,3865
2008,F,1544411,29.379299,5,2788
2008,M,1795907,42.354299,5,3749


In [225]:
# We can get the smallest/largest values as well :
# df_names.groupby('Name').Count.sum().nlargest(5)

# df_names.groupby('Name').Count.sum().nsmallest(5)

# Or we can sort the values :
df_names.groupby('Name').Count.sum().nlargest(5)

Name
Jacob       242874
Emma        214852
Michael     214405
Ethan       209277
Isabella    204798
Name: Count, dtype: int64