# Groupby and apply
Powerful data query mechanisms
see '3.13 Grouping Rows by Values' in Albon

## Load a dataset

In [None]:
# !mamba install seaborn -y

In [None]:
import pandas as pd
import seaborn as sns
# Titanic passenger CSV; pandas reads it directly from the URL
url="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
# previous source, kept for reference only (no longer available):
# url="https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv"  #404 now
df = pd.read_csv(url)

In [None]:
# number of rows in the DataFrame (shape is (rows, columns))
print(df.shape[0])

# preview the first few rows
df.head()

Note: Albon's original dataset had both Sex and SexCode (SexCode is just a numeric representation of Sex). The dataset loaded here may not include SexCode; check `df.columns`.

In [None]:
#let's see what type of data is in each column
df.dtypes
#what are the categorical variables?  Which are nominal and which are ordinal?

In [None]:
# number of distinct values per column; a low count hints at a categorical variable
df.nunique()

## 'Groupby' allows you to separate your data by some selector or combination of selectors and then apply a function over each of these groups to calculate a statistic.

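Aggregations such as `sum` only make sense for numerical fields. Older pandas versions silently dropped the non-numeric columns; from pandas 2.0 on, pass `numeric_only=True` to restrict the aggregation to numeric columns.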


## Let's see who survived based on Sex.  
I'm grouping all the rows by sex, and then adding up all the values in numerical fields

In [None]:
#this is a groupby object: it only describes how the rows are split by Sex,
#no statistic is computed until you apply a function to it
gb = df.groupby('Sex')
print(type(gb))

In [None]:
#after applying a function to the groupby object you get a dataframe
# numeric_only=True restricts the sum to numeric columns; without it
# pandas 2.x also tries to "sum" text columns such as Name
df1=df.groupby('Sex').sum(numeric_only=True)
print(type(df1))
df1

In [None]:
# adding up Ages is not helpful (nor is summing a code column such as SexCode, if present),
# let's just see who survived: the sum of the Survived column per Sex
df1.Survived
# df1.iloc[:,1]  #same as above, as long as Survived is the second column of df1

In [None]:
# Note that this DataFrame has summary stats only (one row per group),
# it's not hiding any of the other data
df1.shape

In [None]:
# df1.index
# df1.loc['male']

## Let's see who survived based on Sex and Pclass.  

In [None]:
# count() returns the number of non-null Survived values in each (Sex, Pclass)
# group, i.e. the group size -- use sum() instead to add up the survivors
df.groupby(['Sex','Pclass']).Survived.count()

In [None]:
#Note that the grouping has changed: rows are now indexed by (Sex, Pclass)
# df2 is needed by the following cells, so it must actually be created here;
# numeric_only=True keeps the sum to the numeric columns
df2=df.groupby(['Sex','Pclass']).sum(numeric_only=True)
df2

# df2.iloc[:,1]

In [None]:
#want to select just 1 row from the multiindexed dataframe above?
#first you need to know what the index labels look like, so ask for the index
df2.index

In [None]:
#to get the first row, pass its full index tuple to loc;
#wrapping the tuple in a list returns a DataFrame instead of a Series
df2.loc[[('female',1)]]

## You do not have to use the built in functions for groupby.  You can make your own.
Use groupby's apply() method. <mark>The function you pass takes a DataFrame (one group) and returns a DataFrame, a Series, or a scalar.</mark>

In [111]:
#lets find the oldest male and female
def fun1(df):
    """Return the Name and Age of the oldest passenger in ``df``.

    Meant to be passed to ``groupby(...).apply``, where ``df`` is the
    sub-DataFrame holding the rows of one group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name`` and ``Age``.

    Returns
    -------
    pandas.Series
        The ``Name`` and ``Age`` entries of the row with the largest Age
        (sort_values places missing ages last, so they are never picked
        while at least one age is known).
    """
    #alternative 1: sort by age and return just the oldest age (a single value)
    # return (df.Age.sort_values(ascending=False).iloc[0])

    #alternative 2: iloc with just numbers (which column is Age and Name?)
    #fragile: breaks as soon as the column layout changes
    # return (df.sort_values(by='Age',ascending=False).iloc[0,[3,5]])

    #alternative 3: the slower loc with column names (note the funky way to get row 0, make sure you sort it as well or you get the wrong answer)
    # return (df.sort_values(by='Age',ascending=False).loc[df.sort_values(by='Age',ascending=False).index[0],['Name','Age']])

    #dont like the 3,5? get the column positions by name
    return (df.sort_values(by='Age',ascending=False).iloc[0,[df.columns.get_loc(c) for c in ['Name', 'Age']]])
            
# call fun1 once per Sex group; the results are combined with one row per group
df.groupby('Sex').apply(fun1)

Unnamed: 0_level_0,Name,Age
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,"Turkula, Mrs. (Hedwig)",63.0
male,"Barkworth, Mr. Algernon Henry Wilson",80.0


In [None]:
#lets find the oldest male and female, who died and survived 
def fun1(df):
    """Return the Name and Age of the oldest passenger in ``df``.

    Meant to be passed to ``groupby(...).apply``, where ``df`` is the
    sub-DataFrame holding the rows of one group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name`` and ``Age``.

    Returns
    -------
    pandas.Series
        The ``Name`` and ``Age`` entries of the row with the largest Age.
    """
    #sort oldest first; sort_values puts missing ages at the end
    oldest_first = df.sort_values(by='Age', ascending=False)

    #pick the columns by name, not by position ([3,5]): positions change with
    #the column layout and with whether the grouping columns are part of df
    return oldest_first.iloc[0][['Name', 'Age']]


# group on two keys: fun1 is called once for every (Sex, Survived) combination
# (note that this rebinds df1)
df1=df.groupby(['Sex','Survived']).apply(fun1)
df1

In [None]:
#what to use to dereference? look at the index labels of the result
df1.index

In [None]:
# query() can refer to index level names as well as column names,
# so the index levels Sex and Survived can be filtered on directly
# df1.query("Sex=='female' & Survived==0")
df1.query("Survived==0")

## Does gender or passenger class affect your chances?

We want percentages here, not absolute numbers: for instance, the number of female 1st class passengers who survived divided by the total number of female 1st class passengers.


In [None]:
#survival based on class
def fun3(df):
    """Return the survival rate (a fraction between 0 and 1) of the rows in ``df``.

    When used with ``groupby(['Sex','Pclass']).apply``, each ``df`` passed in
    holds all rows of one gender in one of the passenger classes.
    """
    #number of survivors in this group (sum of the Survived column)
    survivor_total = df['Survived'].sum()
    #total number of passengers in this group
    group_size = len(df)
    #survival rate = survivors divided by group size
    return survivor_total / group_size


# apply fun3 to every (Sex, Pclass) group; each call returns one number,
# and the numbers are collected in res, indexed by (Sex, Pclass)
res=df.groupby(['Sex','Pclass']).apply(fun3)
res

In [None]:
#can also do it this way
tmp=df.groupby(['Sex','Pclass'])   #create a groupby object
tmp.sum().loc[:,'Survived']/tmp.Survived.count()  #divide total number survivors by total number

## I would like to graph this but my indexes are a pain, reset_index to the rescue!
It will create a dataframe from this Series and save the multiindex as columns

In [None]:
# reset_index() turns the index levels into ordinary columns and returns a DataFrame
res1=res.reset_index()
res1

In [None]:
# give the value column (labelled 0) a readable name
res1 = res1.rename(columns={0: 'Survival rate %'})
# convert the fraction to a percentage
res1['Survival rate %'] = res1['Survival rate %'] * 100
# only the last expression of a cell is displayed, so show the finished table
res1

In [None]:
# check the column types before plotting
res1.dtypes

In [None]:
#convert Pclass to string so seaborn does not fail when printing the legend below
res1['Pclass']=res1['Pclass'].astype(str)

In [None]:
#want to filter the seaborn warnings? uncomment the lines below
# import warnings
# warnings.filterwarnings("ignore", "is_categorical_dtype")
# warnings.filterwarnings("ignore", "use_inf_as_na")

#let's graph this: res1 is a plain DataFrame (the old index levels are now columns),
#so Sex and Pclass can be passed to seaborn by column name
sns.barplot(data=res1,x="Sex",y='Survival rate %',  hue="Pclass" );