In [18]:
import numpy as np
import pandas as pd
import seaborn as sns

In [63]:
# Load seaborn's example `titanic` dataset; every cell below works off this DataFrame.
data = sns.load_dataset(name="titanic")

In [64]:
# Peek at the first five rows to see which columns are available.
data.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


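### હવે મને 'જાતિ' મુજબ જોવું છે કે એમાંથી કેટલા કયા કલાસના હતા અને બચ્યા કેટલા
(English: Now I want to see, by sex, which class the passengers were in and how many of them survived.)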

In [8]:
# Mean of `survived` (i.e. the survival rate) for every sex x class combination;
# the mean is pivot_table's default aggregation.
data.pivot_table(
    values='survived',
    index='sex',
    columns='class',
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### Instead, we would have ended up doing the lazy, boring `groupby` with this syntax...

In [30]:
# Same table built by hand: group, average `survived`, then move the inner
# `class` level from the row index into the columns.
(
    data
    .groupby(['sex', 'class'])['survived']
    .mean()
    .unstack()
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


## Trying out `pd.cut`

In [39]:
# Preview the binning: right-closed intervals (0, 18], (18, 40], (40, 80];
# rows with a missing age stay NaN.
pd.cut(data['age'], bins=[0, 18, 40, 80])

0      (18.0, 40.0]
1      (18.0, 40.0]
2      (18.0, 40.0]
3      (18.0, 40.0]
4      (18.0, 40.0]
           ...     
886    (18.0, 40.0]
887    (18.0, 40.0]
888             NaN
889    (18.0, 40.0]
890    (18.0, 40.0]
Name: age, Length: 891, dtype: category
Categories (3, interval[int64]): [(0, 18] < (18, 40] < (40, 80]]

In [40]:
# Cool, now let's store those binned ages in a variable named `age`.

In [41]:
# Keep the binned ages around so they can be handed to pivot_table as a grouping key.
age = pd.cut(x=data['age'], bins=[0, 18, 40, 80])

#### So based on that we want to make a multilevel pivot table

In [65]:
# Two row levels: survival rate by sex, then by `who`, across classes.
data.pivot_table(
    values='survived',
    index=['sex', 'who'],
    columns='class',
)

Unnamed: 0_level_0,class,First,Second,Third
sex,who,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,child,0.666667,1.0,0.533333
female,woman,0.978022,0.909091,0.491228
male,child,1.0,1.0,0.321429
male,man,0.352941,0.080808,0.119122


Note: use `df.at[row, col] = value` to change a single value in the data.
## Now, using `age` bins...

In [66]:
# Same idea, but the second row level is the `age` Series of bins rather than
# a column name.
data.pivot_table(
    values='survived',
    index=['sex', age],
    columns='class',
)

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 40]",0.979167,0.914894,0.48
female,"(40, 80]",0.961538,0.846154,0.111111
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 40]",0.478261,0.063492,0.146199
male,"(40, 80]",0.28,0.095238,0.064516


### Here's what happened basically...

    See, pivot_table doesn't know anything about pd.cut. It just takes Series-like data as grouping keys.
    As we saw in the previous example, we could pass ['sex', 'who'], where 'who' and 'sex' are column
    names that each refer to a Series in the DataFrame.
    
    What pivot_table does is take the distinct values of each of those Series and use them as the
    row and column labels.
    
    Here, the age bins (0, 18], (18, 40], (40, 80] are simply the values of the `age` Series, so pandas
    took its distinct values and placed them in the resulting pivot table wherever the row and column
    matched up. So the magic happened only in pd.cut... the rest was dead simple!

### Getting more wild from the book...
Adding the Multilevel pivot in columns


## Getting the data based on the `class` and the `fare` they have paid.

In [71]:
# Split fares into two quantile-based bins, so the cut point is the median fare.
fare = pd.qcut(data['fare'], q=2)
fare

0       (-0.001, 14.454]
1      (14.454, 512.329]
2       (-0.001, 14.454]
3      (14.454, 512.329]
4       (-0.001, 14.454]
             ...        
886     (-0.001, 14.454]
887    (14.454, 512.329]
888    (14.454, 512.329]
889    (14.454, 512.329]
890     (-0.001, 14.454]
Name: fare, Length: 891, dtype: category
Categories (2, interval[float64]): [(-0.001, 14.454] < (14.454, 512.329]]

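# Why use `qcut` instead of `cut`?
`cut` splits on the bin edges you give it (or into equal-width bins), while `qcut` picks the edges from quantiles so each bin holds roughly the same number of rows. Check this out...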

<img src='cut and qcut.png'>

In [74]:
# Two column levels: class on the outside, fare bin on the inside.
data.pivot_table(
    values='survived',
    index='sex',
    columns=['class', fare],
)

class,First,First,Second,Second,Third,Third
fare,"(-0.001, 14.454]","(14.454, 512.329]","(-0.001, 14.454]","(14.454, 512.329]","(-0.001, 14.454]","(14.454, 512.329]"
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,,0.968085,0.896552,0.93617,0.580247,0.396825
male,0.0,0.387931,0.111111,0.222222,0.119403,0.189873


##### Oh, let's change some order...

In [75]:
# Swap the column levels: fare bin on the outside, class on the inside.
data.pivot_table(
    values='survived',
    index='sex',
    columns=[fare, 'class'],
)

fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,,0.896552,0.580247,0.968085,0.93617,0.396825
male,0.0,0.111111,0.119403,0.387931,0.222222,0.189873


Looks right! And also add the age...

In [76]:
# Multi-level on both axes: rows by sex and age bin, columns by fare bin and class.
data.pivot_table(
    values='survived',
    index=['sex', age],
    columns=[fare, 'class'],
)

Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 40]",,0.9,0.454545,0.979167,0.925926,0.529412
female,"(40, 80]",,0.8,0.333333,0.961538,0.875,0.0
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 40]",0.0,0.078947,0.136054,0.52381,0.04,0.208333
male,"(40, 80]",,0.153846,0.068966,0.28,0.0,0.0


## When I want `more values` to compare...

In [88]:
# Passing a list to `values` aggregates several columns at once; each one
# becomes its own top-level column group in the result.
data.pivot_table(
    values=['survived', 'fare'],
    index='sex',
    columns='class',
)

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,0.968085,0.921053,0.5
male,67.226127,19.741782,12.661633,0.368852,0.157407,0.135447


### 

# `aggfunc` 

In [83]:
# Replace the default mean with a sum: this gives the number of survivors per cell.
data.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    aggfunc='sum',
)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,91,70,72
male,45,17,47


# Advanced `aggfunc`

In [90]:
# A dict maps each value column to its own aggregation; no `values` argument
# is passed here, and the result holds exactly the dict's key columns.
data.pivot_table(
    index='sex',
    columns='class',
    aggfunc={'survived': 'mean', 'fare': 'sum'},
)

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,9975.825,1669.7292,2321.1086,0.968085,0.921053,0.5
male,8201.5875,2132.1125,4393.5865,0.368852,0.157407,0.135447


##### When `aggfunc` is a dict you don't need to pass `values`: the result only shows the columns that appear as keys in `aggfunc`

In [91]:
# Deliberate demonstration of a pitfall: a dict literal cannot hold the same key
# twice. Python keeps the last entry, so this is equivalent to
# aggfunc={'survived': 'sum'} and the 'mean' request is silently dropped.
data.pivot_table(
    index='sex',
    columns='class',
    aggfunc={'survived': 'mean', 'survived': 'sum'},
)

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,91,70,72
male,45,17,47


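##### Repeating the same column name as a dict key doesn't work! A Python dict keeps only the last value for a duplicated key, so only `'sum'` was applied.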

In [106]:
# A list of functions applies every function to the value column(s).
# NOTE: 'survived' is listed twice in `values`, which is why each column shows
# up twice in the result; the list in `aggfunc` is what yields both sum and mean.
data.pivot_table(
    values=['survived', 'survived'],
    index='sex',
    columns='class',
    aggfunc=['sum', 'mean'],
)

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,survived,survived,survived,survived,survived,survived,survived,survived,survived,survived,survived,survived
class,First,Second,Third,First,Second,Third,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
female,91,70,72,91,70,72,0.968085,0.921053,0.5,0.968085,0.921053,0.5
male,45,17,47,45,17,47,0.368852,0.157407,0.135447,0.368852,0.157407,0.135447


### The same thing works this way!
I also discovered a new way to pass values to `aggfunc`: a list of functions!

###  

### Get the TOTAL column!!

In [111]:
# margins=True appends an "All" row and column holding the same aggregation
# (here the mean) computed over each whole row / column group.
data.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    margins=True,
)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


In [112]:
# margins_name replaces the default "All" label on the margin row and column.
data.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    margins=True,
    margins_name="Total Given by Aayush",
)

class,First,Second,Third,Total Given by Aayush
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
Total Given by Aayush,0.62963,0.472826,0.242363,0.383838


In [117]:
# Population standard deviation (ddof=0 is NumPy's default) of two hand-typed
# rates. NOTE(review): 0.74 and 0.18 look like rounded female/male values from
# the margins column above (0.742038, 0.188908) -- confirm the intent.
np.std([0.74, 0.18], ddof=0)

0.27999999999999997