## Empirical Cumulative Distribution Function

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

## PROS

#### we can see how each observation is affecting the cumulative distribution

#### no smoothing  or binning

#### easy to compare the distributions of different categories


## CONS

#### we can't easily see the central tendency or spread of these distributions; the mean, variance and standard deviation are hard to detect
#### It's also hard to detect when we have a bi-modal distribution



In [2]:
# Load seaborn's bundled `tips` example dataset and preview its first rows.
tips = sns.load_dataset('tips')
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Smallest and largest tip: the x-range the ECDF will span.
tips.tip.agg(['min', 'max'])

min     1.0
max    10.0
Name: tip, dtype: float64

#### The chart starts at `min` and stops at `max`

#### The Y axis [the proportion] shows us what percentage of the data we have seen so far

In [5]:
# Basic ECDF of the tip amounts, drawn with the fivethirtyeight style.
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, x='tip')

plt.show()

<img src='./plots/ecdf_plot-1.png'>

### How to create an ecdf plot ?

In [6]:
# Bar chart of how often each distinct tip amount occurs.
tips['tip'].value_counts().plot.bar(figsize=(18, 4))
plt.show()

<img src='./plots/ecdf_plot-2.png'>

In [7]:
# Step 1: count the occurrences of every distinct tip amount.
tips['tip'].value_counts().head()

tip
2.0    33
3.0    23
4.0    12
5.0    10
2.5    10
Name: count, dtype: int64

In [8]:
# Step 2: turn the counts into proportions of all observations.
tips['tip'].value_counts(normalize=True).head()

tip
2.0    0.135246
3.0    0.094262
4.0    0.049180
5.0    0.040984
2.5    0.040984
Name: proportion, dtype: float64

In [10]:
# Step 3: accumulate the proportions.
# NOTE: value_counts() orders rows by frequency, so this running total follows
# frequency order (2.0, 3.0, 4.0, ...) rather than tip size; it only becomes an
# ECDF once the rows are sorted by tip value first.
tips['tip'].value_counts(normalize=True).cumsum().head()

tip
2.0    0.135246
3.0    0.229508
4.0    0.278689
5.0    0.319672
2.5    0.360656
Name: proportion, dtype: float64

In [23]:
# Step 4: sort the proportions by tip value before accumulating; plotting the
# running total as a line gives the hand-built ECDF.
tips['tip'].value_counts(normalize=True).sort_index().cumsum().plot.line()
plt.show()

<img src='./plots/ecdf_plot-3.png'>

#### What percentage of data are above or below a certain value

* from the `ecdf plot` we can see that by the time we reach x = \$4, we have covered about 80% of the data
* the remaining 20% of tips are greater than \$4

In [26]:
# ECDF of tips with a red vertical reference line at a tip of 4.
with plt.style.context('fivethirtyeight'):
    ax = sns.ecdfplot(tips, x='tip')
    ax.axvline(x=4, color='r')
plt.show()

<img src='./plots/ecdf_plot-5.png'>

### plot on the : `y axis`

In [29]:
# Same ECDF, but with the tip values on the y axis (proportion on x).
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, y='tip')

plt.show()

<img src='./plots/ecdf_plot-6.png'>

#### 50th percentile or median

In [30]:
# Median tip, i.e. the value at which the ECDF reaches a proportion of 0.5.
tips['tip'].median()

2.9

In [31]:
# Tips are on the y axis here, so the proportion runs along x: a vertical line
# at x=0.5 crosses the curve at the median tip.
with plt.style.context('fivethirtyeight'):
    ax = sns.ecdfplot(tips, y='tip')
    ax.axvline(x=0.5, c='y')

plt.show()

<img src='./plots/ecdf_plot-7.png'>

In [32]:
import numpy as np

# Quartiles of the tip amounts, expressed as percentiles (0-100 scale).
np.percentile(tips['tip'], [25, 50, 75])

array([2.    , 2.9   , 3.5625])

In [33]:
# The same quartiles, expressed as quantiles (0-1 scale).
np.quantile(tips['tip'], [0.25, 0.5, 0.75])

array([2.    , 2.9   , 3.5625])

### Compare the distribution of different categories within your data

* Dinner tips tend to be a little bit larger than lunch tips
* lots of \$2, \$4 and \$5 tips during lunch -- straight vertical lines in the plot

In [35]:
# One ECDF curve per meal time (the `time` column).
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, x='tip', hue='time')

plt.show()

<img src='./plots/ecdf_plot-8.png'>

### tip distribution based on the day

* we can see that on Saturday the tips are high

In [37]:
# One ECDF curve per day of the week (the `day` column).
# This cell was fully commented out, so re-running the notebook never
# regenerated the per-day figure that the surrounding text discusses.
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(data=tips, x='tip', hue='day');


plt.show();

<img src='./plots/ecdf_plot-9.png'>

In [108]:
# Number of observations recorded for each day.
tips['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [38]:
# Range of the tips recorded on Friday.
tips.query('day=="Fri"').tip.agg(['min', 'max'])

min    1.00
max    4.73
Name: tip, dtype: float64

### CAUTION : Y axis = proportion
#### Suppose there is a category that is smaller than the others
#### this still scales all the categories to the range `[0, 1]` in terms of proportion

#### But in general, when you have several different groups you want to compare and the distributions are all stacked on top of each other, the ECDF is great for showing this relationship, because here we have lines instead of boxes and bins

### Count stats

In [40]:
# Per-day ECDF with stat='count', so the y axis shows counts instead of
# proportions.
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, x='tip', hue='day', stat='count')

plt.show()

<img src='./plots/ecdf_plot-10.png'>

### Count tells us how many obs have we seen so far

* we have seen about 200 tips by the time we reach \$4 tips
* or we can say approx. 200 customers tip \$4 or less

In [42]:
# ECDF over all tips with stat='count' on the y axis.
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, x='tip', stat='count')

plt.show()

<img src='./plots/ecdf_plot-11.png'>

### `Lunch vs Dinner`

In [43]:
# Most frequent tip amounts, for reference against the histograms below.
tips['tip'].value_counts().head()

tip
2.0    33
3.0    23
4.0    12
5.0    10
2.5    10
Name: count, dtype: int64

In [44]:
# Histogram (20 bins) with a KDE overlay, split by meal time.
with plt.style.context('fivethirtyeight'):
    sns.histplot(tips, x='tip', hue='time', bins=20, kde=True)

plt.show()

<img src='./plots/ecdf_plot-12.png'>

In [45]:
# Same histogram, drawn with element='poly' instead of the default bars.
with plt.style.context('fivethirtyeight'):
    sns.histplot(tips, x='tip', hue='time', bins=20, kde=True, element='poly')

plt.show()

<img src='./plots/ecdf_plot-13.png'>

### MODES

In [46]:
# The most common tip amounts; these are the modes marked in the next plot.
tips['tip'].value_counts().head()

tip
2.0    33
3.0    23
4.0    12
5.0    10
2.5    10
Name: count, dtype: int64

In [47]:
# Complementary ECDF with a faint vertical marker at each of the four most
# common tip amounts.
mode_markers = {2: 'g', 3: 'r', 4: 'y', 5: 'b'}  # tip value -> line colour

with plt.style.context('fivethirtyeight'):
    ax = sns.ecdfplot(tips, x='tip', complementary=True)
    for tip_value, colour in mode_markers.items():
        ax.axvline(x=tip_value, c=colour, alpha=0.3)
    ax.set_title('Modes in the data')

plt.show()

<img src='./plots/ecdf_plot-14.png'>

#### what is the rank of \$4

* `complementary=True` gives us the complete opposite view
* we start with the max value and then count backwards

the cool thing about `stat='count', complementary=True` is that we are ranking the data
* the best tip = \$10 --- so rank = 1
* we move towards the min [  R to L  ]
 
so here you can see the rank of \$4 is around 50 


In [49]:
# Complementary count ECDF with a reference line at a tip of 4.
with plt.style.context('fivethirtyeight'):
    ax = sns.ecdfplot(tips, x='tip', stat='count', complementary=True)
    ax.axvline(x=4, c='r', alpha=0.4)

plt.show()

<img src='./plots/ecdf_plot-15.png'>

### `weights`

### 50% of tip money comes from tips of \$3.27 or less

In [93]:
# Smallest tips first; the original row labels are kept as the index.
tips['tip'].sort_values().head()

67     1.00
236    1.00
92     1.00
111    1.00
0      1.01
Name: tip, dtype: float64

In [92]:
import pandas as pd

# Find the tip amount at which half of all tip money has been accumulated.
# Sort once and reuse the result instead of sorting the column twice.
sorted_tips = tips.tip.sort_values()

# `tips` holds the sorted amounts positionally; `cumsum` is the running total
# of tip money and supplies the frame's index (the original row labels).
sample = pd.DataFrame(data={'tips': sorted_tips.values, 'cumsum': sorted_tips.cumsum()})

# Share of total tip money reached at each row. Series.max() is vectorised and
# NaN-aware, unlike the builtin max(), which iterates element by element.
sample['weight'] = sample['cumsum'] / sample['cumsum'].max()

# First row at which at least 50% of the tip money has been accumulated.
sample.query('weight >= 0.5').head(1)

Unnamed: 0,tips,cumsum,weight
34,3.27,368.56,0.503786


In [95]:
# ECDF weighted by the tip amount itself, with guides at proportion 0.5 and
# at the tip value 3.27 found in the table above.
with plt.style.context('fivethirtyeight'):
    ax = sns.ecdfplot(tips, x='tip', weights='tip')
    ax.axhline(y=0.5, c='r', alpha=0.4)
    ax.axvline(x=3.27, c='g', alpha=0.4)

plt.show()

<img src='./plots/ecdf_plot-16.png'>

### Styling

#### `palette='winter'`

In [97]:
# Per-day ECDF curves coloured with the 'winter' palette.
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, x='tip', hue='day', palette='winter')

plt.show()

<img src='./plots/ecdf_plot-17.png'>

#### `linewidth` (`lw`)

In [99]:
# Per-day ECDF curves in the 'summer' palette, with lw=2 passed through to
# the line artists.
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(tips, x='tip', hue='day', palette='summer', lw=2)

plt.show()

<img src='./plots/ecdf_plot-18.png'>

In [146]:
# Keep the object returned by ecdfplot and inspect its type.
plot = sns.ecdfplot(tips, x='tip')
type(plot)

matplotlib.axes._subplots.AxesSubplot

In [100]:
# Per-day ECDF with full day names and a title in the legend.
#
# Calling ax.legend([...]) with a bare list of labels pairs them with the
# axes' artists in the order they were drawn, which is not tied to the hue
# order, so curves can end up with the wrong day name. Instead, relabel the
# legend seaborn already built by mapping each existing entry to its full name.
day_labels = {'Thur': 'Thursday', 'Fri': 'Friday', 'Sat': 'Saturday', 'Sun': 'Sunday'}

with plt.style.context('fivethirtyeight'):
    ax = sns.ecdfplot(data=tips, x='tip', palette='summer', hue='day', lw=2)
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Days of week')
        for entry in legend.get_texts():
            # Fall back to the existing text for any level not in the mapping.
            entry.set_text(day_labels.get(entry.get_text(), entry.get_text()))

plt.show()

<img src='./plots/ecdf_plot-19.png'>

In [76]:
# Reminder of the dataset's columns before switching to histplot.
tips.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [102]:
# Cumulative histogram of tips: a binned approximation of the count ECDF.
sns.histplot(tips, x='tip', cumulative=True)
plt.show()

<img src='./plots/ecdf_plot-20.png'>

In [103]:
# Same cumulative histogram with fill=False.
sns.histplot(tips, x='tip', cumulative=True, fill=False)
plt.show()

<img src='./plots/ecdf_plot-21.png'>

In [105]:
# Unfilled cumulative histogram drawn with element='step'.
sns.histplot(tips, x='tip', element='step', fill=False, cumulative=True)
plt.show()

<img src='./plots/ecdf_plot-22.png'>

In [107]:
# Step-style cumulative histogram with stat='proportion' on the y axis.
sns.histplot(tips, x='tip', stat='proportion', element='step', fill=False, cumulative=True)
plt.show()

<img src='./plots/ecdf_plot-23.png'>

In [108]:
# Cumulative proportion histogram, split by day.
sns.histplot(tips, x='tip', hue='day', stat='proportion', element='step', fill=False, cumulative=True)
plt.show()

<img src='./plots/ecdf_plot-24.png'>

In [109]:
# Cumulative count histogram, split by day.
sns.histplot(tips, x='tip', hue='day', stat='count', element='step', fill=False, cumulative=True)
plt.show()

<img src='./plots/ecdf_plot-25.png'>

In [131]:
# Number of recorded tips per day.
tips.groupby('day')['tip'].count()

day
Thur    62
Fri     19
Sat     87
Sun     76
Name: tip, dtype: int64

In [138]:
# Plot every tip against its row index, using the day name itself as the marker.
plt.figure(figsize=(15, 8))
for day_name, day_tips in tips.groupby(by='day')['tip']:
    # day_name is the group key (the day label); day_tips is that day's Series of tips
    plt.plot(day_tips, c='k', alpha=0.5, linestyle='', marker=f'${day_name}$', markersize=15)

plt.show()

<img src='./plots/ecdf_plot-26.png'>

In [69]:
# Index the frame by day and pull out the `tip` column for the Sunday rows.
tips.set_index('day').loc['Sun', ['tip']].head()

Unnamed: 0_level_0,tip
day,Unnamed: 1_level_1
Sun,1.01
Sun,1.66
Sun,3.5
Sun,3.31
Sun,3.61


In [148]:
# Three views of the Sunday tips side by side: density, cumulative density
# and histogram. The Sunday subset is selected once and shared by all panels.
sunday_tips = tips.set_index(keys='day')[['tip']].loc['Sun']

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(14, 4))

sunday_tips.plot(kind='kde', title='tip on sunday', ax=ax1)

sns.kdeplot(sunday_tips, cumulative=True, ax=ax2)
ax2.set(title="Cumulative")

sns.histplot(sunday_tips, ax=ax3)
ax3.set(title="Histogram")

plt.tight_layout()
plt.show()

<img src='./plots/ecdf_plot-27.png'>