In [1]:
# Data science imports
import pandas as pd
import numpy as np

# Options for pandas: cap the number of columns shown when a dataframe is displayed
pd.options.display.max_columns = 20

# Display all cell outputs: every top-level expression in a cell is echoed,
# not only the last one (later cells rely on this to show two results at once)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Interactive plotting: plotly graph objects plus cufflinks, which provides the
# `.iplot()` dataframe method used in the plotting cells below
import chart_studio.plotly as py
import plotly.graph_objs as go
import cufflinks
from plotly.offline import iplot
# Put cufflinks in offline mode so charts are drawn in the notebook
cufflinks.go_offline()

In [2]:
def ecdf(df, x, grouper=None):
    """
    Calculate empirical cumulative distribution function of a distribution

    Rows are sorted by `x` in ascending order and a new column `y` is added
    holding rank / n (the 1-based position of the row divided by the number
    of rows), so the largest value always has y == 1. Tied values receive
    consecutive, distinct `y` values because the rank is positional.

    When `grouper` is given, the ranking is done independently inside each
    group and the per-group results are concatenated in group order.

    The input dataframe is never modified; the original index labels are
    kept on the returned rows.

    :param df: dataframe
    :param x: string name of column
    :param grouper: string for column to groupby

    :return: new dataframe with the columns of `df` plus `y`; an empty
        dataframe (with a `y` column) when there is nothing to rank
    """
    def _with_percentiles(frame):
        # Sort a frame by `x` and attach the cumulative fraction as `y`.
        ordered = frame.sort_values(x, ascending=True)
        n = len(ordered)
        # For n == 0 the arange is empty, so the division never touches an element
        return ordered.assign(y=np.arange(1, n + 1, step=1) / n)

    if grouper is None:
        return _with_percentiles(df)

    pieces = [_with_percentiles(grouped_data)
              for _, grouped_data in df.groupby(grouper)]
    if not pieces:
        # pd.concat raises on an empty list; mirror the ungrouped behaviour
        # and hand back an empty frame that still has the `y` column.
        return _with_percentiles(df.iloc[0:0])
    return pd.concat(pieces)

In [3]:
# Load the wages table from a CSV file (relative path)
wages = pd.read_csv('data/us_state_wages.csv')
# Preview the first rows
wages.head()

Unnamed: 0,year,geo_name,geo,income
0,2013,Alabama,04000US01,43253.0
1,2013,Alaska,04000US02,70760.0
2,2013,Arizona,04000US04,49774.0
3,2013,Arkansas,04000US05,40768.0
4,2013,California,04000US06,61094.0


In [4]:
# Preview the last rows of the wages table
wages.tail()

Unnamed: 0,year,geo_name,geo,income
203,2016,Washington,04000US53,62848.0
204,2016,West Virginia,04000US54,42644.0
205,2016,Wisconsin,04000US55,54610.0
206,2016,Wyoming,04000US56,59143.0
207,2016,Puerto Rico,04000US72,19606.0


In [5]:
# ECDF of income over all rows at once (no grouping); `year` is cast to str
# before being passed as the `categories` column of the chart.
df = ecdf(wages, 'income').assign(year=lambda frame: frame['year'].astype(str))
df.iplot(
    x='income',
    y='y',
    mode='markers+lines',
    categories='year',
    hline=[0.5],
    xTitle='income',
    yTitle='percentile',
    title='ECDF of Wages in US states',
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



In [6]:
# ECDF of income computed separately within each year; `year` is cast to str
# before being passed as the `categories` column of the chart.
df = ecdf(wages, 'income', grouper='year').assign(
    year=lambda frame: frame['year'].astype(str))
df.iplot(
    x='income',
    y='y',
    mode='markers+lines',
    categories='year',
    xTitle='income',
    yTitle='percentile',
    title='ECDF of Wages in US states',
)

In [7]:
# Start from a fresh copy of the loaded data; `df` is reassigned by the cells that follow
df = wages.copy()

In [9]:
# Parameters of the annotated ECDF figure
x = 'income'            # column whose distribution is plotted
text = 'geo_name'       # column supplying the per-point text
grouper = None          # optional column: one trace per group when set
title_override = None   # explicit figure title; None uses the generated one

# `df` must already hold the data to plot; this adds the sorted order and the `y` column
df = ecdf(df, x, grouper=grouper)

# Summary statistics of the plotted column, reused by the annotations below
x_min, x_max, x_mean = df[x].min(), df[x].max(), df[x].mean()

if grouper is not None:
    # One trace per group, named after the group key
    data = [
        go.Scatter(
            x=grouped_data[x],
            y=grouped_data['y'],
            mode='markers+lines',
            marker=dict(opacity=0.6, line=dict(color='black', width=2)),
            text=grouped_data[text],
            name=group) for group, grouped_data in df.groupby(grouper)
    ]

else:
    # Single trace covering every row
    data = [
        go.Scatter(
            x=df[x],
            y=df['y'],
            mode='markers+lines',
            marker=dict(
                color='blue', opacity=0.6, line=dict(color='black', width=2)),
            text=df[text])
    ]

quantiles = [0.25, 0.5, 0.75]

# Horizontal dashed lines at each quantile level, spanning the full plot width
# (x is in paper coordinates, y in data coordinates)
shapes = [go.layout.Shape(type='line', x0=0, x1=1, y0=q, y1=q, xref='paper', yref='y',
                          line=dict(width=2.1, color='red', dash='dash')) for q in quantiles]

# Label each quantile line just above its left end. xref='paper' pins the label
# to the left edge of the plot like the line it describes; without it x=0 would
# be read as a data value on the x axis.
annotations = [go.layout.Annotation(x=0, xref='paper', y=q + 0.05, showarrow=False,
                                    xanchor='left', font=dict(size=14), xshift=20,
                                    text=f'{q}: {df[x].quantile(q):.2f}') for q in quantiles]

# Mark the minimum, maximum and mean of the plotted column
annotations.append(go.layout.Annotation(x=x_min, font=dict(size=14),
                                        y=0.05, text=f'Min: {x_min:.2f}'))
annotations.append(go.layout.Annotation(x=x_max, font=dict(size=14),
                                        y=1, text=f'Max: {x_max:.2f}'))
# The mean is rarely an observed value, so place its label at the `y` of the
# row whose value is closest to the mean
annotations.append(go.layout.Annotation(x=x_mean, font=dict(size=14),
                                        y=df.loc[(df[x] - x_mean).abs().idxmin(), 'y'],
                                        text=f'Mean: {x_mean:.2f}'))

# Human-readable column name: underscores become spaces before title-casing
# (e.g. 'median_income' -> 'Median Income')
x_nice = x.replace('_', ' ').title()
layout = go.Layout(
    annotations=annotations,
    shapes=shapes,
    xaxis=dict(title=x_nice),
    yaxis=dict(title='percentile'),
    title=f'ECDF of {x_nice}' if title_override is None else title_override)

figure = go.Figure(data=data, layout=layout)

In [10]:
# Render the assembled `figure` in the notebook
iplot(figure)

In [11]:
# Per-year income summary: quartiles first, then the mean (both are displayed)
income_by_year = df.groupby('year')['income']
income_by_year.quantile([0.25, 0.5, 0.75])
income_by_year.mean()

year      
2013  0.25    46658.75
      0.50    51757.50
      0.75    58985.25
2014  0.25    47100.50
      0.50    52302.50
      0.75    59942.25
2015  0.25    47422.50
      0.50    52601.00
      0.75    60653.50
2016  0.25    48770.00
      0.50    53977.50
      0.75    62518.50
Name: income, dtype: float64

year
2013    52878.230769
2014    53468.788462
2015    53957.519231
2016    55330.576923
Name: income, dtype: float64