# cut (binning) and category

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline

In [None]:
# Show the installed pandas version; the categorical casts further down are
# written in two spellings that depend on it.
pd.__version__

In [None]:
# Load the staff table: 'empID' becomes the index, ',' is treated as the
# thousands separator in numbers, and 'dob' / 'join_date' are parsed as dates.
df = pd.read_csv(
    'https://github.com/prasertcbs/tutorial/raw/master/staff.csv',
    parse_dates=['dob', 'join_date'],
    thousands=',',
    index_col='empID',
)
df

In [None]:
# Print the frame's summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# old way (before 0.21.0) was spelled
#     .astype("category", categories=salary_bucket, ordered=True)
# The `categories` / `ordered` keyword arguments of astype were deprecated in
# pandas 0.21 and removed in 0.25, so that call now raises TypeError.
# pd.Categorical accepts the same two arguments and works regardless of the
# pandas version.  The former .astype(str) round-trip is dropped: pd.cut
# already yields the labels themselves, so converting to strings first added
# nothing.
salary_bucket = ['0-15,000', '15,001-30,000', '30,001-50,000', '50,001-80,000', '80,001+']
df['salary_group'] = pd.Categorical(
    # right=True makes every bin right-closed: (0, 15000], (15000, 30000], ...
    pd.cut(df.salary,
           bins=[0, 15000, 30000, 50000, 80000, np.inf],
           labels=salary_bucket,
           right=True),
    categories=salary_bucket,
    ordered=True,
)
# Sorting follows the declared category order, not alphabetical order.
df.sort_values(by='salary_group')

In [None]:
# new way in pandas 0.21.0: describe the categories and their order with a
# CategoricalDtype object and pass that to .astype().
from pandas.api.types import CategoricalDtype

salary_bucket = [
    '0-15,000',
    '15,001-30,000',
    '30,001-50,000',
    '50,001-80,000',
    '80,001+',
]
df['salary_group'] = (
    pd.cut(
        df.salary,
        bins=[0, 15000, 30000, 50000, 80000, np.inf],
        labels=salary_bucket,
        right=True,
    )
    .astype(CategoricalDtype(categories=salary_bucket, ordered=True))
)
# Rows come out in bucket order because the dtype is ordered.
df.sort_values('salary_group')

In [None]:
# Display the column to inspect its categorical dtype and category order.
df['salary_group']

In [None]:
# Histogram of the salary buckets, drawn without grid lines.
df['salary_group'].hist(grid=False)

In [None]:
# Year component of today's date.
pd.to_datetime('today').year

In [None]:
# Age in completed years.  A plain year difference is one too high for anyone
# whose birthday has not yet occurred this year, so subtract 1 for those rows.
today = pd.to_datetime('today')
# True where this year's birthday (month, day) is still ahead of today.
birthday_pending = (df.dob.dt.month > today.month) | (
    (df.dob.dt.month == today.month) & (df.dob.dt.day > today.day)
)
df['age'] = today.year - df.dob.dt.year - birthday_pending.astype(int)
df

In [None]:
# Legacy spelling was .astype("category", categories=age_cat, ordered=True);
# those astype keyword arguments were deprecated in pandas 0.21 and removed in
# 0.25 (TypeError today).  pd.Categorical takes the same arguments and works
# regardless of the pandas version.  np.inf replaces np.Inf, an alias that
# NumPy 2.0 removed.
age_cat = ["18-25", "26-30", "31-40", "41-50", "50+"]
df['age_group'] = pd.Categorical(
    # Right-closed bins; include_lowest=True lets the first bin contain 18.
    pd.cut(df.age,
           bins=[18, 25, 30, 40, 50, np.inf],
           labels=age_cat,
           right=True,
           include_lowest=True),
    categories=age_cat,
    ordered=True,
)
df

In [None]:
# recommended way: cast with a CategoricalDtype so the age groups are ordered.
# The open-ended last bin uses np.inf; the np.Inf alias was removed in
# NumPy 2.0 and raises AttributeError there.
age_cat = ["18-25", "26-30", "31-40", "41-50", "50+"]
# Right-closed bins; include_lowest=True lets the first bin contain 18.
df['age_group'] = pd.cut(df.age,
         bins=[18, 25, 30, 40, 50, np.inf],
         labels=age_cat,
         right=True,
         include_lowest=True).astype(CategoricalDtype(age_cat, ordered=True))
df

In [None]:
# Display the column to inspect its categorical dtype and category order.
df['age_group']

In [None]:
# Histogram of the age groups without grid lines; the trailing ';' keeps the
# notebook from echoing the returned Axes object.
df['age_group'].hist(grid=False);

In [None]:
# Embed matplotlib's named-colors page in the notebook output as an
# 800x800 frame.
from IPython.display import IFrame
IFrame("https://matplotlib.org/examples/color/named_colors.html", width=800, height=800)

In [None]:
# Colour names come from https://matplotlib.org/examples/color/named_colors.html
# Count rows per (age_group, sex) pair and draw the counts as horizontal
# bars, one colour per crosstab column.
pd.crosstab(index=df['age_group'], columns=df['sex']).plot.barh(
    color=['pink', 'deepskyblue']
)

In [None]:
# Embed the careerplanner.com "Generations" article in the notebook output
# as an 800x800 frame.
from IPython.display import IFrame
IFrame("https://www.careerplanner.com/Career-Articles/Generations.cfm", width=800, height=800)

In [None]:
# Birth year of every row, taken from the parsed 'dob' dates.
df['dob'].dt.year

In [None]:
def gen(birthdate):
    '''Return the generation label for the year of *birthdate*.

    Parameters
    ----------
    birthdate : object with a ``year`` attribute
        For example ``datetime.date``, ``datetime.datetime`` or a pandas
        ``Timestamp``.

    Returns
    -------
    str or None
        'Baby Boomer' (1946-1964), 'Gen X' (1965-1979), 'Gen Y' (1980-1994),
        'Gen Z' (1995-2012) or 'Gen Alpha' (2013 and later).  ``None`` when
        the year is earlier than 1946 or is not comparable (a NaN year fails
        every range test), instead of mislabelling it as 'Gen Alpha'.
    '''
    y = birthdate.year
    if 1946 <= y <= 1964:
        return 'Baby Boomer'
    elif 1965 <= y <= 1979:
        return 'Gen X'
    elif 1980 <= y <= 1994:
        return 'Gen Y'
    elif 1995 <= y <= 2012:
        return 'Gen Z'
    elif y >= 2013:
        return 'Gen Alpha'
    # Not in any listed range: no label rather than a wrong one.
    return None

In [None]:
# Legacy spelling was .astype('category', categories=[...], ordered=True);
# those astype keyword arguments were deprecated in pandas 0.21 and removed in
# 0.25 (TypeError today).  pd.Categorical takes the same arguments and works
# regardless of the pandas version.  gen is passed to map directly; wrapping
# it in a lambda added nothing.
df['gen'] = pd.Categorical(
    df.dob.map(gen),
    categories=['Baby Boomer', 'Gen X', 'Gen Y', 'Gen Z', 'Gen Alpha'],
    ordered=True,
)
df

In [None]:
# Label every birth date with its generation, then cast to an ordered
# categorical so sorting follows oldest-to-youngest generation order.
df['gen'] = df['dob'].map(gen).astype(
    CategoricalDtype(
        ['Baby Boomer', 'Gen X', 'Gen Y', 'Gen Z', 'Gen Alpha'],
        ordered=True,
    )
)
df

In [None]:
# NOTE(review): this cell repeats the preceding cell's assignment verbatim;
# running it again just overwrites df['gen'] with the same values.
df['gen']=df.dob.map(lambda d: gen(d)).astype(CategoricalDtype( 
                                              categories=['Baby Boomer', 'Gen X', 'Gen Y', 'Gen Z', 'Gen Alpha'], 
                                              ordered=True))
df

In [None]:
# Show only the age-related columns, ordered by generation category.
df.loc[:, ['age', 'age_group', 'gen']].sort_values('gen')

In [None]:
# Histogram of the generation labels without grid lines; the trailing ';'
# keeps the notebook from echoing the returned Axes object.
df['gen'].hist(grid=False);