# Understanding Categories

In [1]:
import pandas as pd

In [4]:
# Read ../data/adult.csv into a DataFrame and preview its first three rows.
df_adult = pd.read_csv(r'../data/adult.csv')
df_adult.head(3)

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [5]:
# Per-column non-null counts and dtypes, plus the frame's (shallow) memory usage.
df_adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              32561 non-null  int64 
 1   Workclass        32561 non-null  object
 2   fnlgwt           32561 non-null  int64 
 3   Education        32561 non-null  object
 4   Education Num    32561 non-null  int64 
 5   Marital Status   32561 non-null  object
 6   Occupation       32561 non-null  object
 7   Relationship     32561 non-null  object
 8   Race             32561 non-null  object
 9   Sex              32561 non-null  object
 10  Capital Gain     32561 non-null  int64 
 11  Capital Loss     32561 non-null  int64 
 12  Hours/Week       32561 non-null  int64 
 13  Country          32561 non-null  object
 14  Above/Below 50k  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# For an object-dtype column describe() reports count, unique, top and freq.
df_adult['Marital Status'].describe()

count                   32561
unique                      7
top        Married-civ-spouse
freq                    14976
Name: Marital Status, dtype: object

In [7]:
# Row count per marital-status label, most frequent first.
df_adult['Marital Status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: Marital Status, dtype: int64

In [14]:
# Share of rows per marital status, reshaped into a two-column frame.
# NOTE: despite the '%age' label the values are fractions in [0, 1]
# (normalize=True), not percentages.
(
    df_adult['Marital Status']
    .value_counts(normalize=True)
    .rename_axis('Marital_Status')
    .reset_index(name='%age')
)

Unnamed: 0,Marital_Status,%age
0,Married-civ-spouse,0.459937
1,Never-married,0.328092
2,Divorced,0.136452
3,Separated,0.031479
4,Widowed,0.030497
5,Married-spouse-absent,0.012837
6,Married-AF-spouse,0.000706


In [15]:
# dtype of every column in the frame.
df_adult.dtypes

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object

In [16]:
# dtype of a single column; dtype('O') is the generic object dtype.
df_adult['Marital Status'].dtype

dtype('O')

In [17]:
# Add a categorical copy of the column under a new name (underscore instead of
# space), leaving the original object column in place for comparison.
# The category labels keep the leading space present in the raw values.
df_adult['Marital_Status'] = df_adult['Marital Status'].astype('category')
df_adult['Marital_Status'].dtype

CategoricalDtype(categories=[' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
                  ' Married-spouse-absent', ' Never-married', ' Separated',
                  ' Widowed'],
, ordered=False)

In [18]:
# Size of the object column's underlying array: 32561 rows x 8 bytes = 260488.
# This is one 8-byte reference per row and does not include the string
# contents themselves; memory_usage(deep=True) would be needed for that.
df_adult['Marital Status'].nbytes

260488

In [19]:
# Size of the categorical column: one 1-byte code per row plus the
# 7-entry categories index (32561 + 7 x 8 = 32617).
df_adult['Marital_Status'].nbytes

32617

# Creating Categorical Series

In [20]:
# Raw labels, then the same labels as a categorical Series. The categories are
# inferred from the data and carry no order.
cat_data = ['A', 'A', 'C', 'A', 'B', 'A', 'C']
cat_series1 = pd.Series(cat_data).astype('category')
cat_series1

0    A
1    A
2    C
3    A
4    B
5    A
6    C
dtype: category
Categories (3, object): ['A', 'B', 'C']

In [23]:
# min()/max() are undefined for an unordered categorical, so pandas raises
# TypeError. The error is the point of this cell: catch it and print the
# message so that Restart & Run All does not stop here.
try:
    cat_series1.min()
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Categorical is not ordered for operation min
you can use .as_ordered() to change the Categorical to an ordered one


In [21]:
# Same labels with an explicit category order B < C < A.
# Note: despite its name this is a bare pd.Categorical, not a Series.
cat_series2 = pd.Categorical(
    cat_data,
    categories=['B', 'C', 'A'],
    ordered=True,
)
cat_series2

['A', 'A', 'C', 'A', 'B', 'A', 'C']
Categories (3, object): ['B' < 'C' < 'A']

In [22]:
# With an explicit order min() is defined; it returns the lowest-ranked
# category present ('B' under B < C < A), not the alphabetical minimum.
cat_series2.min()

'B'

# Reading Data with Explicit Column dtypes

In [24]:
# Re-read the CSV with default dtypes. Rebinding df_adult discards the
# Marital_Status column added earlier in the notebook.
df_adult = pd.read_csv(r'../data/adult.csv')
df_adult.head(3)

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [25]:
# Baseline dtypes and memory usage before any column is read as 'category'.
df_adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              32561 non-null  int64 
 1   Workclass        32561 non-null  object
 2   fnlgwt           32561 non-null  int64 
 3   Education        32561 non-null  object
 4   Education Num    32561 non-null  int64 
 5   Marital Status   32561 non-null  object
 6   Occupation       32561 non-null  object
 7   Relationship     32561 non-null  object
 8   Race             32561 non-null  object
 9   Sex              32561 non-null  object
 10  Capital Gain     32561 non-null  int64 
 11  Capital Loss     32561 non-null  int64 
 12  Hours/Week       32561 non-null  int64 
 13  Country          32561 non-null  object
 14  Above/Below 50k  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [27]:
# Column -> dtype overrides handed to read_csv so these text columns are parsed
# directly as 'category' instead of object.
adult_dtypes = dict.fromkeys(
    ['Workclass', 'Education', 'Relationship', 'Above/Below 50k'],
    'category',
)
df_adult = pd.read_csv(r'../data/adult.csv', dtype=adult_dtypes)
# Compare the dtype column and reported memory usage with the default read.
df_adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Age              32561 non-null  int64   
 1   Workclass        32561 non-null  category
 2   fnlgwt           32561 non-null  int64   
 3   Education        32561 non-null  category
 4   Education Num    32561 non-null  int64   
 5   Marital Status   32561 non-null  object  
 6   Occupation       32561 non-null  object  
 7   Relationship     32561 non-null  category
 8   Race             32561 non-null  object  
 9   Sex              32561 non-null  object  
 10  Capital Gain     32561 non-null  int64   
 11  Capital Loss     32561 non-null  int64   
 12  Hours/Week       32561 non-null  int64   
 13  Country          32561 non-null  object  
 14  Above/Below 50k  32561 non-null  category
dtypes: category(4), int64(6), object(5)
memory usage: 2.9+ MB


# Grouping Data by Categories

In [29]:
# Keep only the rows in the lower income bracket. The label carries a leading
# space because that is how the value appears in the data.
df_adult_lte_50K = df_adult.loc[
    df_adult['Above/Below 50k'] == ' <=50K'
]
# Only one value remains, yet the dtype still lists both categories:
# filtering rows does not drop unused categories.
df_adult_lte_50K['Above/Below 50k'].unique()

[' <=50K']
Categories (2, object): [' <=50K', ' >50K']

In [30]:
# Boolean filtering returns an ordinary DataFrame.
type(df_adult_lte_50K)

pandas.core.frame.DataFrame

In [32]:
# Group rows by income bracket. The key is a categorical column, so `observed`
# is passed explicitly: False keeps the long-standing default (every declared
# category forms a group) and avoids the FutureWarning that newer pandas
# emits when the argument is left implicit.
groupby_Income = df_adult.groupby(by=['Above/Below 50k'], observed=False)
# groupby() itself is lazy: it returns a DataFrameGroupBy, not a DataFrame.
type(groupby_Income)

pandas.core.groupby.generic.DataFrameGroupBy

In [40]:
# Number of non-null Country values within each income group.
groupby_Income['Country'].count()

Above/Below 50k
 <=50K    24720
 >50K      7841
Name: Country, dtype: int64

In [41]:
# Number of rows in each income group.
groupby_Income.size()

Above/Below 50k
 <=50K    24720
 >50K      7841
dtype: int64

In [43]:
# Group sizes again, this time via a single selected column; the result is
# named after that column.
groupby_Income['Education'].size()

Above/Below 50k
 <=50K    24720
 >50K      7841
Name: Education, dtype: int64

In [44]:
# Group by sex, then income bracket. One of the keys is categorical, so
# `observed=False` is stated explicitly to keep the long-standing default and
# avoid the FutureWarning newer pandas emits when it is omitted.
gb_sex_income = df_adult.groupby(by=['Sex', 'Above/Below 50k'], observed=False)
# Row count for every (sex, income) combination.
gb_sex_income.size()

Sex      Above/Below 50k
 Female   <=50K              9592
          >50K               1179
 Male     <=50K             15128
          >50K               6662
dtype: int64

In [46]:
# Mean of every numeric column per (sex, income) group; numeric_only=True
# skips the text/categorical columns.
gb_sex_income.mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,fnlgwt,Education Num,Capital Gain,Capital Loss,Hours/Week
Sex,Above/Below 50k,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,<=50K,36.210801,185999.381359,9.820475,121.986134,47.36447,35.916701
Female,>50K,42.12553,183687.406277,11.787108,4200.389313,173.648855,40.426633
Male,<=50K,37.147012,193093.609268,9.452142,165.723823,56.806782,40.693879
Male,>50K,44.625788,188769.101321,11.580606,3971.765836,198.780396,46.366106


In [47]:
# Group by education level, then income bracket. Both keys are categorical, so
# `observed` matters here: False keeps every (education, income) combination
# in the result even if it has no rows, which is the long-standing default.
# Stating it explicitly avoids the FutureWarning in newer pandas and keeps the
# result stable if that default changes.
groupby_list = ['Education', 'Above/Below 50k']
gb = df_adult.groupby(by=groupby_list, observed=False)
# Average weekly hours per (education, income) pair.
gb['Hours/Week'].mean()

Education      Above/Below 50k
 10th           <=50K             36.574053
                >50K              43.774194
 11th           <=50K             33.322870
                >50K              45.133333
 12th           <=50K             35.035000
                >50K              44.818182
 1st-4th        <=50K             37.864198
                >50K              48.833333
 5th-6th        <=50K             38.539432
                >50K              46.000000
 7th-8th        <=50K             38.830033
                >50K              47.500000
 9th            <=50K             37.667351
                >50K              44.851852
 Assoc-acdm     <=50K             39.264339
                >50K              44.256604
 Assoc-voc      <=50K             40.817826
                >50K              43.853186
 Bachelors      <=50K             40.586152
                >50K              45.475462
 Doctorate      <=50K             45.429907
                >50K              47.513072
 

In [54]:
# Several aggregations of one column at once; the result has two-level
# column labels: ('Hours/Week', 'mean') and ('Hours/Week', 'count').
gb.agg({'Hours/Week':['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Hours/Week,Hours/Week
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Education,Above/Below 50k,Unnamed: 2_level_2,Unnamed: 3_level_2
10th,<=50K,36.574053,871
10th,>50K,43.774194,62
11th,<=50K,33.32287,1115
11th,>50K,45.133333,60
12th,<=50K,35.035,400
12th,>50K,44.818182,33
1st-4th,<=50K,37.864198,162
1st-4th,>50K,48.833333,6
5th-6th,<=50K,38.539432,317
5th-6th,>50K,46.0,16


In [81]:
# https://towardsdatascience.com/4-useful-tips-of-pandas-groupby-3744eefb1852
# Mean hours and row count per (Education, income) pair ...
gb_data = gb.agg({'Hours/Week':['mean', 'count']})
# ... plus each count as a percentage (2 d.p.) of its Education level's total.
# transform('sum') broadcasts the per-Education total back onto every row, so
# the share is a plain vectorised division instead of a Python lambda run once
# per group through apply().
gb_data[('Hours/Week', 'count_%')] = (
    gb_data[('Hours/Week', 'count')]
    .div(
        gb_data[('Hours/Week', 'count')]
        .groupby(level=0, observed=False)
        .transform('sum')
    )
    .mul(100)
    .round(2)
)
gb_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Hours/Week,Hours/Week,Hours/Week
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,count_%
Education,Above/Below 50k,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
10th,<=50K,36.574053,871,93.35
10th,>50K,43.774194,62,6.65
11th,<=50K,33.32287,1115,94.89
11th,>50K,45.133333,60,5.11
12th,<=50K,35.035,400,92.38
12th,>50K,44.818182,33,7.62
1st-4th,<=50K,37.864198,162,96.43
1st-4th,>50K,48.833333,6,3.57
5th-6th,<=50K,38.539432,317,95.2
5th-6th,>50K,46.0,16,4.8


# Set Category Variables

In [106]:
# Read ../data/ShelterDogs.csv into a DataFrame and preview the first rows.
df_dogs = pd.read_csv(r'../data/ShelterDogs.csv')
df_dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,
3,23795,,1.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.0,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,


In [107]:
# Column dtypes and non-null counts; several columns have many missing values.
df_dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2937 entries, 0 to 2936
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 2937 non-null   int64  
 1   name               2845 non-null   object 
 2   age                2937 non-null   float64
 3   sex                2937 non-null   object 
 4   breed              2937 non-null   object 
 5   date_found         2937 non-null   object 
 6   adoptable_from     2937 non-null   object 
 7   posted             2937 non-null   object 
 8   color              2937 non-null   object 
 9   coat               2937 non-null   object 
 10  size               2937 non-null   object 
 11  neutered           1852 non-null   object 
 12  housebroken        460 non-null    object 
 13  likes_people       1999 non-null   object 
 14  likes_children     1219 non-null   object 
 15  get_along_males    1633 non-null   object 
 16  get_along_females  1673 

In [108]:
# Convert the coat column to category in place, then count each label
# (dropna=False would also list missing values if there were any).
df_dogs['coat'] = df_dogs['coat'].astype('category')
df_dogs['coat'].value_counts(dropna=False)

short         1972
medium         565
wirehaired     220
long           180
Name: coat, dtype: int64

In [109]:
# Restrict the categories to three coat lengths. Values whose label is not in
# the new list ('wirehaired' here) become NaN.
df_dogs['new_coat'] = df_dogs['coat'].cat.set_categories(
    new_categories=['short', 'medium', 'long'],
)
df_dogs['new_coat'].value_counts(dropna=False)

short     1972
medium     565
NaN        220
long       180
Name: new_coat, dtype: int64

In [110]:
# Same restriction, but additionally declare the order short < medium < long.
df_dogs['new_coat'] = df_dogs['coat'].cat.set_categories(
    new_categories=['short', 'medium', 'long'],
    ordered=True,
)
df_dogs['new_coat']

0        short
1        short
2        short
3       medium
4        short
         ...  
2932     short
2933     short
2934     short
2935    medium
2936     short
Name: new_coat, Length: 2937, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']

In [111]:
# Count each answer, including missing values (dropna=False).
df_dogs['likes_people'].value_counts(dropna=False)

yes    1991
NaN     938
no        8
Name: likes_people, dtype: int64

In [112]:
# Convert to category and register two extra labels. Adding categories does
# not change any values, so the new labels appear with a count of 0.
# NOTE(review): add_categories appears to reject labels that already exist,
# so re-running this cell without reloading df_dogs likely raises - confirm.
df_dogs['likes_people'] = (
    df_dogs['likes_people']
    .astype('category')
    .cat.add_categories(new_categories=['did not check', 'could not tell'])
)
df_dogs['likes_people'].value_counts(dropna=False)

yes               1991
NaN                938
no                   8
did not check        0
could not tell       0
Name: likes_people, dtype: int64

# Updating Categories

In [113]:
# Bucket the numeric age column into 10 equal-width intervals and list the
# buckets from least to most populated.
df_dogs['age'].value_counts(bins=10, sort=True, ascending=True)

(19.743, 21.92]       9
(17.566, 19.743]     50
(15.389, 17.566]     96
(13.212, 15.389]    201
(0.127, 2.327]      250
(2.327, 4.504]      270
(4.504, 6.681]      394
(11.035, 13.212]    417
(6.681, 8.858]      606
(8.858, 11.035]     644
Name: age, dtype: int64

In [114]:
# Row count per breed label, most frequent first.
df_dogs['breed'].value_counts(sort=True, ascending=False)

Unknown Mix                                             1524
German Shepherd Dog Mix                                  190
Dachshund Mix                                            147
Labrador Retriever Mix                                    83
Staffordshire Terrier Mix                                 62
                                                        ... 
Border Collie, Spaniel Mix                                 1
Bull Terrier, Fox Terrier, Staffordshire Terrier Mix       1
Greyhound, Transylvanian Hound Mix                         1
Komondor Mix                                               1
German Pointer, Greyhound, Pointer Mix                     1
Name: breed, Length: 277, dtype: int64

In [116]:
# Convert breed to category in place and confirm the dtype change.
df_dogs['breed'] = df_dogs['breed'].astype('category')
df_dogs.dtypes

ID                      int64
name                   object
age                   float64
sex                    object
breed                category
date_found             object
adoptable_from         object
posted                 object
color                  object
coat                 category
size                   object
neutered               object
housebroken            object
likes_people         category
likes_children         object
get_along_males        object
get_along_females      object
get_along_cats         object
keep_in                object
new_coat             category
dtype: object

In [117]:
# Rename a single category with an {old: new} mapping; categories not named in
# the mapping are left as they are.
df_dogs['new_breed'] = df_dogs['breed'].cat.rename_categories(
    {'Unknown Mix': 'Unknown'},
)
df_dogs['new_breed'].value_counts(sort=True, ascending=False)

Unknown                                     1524
German Shepherd Dog Mix                      190
Dachshund Mix                                147
Labrador Retriever Mix                        83
Staffordshire Terrier Mix                     62
                                            ... 
English Cocker Spaniel, Vizsla Mix             1
English Greyhound Mix                          1
English Greyhound, Spanish Greyhound Mix       1
Fox Terrier, German Shepherd Dog Mix           1
Yorkshire Terrier                              1
Name: new_breed, Length: 277, dtype: int64

In [118]:
# Rename every category by passing a callable that is applied to each label;
# here all labels are lower-cased.
df_dogs['new_breed'] = df_dogs['new_breed'].cat.rename_categories(str.lower)
df_dogs['new_breed'].value_counts(sort=True, ascending=False)

unknown                                     1524
german shepherd dog mix                      190
dachshund mix                                147
labrador retriever mix                        83
staffordshire terrier mix                     62
                                            ... 
english cocker spaniel, vizsla mix             1
english greyhound mix                          1
english greyhound, spanish greyhound mix       1
fox terrier, german shepherd dog mix           1
yorkshire terrier                              1
Name: new_breed, Length: 277, dtype: int64