# Categorical Data

Categorical data takes its values from a finite set of discrete values, called categories.

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Build a small Series of fabric names and ask pandas to store it as categorical.
fabric_names = ['Cotton', 'Polyester', 'Wool']
categorical_series = pd.Series(fabric_names, dtype="category")

In [5]:
# Show the series: three rows, three categories, stored with dtype category.
categorical_series

0       Cotton
1    Polyester
2         Wool
dtype: category
Categories (3, object): [Cotton, Polyester, Wool]

In [6]:
# Rebuild the categorical Series; this time 'Cotton' appears twice in the input.
fabric_names_with_repeat = ['Cotton', 'Polyester', 'Wool', 'Cotton']
categorical_series = pd.Series(fabric_names_with_repeat, dtype="category")

In [7]:
# 'Cotton' now fills two rows, yet the category list still has only three entries.
categorical_series

0       Cotton
1    Polyester
2         Wool
3       Cotton
dtype: category
Categories (3, object): [Cotton, Polyester, Wool]

In [15]:
# Assemble the garment table column by column; every column is a list of strings.
garment_columns = {
    "Material": ['Cotton', 'Polyester', 'Wool', 'Silk'],
    "Garment": ['T-shirt', 'Jacket', 'Socks', 'Scarf'],
    "Size": ['Medium', 'Large', 'Large', 'Small'],
}
df = pd.DataFrame(garment_columns)

In [16]:
# Display the DataFrame to check its three columns and four rows.
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large
3,Silk,Scarf,Small


In [17]:
# Inspect the column dtypes: all three string columns are reported as object here.
df.dtypes

Material    object
Garment     object
Size        object
dtype: object

In [19]:
 # Cast our Material_cat as a type category and assign it to the new colum
df['Material_cat'] = df["Material"].astype('category')

In [20]:
# Material_cat displays the same values as Material; what differs is its dtype (category).
df

Unnamed: 0,Material,Garment,Size,Material_cat
0,Cotton,T-shirt,Medium,Cotton
1,Polyester,Jacket,Large,Polyester
2,Wool,Socks,Large,Wool
3,Silk,Scarf,Small,Silk


In [23]:
# Rebuild the garment table, this time asking the constructor to make
# every column categorical via dtype="category".
garment_columns = {
    "Material": ['Cotton', 'Polyester', 'Wool', 'Silk'],
    "Garment": ['T-shirt', 'Jacket', 'Socks', 'Scarf'],
    "Size": ['Medium', 'Large', 'Large', 'Small'],
}
df = pd.DataFrame(garment_columns, dtype="category")

In [24]:
# Every column is now category, because dtype="category" was passed to the DataFrame constructor.
df.dtypes

Material    category
Garment     category
Size        category
dtype: object

In [26]:
# The categories were inferred from the data and carry no order; they are listed as Large, Medium, Small.
df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (3, object): [Large, Medium, Small]

In [27]:
# To give a categorical dtype an explicit list of categories and an order,
# build it with the CategoricalDtype class.
from pandas.api.types import CategoricalDtype

In [28]:
# Fabric names have no innate ranking, which is why this dtype is declared with ordered=False.
fabric_categories = ['Cotton', 'Polyester', 'Wool', 'Silk']
materials = CategoricalDtype(categories=fabric_categories, ordered=False)

In [30]:
# Show the repr of the dtype: four categories, ordered=False.
materials

CategoricalDtype(categories=['Cotton', 'Polyester', 'Wool', 'Silk'], ordered=False)

In [46]:
# Sizes are declared as an ordered dtype; the list order defines the ranking
# (Small < Medium < Large).
size_levels = ['Small', 'Medium', 'Large']
sizes = CategoricalDtype(categories=size_levels, ordered=True)

In [47]:
# Show the repr of the dtype: three categories, ordered=True.
sizes

CategoricalDtype(categories=['Small', 'Medium', 'Large'], ordered=True)

In [37]:
# Cast a plain Series of size labels to the `sizes` dtype.
# 'X-Large' is not one of the categories in `sizes`, so it shows up as NaN in the result.
raw_size_labels = ['Medium', 'Small', 'X-Large', 'Small']
list_series = pd.Series(raw_size_labels)
list_series.astype(sizes)

0    Medium
1     Small
2       NaN
3     Small
dtype: category
Categories (3, object): [Small < Medium < Large]

In [55]:
# Start again from a garment table of plain strings, with no categorical dtypes.
garment_columns = dict(
    Material=['Cotton', 'Polyester', 'Wool', 'Silk'],
    Garment=['T-shirt', 'Jacket', 'Socks', 'Scarf'],
    Size=['Medium', 'Large', 'Large', 'Small'],
)
df = pd.DataFrame(garment_columns)

In [56]:
# Overwrite the Material column with its values cast to the `materials` dtype.
df["Material"] = df["Material"].astype(dtype=materials)

In [57]:
# Overwrite the Size column with its values cast to the `sizes` dtype.
df["Size"] = df["Size"].astype(dtype=sizes)

In [58]:
# The frame displays the same values as before; the casts changed the dtypes of Material and Size.
df

Unnamed: 0,Material,Garment,Size
0,Cotton,T-shirt,Medium
1,Polyester,Jacket,Large
2,Wool,Socks,Large
3,Silk,Scarf,Small


In [60]:
# Size now carries the explicit order Small < Medium < Large from the `sizes` dtype.
df['Size']

0    Medium
1     Large
2     Large
3     Small
Name: Size, dtype: category
Categories (3, object): [Small < Medium < Large]

### Operations with categorical data