In [47]:
import pandas as pd
import woodwork as ww
from woodwork.type_sys.utils import list_logical_types

# Load the online-retail log and append a synthetic row identifier
# (0 .. n-1) as the `order_product_id` column.
df = pd.read_csv(
    "https://oss.alteryx.com/datasets/online-retail-logs-2018-08-28.csv"
).assign(order_product_id=lambda frame: range(len(frame)))

In [48]:
# Display the loaded frame; the rendered table elides the middle rows.
df

Unnamed: 0,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled,order_product_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.2075,Andrea Brown,United Kingdom,25.2450,False,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.5610,False,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,4.5375,Andrea Brown,United Kingdom,36.3000,False,2
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.5610,False,3
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.5610,False,4
...,...,...,...,...,...,...,...,...,...,...,...
401599,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,1.4025,Regina Green,France,16.8300,False,401599
401600,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,3.4650,Regina Green,France,20.7900,False,401600
401601,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,6.8475,Regina Green,France,27.3900,False,401601
401602,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,6.8475,Regina Green,France,27.3900,False,401602


In [49]:
# Initialize Woodwork on the DataFrame. The optional `name` parameter can be
# specified to label the data.
df.ww.init(name="retail")
df.ww

# Woodwork keeps only a weak reference from the accessor back to the DataFrame.
# Instead of chaining `pd.read_*(...).ww.init()`, always assign the data to a
# variable first and then call `init()` on that variable.

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,['category']
quantity,int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,category,Categorical,['category']
country,category,Categorical,['category']
total,float64,Double,['numeric']
cancelled,bool,Boolean,[]


In [50]:
# Create a new DataFrame containing the first 5 rows.
head_df = df.ww.head(5)
head_df.ww
# Once Woodwork is initialized on a DataFrame, it is recommended to go through
# the `ww` namespace when performing DataFrame operations, to avoid
# invalidating Woodwork's typing information.

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,['category']
quantity,int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,category,Categorical,['category']
country,category,Categorical,['category']
total,float64,Double,['numeric']
cancelled,bool,Boolean,[]


In [51]:
# Show the underlying data of the 5-row frame.
head_df 

Unnamed: 0,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled,order_product_id
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.2075,Andrea Brown,United Kingdom,25.245,False,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,4.5375,Andrea Brown,United Kingdom,36.3,False,2
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False,3
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False,4


In [52]:
# Override inferred logical types with more appropriate ones.
# Inspect `df.ww.types` to see the types currently assigned to each column.
df.ww.set_types(
    logical_types=dict(
        customer_name="PersonFullName",
        country="Categorical",
        order_product_id="Categorical",
    )
)
df.ww.types.head()

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,['category']
quantity,int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]


In [53]:
# Select columns by logical type; a list returns every column matching any entry.
numeric_df = df.ww.select(["Integer","Double"])
numeric_df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
quantity,int64,Integer,['numeric']
unit_price,float64,Double,['numeric']
total,float64,Double,['numeric']


In [54]:
# Show the data of the selected Integer/Double columns.
numeric_df 

Unnamed: 0,quantity,unit_price,total
0,6,4.2075,25.2450
1,6,5.5935,33.5610
2,8,4.5375,36.3000
3,6,5.5935,33.5610
4,6,5.5935,33.5610
...,...,...,...
401599,12,1.4025,16.8300
401600,6,3.4650,20.7900
401601,4,6.8475,27.3900
401602,4,6.8475,27.3900


In [55]:
# Attach custom semantic tags to individual columns.
df.ww.set_types(
    semantic_tags={
        "description": "product_details",
        "total": "currency",
    }
)
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,"['product_details', 'category']"
quantity,int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,string,PersonFullName,[]
country,category,Categorical,['category']
total,float64,Double,"['currency', 'numeric']"
cancelled,bool,Boolean,[]


In [56]:
# Select columns by semantic tag.
category_df = df.ww.select("category")
category_df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,"['product_details', 'category']"
country,category,Categorical,['category']
order_product_id,category,Categorical,['category']


In [57]:
# Select columns carrying any of several semantic tags.
category_numeric_df = df.ww.select(["numeric","category"])
category_numeric_df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,"['product_details', 'category']"
quantity,int64,Integer,['numeric']
unit_price,float64,Double,['numeric']
country,category,Categorical,['category']
total,float64,Double,"['currency', 'numeric']"
order_product_id,category,Categorical,['category']


In [58]:
# Select by mixing a logical type ("Boolean") with a semantic tag ("product_details").
mixed_df = df.ww.select(["Boolean","product_details"])
mixed_df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
description,category,Categorical,"['product_details', 'category']"
cancelled,bool,Boolean,[]


In [59]:
# Pull specific columns by name through the `ww` namespace; the resulting
# frame keeps the typing information of the chosen columns.
multi_col = df.ww[["product_id","total","unit_price"]]
multi_col.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
product_id,category,Categorical,['category']
total,float64,Double,"['currency', 'numeric']"
unit_price,float64,Double,['numeric']


In [60]:
# Remove a semantic tag from a column (mapping of column name -> tag to drop).
df.ww.remove_semantic_tags({"description":"product_details"})
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,['category']
quantity,int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,string,PersonFullName,[]
country,category,Categorical,['category']
total,float64,Double,"['currency', 'numeric']"
cancelled,bool,Boolean,[]


In [61]:
# Designate columns as the Woodwork index or time index with
# `set_index` / `set_time_index`. Here: the index.
df.ww.set_index("order_product_id")
df.ww.index

'order_product_id'

In [62]:
# Designate `order_date` as the Woodwork time index.
df.ww.set_time_index("order_date")
df.ww.time_index

'order_date'

In [63]:
# Re-display the schema; `order_date` now carries the 'time_index' tag.
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,['category']
product_id,category,Categorical,['category']
description,category,Categorical,['category']
quantity,int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,PersonFullName,[]
country,category,Categorical,['category']
total,float64,Double,"['currency', 'numeric']"
cancelled,bool,Boolean,[]


In [64]:
# Woodwork can also be initialized directly on a Series.
series = pd.Series([1,2,3], dtype="int64")
series.ww.init(logical_type="Integer")
series.ww

<Series: None (Physical Type = int64) (Logical Type = Integer) (Semantic Tags = {'numeric'})>

In [66]:
# `ww.init_series` returns an initialized series. The string-dtype input
# comes back with the `category` physical type used by Categorical.
string_series = pd.Series(["a","b","c"], dtype="string")
ww_series = ww.init_series(string_series, logical_type="Categorical")
ww_series.ww

<Series: None (Physical Type = category) (Logical Type = Categorical) (Semantic Tags = {'category'})>

In [67]:
# Add a new semantic tag to the series.
series.ww.add_semantic_tags("new_tag")
series.ww

<Series: None (Physical Type = int64) (Logical Type = Integer) (Semantic Tags = {'new_tag', 'numeric'})>

In [69]:
# Series properties are accessible through the `ww` namespace.
series.ww.shape

(3,)

In [71]:
# Call a Series method (`sample`) through the `ww` namespace; the returned
# series keeps its Woodwork typing information.
# A fixed `random_state` makes the sampled rows reproducible across re-runs.
sample_series = series.ww.sample(2, random_state=0)
sample_series.ww

<Series: None (Physical Type = int64) (Logical Type = Integer) (Semantic Tags = {'new_tag', 'numeric'})>

In [72]:
# List the logical types available in Woodwork.
# `list_logical_types` is imported with the other dependencies in the
# notebook's first cell, so all imports live in one place.
list_logical_types()

Unnamed: 0,name,type_string,description,physical_type,standard_tags,is_default_type,is_registered,parent_type
0,Address,address,Represents Logical Types that contain address ...,string,{},True,True,
1,Age,age,Represents Logical Types that contain whole nu...,int64,{numeric},True,True,Integer
2,AgeFractional,age_fractional,Represents Logical Types that contain non-nega...,float64,{numeric},True,True,Double
3,AgeNullable,age_nullable,Represents Logical Types that contain whole nu...,Int64,{numeric},True,True,IntegerNullable
4,Boolean,boolean,Represents Logical Types that contain binary v...,bool,{},True,True,BooleanNullable
5,BooleanNullable,boolean_nullable,Represents Logical Types that contain binary v...,boolean,{},True,True,
6,Categorical,categorical,Represents Logical Types that contain unordere...,category,{category},True,True,
7,CountryCode,country_code,Represents Logical Types that use the ISO-3166...,category,{category},True,True,Categorical
8,CurrencyCode,currency_code,Represents Logical Types that use the ISO-4217...,category,{category},True,True,Categorical
9,Datetime,datetime,Represents Logical Types that contain date and...,datetime64[ns],{},True,True,
