## Topic 3: Data manipulation with `dfply`

In this notebook, we introduce `dfply`, a library that brings the intuitive and expressive syntax of `R`'s `dplyr` to the Python ecosystem.

`dfply` allows us to write data manipulation pipelines using the pipe operator (`>>`), making the sequence of operations easy to follow. Instead of chaining multiple pandas methods or nesting function calls, we can express transformations step by step in a clear and readable flow.

By combining the power of `pandas` with the declarative style of `dplyr`, `dfply` makes data manipulation in Python faster, more efficient, and easier to read, especially for those familiar with the `R` tidyverse.

In [1]:
import pandas as pd
import seaborn as sns
from dfply import *

In [2]:
# Load seaborn's Titanic example dataset and preview its first five rows.
titanic = sns.load_dataset('titanic')
(
    titanic
    >> head(5)
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Selecting columns

In [3]:
# Keep only the sex, age and survived columns, then preview three rows.
(
    titanic
    >> select(X.sex, X.age, X.survived)
    >> head(3)
)

Unnamed: 0,sex,age,survived
0,male,22.0,0
1,female,38.0,1
2,female,26.0,1


In [4]:
# Remove the sex, age and survived columns and keep everything else.
(
    titanic
    >> drop(X.sex, X.age, X.survived)
    >> head(3)
)

Unnamed: 0,pclass,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,3,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,3,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [5]:
# Drop `who` and every column positioned after it.
(
    titanic
    >> drop(columns_from('who'))
    >> head(3)
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class
0,0,3,male,22.0,1,0,7.25,S,Third
1,1,1,female,38.0,1,0,71.2833,C,First
2,1,3,female,26.0,0,0,7.925,S,Third


In [6]:
# Negated selectors (~) exclude columns: same result as the drop() call above.
(
    titanic
    >> select(~X.sex, ~X.age, ~X.survived)
    >> head(3)
)

Unnamed: 0,pclass,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,3,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,3,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [7]:
# Select the columns whose names begin with "e" (embarked, embark_town).
(
    titanic
    >> select(starts_with('e'))
    >> head(3)
)

Unnamed: 0,embarked,embark_town
0,S,Southampton
1,C,Cherbourg
2,S,Southampton


### Subsetting rows

In [8]:
# Keep only the passengers aged 20 or younger.
(
    titanic
    >> mask(X.age <= 20)
    >> head(3)
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False


In [9]:
# Several conditions are combined: rows must satisfy all of them
# (here: aged 15 or younger AND female).
(
    titanic
    >> filter_by(X.age <= 15, X.sex == 'female')
    >> head(3)
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True


### Creating new variables

In [10]:
# Build a new `info` column by concatenating the `sex` and `who` strings
# for passengers aged 15 or younger.
(
    titanic
    >> filter_by(X.age <= 15)
    >> select(X.age, X.sex, X.who)
    >> mutate(info=X.sex + X.who)
    >> head()
)

Unnamed: 0,age,sex,who,info
7,2.0,male,child,malechild
9,14.0,female,child,femalechild
10,4.0,female,child,femalechild
14,14.0,female,child,femalechild
16,2.0,male,child,malechild


### Renaming variables

In [11]:
# rename(new_name=old_column): the `sex` column becomes `gender`.
(
    titanic
    >> rename(gender=X.sex)
    >> head(3)
)

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


### Sorting

In [12]:
# Sort by age in descending order so the oldest passengers come first.
(
    titanic
    >> arrange(X.age, ascending=False)
    >> head()
)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


### Grouping and summarizing

In [13]:
# Mean of `survived` (0/1) per passenger class, i.e. the survival rate by class.
(
    titanic
    >> group_by(X.pclass)
    >> summarize(survived_prob=X.survived.mean())
)

Unnamed: 0,pclass,survived_prob
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [14]:
# Survival rate for every combination of passenger class and sex.
(
    titanic
    >> group_by(X.pclass, X.sex)
    >> summarize(survived_prob=X.survived.mean())
)

Unnamed: 0,sex,pclass,survived_prob
0,female,1,0.968085
1,male,1,0.368852
2,female,2,0.921053
3,male,2,0.157407
4,female,3,0.5
5,male,3,0.135447


In [15]:
# Store the class-by-sex survival rates so they can be reshaped below.
df_survived = (
    titanic
    >> group_by(X.pclass, X.sex)
    >> summarize(survived_prob=X.survived.mean())
)
df_survived

Unnamed: 0,sex,pclass,survived_prob
0,female,1,0.968085
1,male,1,0.368852
2,female,2,0.921053
3,male,2,0.157407
4,female,3,0.5
5,male,3,0.135447


### Reshaping

In [16]:
# Long -> wide: one column per value of `sex`, filled with `survived_prob`.
df_survived_wide = (
    df_survived
    >> spread(X.sex, X.survived_prob)
)
df_survived_wide

Unnamed: 0,pclass,female,male
0,1,0.968085,0.368852
1,2,0.921053,0.157407
2,3,0.5,0.135447


In [17]:
# Wide -> long: stack the `female` and `male` columns into a `gender` key
# column and a `prob` value column.
df_survived_long = (
    df_survived_wide
    >> gather('gender', 'prob', ['female', 'male'])
)
df_survived_long

Unnamed: 0,pclass,gender,prob
0,1,female,0.968085
1,2,female,0.921053
2,3,female,0.5
3,1,male,0.368852
4,2,male,0.157407
5,3,male,0.135447


### Merging tables

In [18]:
# Two small frames sharing the key column `x1`; they overlap on keys A and B
# only (C exists only in `a`, D only in `b`), which makes the joins below
# easy to compare.
a = pd.DataFrame({
    'x1': ['A', 'B', 'C'],
    'x2': [1, 2, 3],
})
b = pd.DataFrame({
    'x1': ['A', 'B', 'D'],
    'x3': [True, False, True],
})

In [19]:
# Inner join: keep only the x1 keys present in both frames (A and B).
a >> inner_join(b, by='x1')

Unnamed: 0,x1,x2,x3
0,A,1,True
1,B,2,False


In [20]:
# Left join: keep every row of `a`; x3 is missing where `b` has no match (C).
a >> left_join(b, by='x1')

Unnamed: 0,x1,x2,x3
0,A,1,True
1,B,2,False
2,C,3,


In [21]:
# Right join: keep every row of `b`; x2 is missing where `a` has no match (D).
a >> right_join(b, by='x1')

Unnamed: 0,x1,x2,x3
0,A,1.0,True
1,B,2.0,False
2,D,,True


In [22]:
# Anti join: keep the rows of `a` whose x1 key has no match in `b` (C).
a >> anti_join(b, by='x1')

Unnamed: 0,x1,x2
2,C,3
