## This notebook showcases how methond chaining and especially the `pipe` operator increases the readability of your pandas code.
I used the [Titanic dataset](https://www.kaggle.com/competitions/titanic/overview) from Kaggle for this example.

In [32]:
import pandas as pd
from typing import Any, List

### Preparing data using method chaining

In [35]:
def load_data(path: str) -> pd.DataFrame:
    return pd.read_csv("train.csv")

def replace_na_with_value(df: pd.DataFrame, col: str, replacement: Any, **kwargs) -> pd.DataFrame:
    return df.assign(**{col: lambda df: df[col].fillna("Unknown", **kwargs)})

def impute_with_mean(df: pd.DataFrame, col: str) -> pd.DataFrame:
    return df.assign(**{col: lambda df: df[col].fillna(df[col].mean())})

def round_values(df: pd.DataFrame, col: str) -> pd.DataFrame:
    return df.assign(**{col: lambda df: df[col].round()})

def split_name_col(df: pd.DataFrame, sep: str) -> pd.DataFrame:
    return (df.assign(LastName = lambda df: df["Name"].str.split(sep).str[0])
              .assign(FirstName = lambda df: df["Name"].str.split(sep).str[1]))

def rename_cols(df: pd.DataFrame, cols: List[str], new_names: List[str]) -> pd.DataFrame:
    return df.rename({col: new for col, new in zip(cols, new_names)}, axis=1)

In [37]:
(load_data("train.csv")
     .query("Embarked == 'C'")
     .pipe(replace_na_with_value, "Cabin", "Unknown")
     .pipe(impute_with_mean, "Age")
     .pipe(round_values, "Age")
     .pipe(split_name_col, ",")
     .assign(Interaction = lambda df: df["Survived"] * df["Pclass"])
     .pipe(rename_cols, ["Pclass", "SibSp"], ["TicketClass", "NumberSiblings"])
     )

Unnamed: 0,PassengerId,Survived,TicketClass,Name,Sex,Age,NumberSiblings,Parch,Ticket,Fare,Cabin,Embarked,LastName,FirstName,Interaction
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer),1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,Unknown,C,Nasser,Mrs. Nicholas (Adele Achem),2
19,20,1,3,"Masselmani, Mrs. Fatima",female,31.0,0,0,2649,7.2250,Unknown,C,Masselmani,Mrs. Fatima,3
26,27,0,3,"Emir, Mr. Farred Chehab",male,31.0,0,0,2631,7.2250,Unknown,C,Emir,Mr. Farred Chehab,0
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,Unknown,C,Uruchurtu,Don. Manuel E,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,867,1,2,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,Unknown,C,Duran y More,Miss. Asuncion,2
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,Unknown,C,Abelson,Mrs. Samuel (Hannah Wizosky),2
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,Unknown,C,Najib,"Miss. Adele Kiamie ""Jane""",3
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,Potter,Mrs. Thomas Jr (Lily Alexenia Wilson),1


### The same functionality but *without* method chaining

In [44]:
df = pd.read_csv("train.csv")
df_filtered = df[df["Embarked"] == "C"]
df_filtered["Cabin"] = df_filtered["Cabin"].fillna("Unknown")
df_filtered["Age"] = df_filtered["Age"].fillna(df_filtered["Age"].mean())
df_filtered["Age"] = df_filtered["Age"].round()
df_filtered["FirstName"] = df_filtered["Name"].str.split(",").str[0]
df_filtered["LastName"] = df_filtered["Name"].str.split(",").str[1]
df_filtered["Interaction"] = df_filtered["Survived"] * df_filtered["Pclass"]
df_renamed = df_filtered.rename({"Pclass": "TicketClass", "SibSp": "NumberSiblings"}, axis=1)
df_renamed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["Cabin"] = df_filtered["Cabin"].fillna("Unknown")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["Age"] = df_filtered["Age"].fillna(df_filtered["Age"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["Age"] = df_filtered["Age"].round()
A value is trying to 

Unnamed: 0,PassengerId,Survived,TicketClass,Name,Sex,Age,NumberSiblings,Parch,Ticket,Fare,Cabin,Embarked,FirstName,LastName,Interaction
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer),1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,Unknown,C,Nasser,Mrs. Nicholas (Adele Achem),2
19,20,1,3,"Masselmani, Mrs. Fatima",female,31.0,0,0,2649,7.2250,Unknown,C,Masselmani,Mrs. Fatima,3
26,27,0,3,"Emir, Mr. Farred Chehab",male,31.0,0,0,2631,7.2250,Unknown,C,Emir,Mr. Farred Chehab,0
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,Unknown,C,Uruchurtu,Don. Manuel E,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,867,1,2,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,Unknown,C,Duran y More,Miss. Asuncion,2
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,Unknown,C,Abelson,Mrs. Samuel (Hannah Wizosky),2
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,Unknown,C,Najib,"Miss. Adele Kiamie ""Jane""",3
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,Potter,Mrs. Thomas Jr (Lily Alexenia Wilson),1
