[Reference](https://medium.com/@akaivdo/pandas-optimization-c19c467268d1)

# Data preparation

In [1]:
import pandas as pd

# Five template records. Every field is stored as a string on purpose so the
# later cells can show how much memory proper dtypes save.
base_rows = [
    ["123", "abc", "123456.78", "2022-01-02 01:02:01"],
    ["234", "def", "223456.78", "2022-02-03 02:03:02"],
    ["567", "ghi", "323456.78", "2022-03-04 06:04:03"],
    ["890", "jkl", "423456.78", "2022-04-11 12:05:04"],
    ["123", "mno", "523456.78", "2022-05-25 19:06:05"],
]
column_names = ["col_1", "col_2", "col_3", "col_4"]
repeat_count = 20000

# Repeat the template to obtain 5 * 20000 = 100000 rows.
df = pd.DataFrame(base_rows * repeat_count, columns=column_names)
len(df)

100000

# Memory

In [2]:
# Baseline: per-column dtypes and the deep memory footprint while all columns are strings
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   col_1   100000 non-null  object
 1   col_2   100000 non-null  object
 2   col_3   100000 non-null  object
 3   col_4   100000 non-null  object
dtypes: object(4)
memory usage: 25.0 MB


# string -> datetime


In [3]:
# Parse the timestamp strings into a datetime column, then re-check memory usage
parsed_timestamps = pd.to_datetime(df["col_4"])
df["col_4"] = parsed_timestamps
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   col_1   100000 non-null  object        
 1   col_2   100000 non-null  object        
 2   col_3   100000 non-null  object        
 3   col_4   100000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 18.5 MB


# string -> number

In [4]:
import numpy as np

# Target dtypes for the two columns that hold numbers stored as strings
numeric_dtypes = {"col_1": np.int16, "col_3": np.float32}
df = df.astype(numeric_dtypes)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   col_1   100000 non-null  int16         
 1   col_2   100000 non-null  object        
 2   col_3   100000 non-null  float32       
 3   col_4   100000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float32(1), int16(1), object(1)
memory usage: 7.1 MB


# string -> category

In [5]:
# Convert the repeated string labels in col_2 to the category dtype, then re-check memory usage
col_2_as_category = df["col_2"].astype("category")
df["col_2"] = col_2_as_category
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   col_1   100000 non-null  int16         
 1   col_2   100000 non-null  category      
 2   col_3   100000 non-null  float32       
 3   col_4   100000 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](1), float32(1), int16(1)
memory usage: 1.4 MB


# General Row-by-Row Loop

In [6]:
# Copy the DataFrame to keep the original DataFrame unchanged
temp_df = df.copy()
# List to store the computing results.  
new_col = []
# Calculating execution time  
%timeit
for i in range(len(temp_df)):
    # Save the computing result of each row to a list
    new_col.append(temp_df.loc[i, "col_1"] + temp_df.loc[i, "col_3"])
# Assign the result to a new column named total  
temp_df["total"] = pd.Series(new_col)

# iterrows()


In [7]:
# Copy the DataFrame to keep the original DataFrame unchanged
temp_df = df.copy()
# List to store the computing results.  
new_col = []
%timeit
for i, row in temp_df.iterrows():
    new_col.append(row["col_1"] + row["col_3"]) 
temp_df["total"] = pd.Series(new_col)

# itertuples()


In [8]:
# Copy the DataFrame to keep the original DataFrame unchanged
temp_df = df.copy()
# List to store the computing results.  
new_col = []
%timeit
# By default, index=True -> for index, row in ...:
for row in temp_df.itertuples(index=False):
     new_col.append(row[0] + row[2])
temp_df["total"] = pd.Series(new_col)

# apply()


In [10]:
# The function definition to be applied
def print_col(row):
    return row["col_1"] + row["col_3"]
# Copy the DataFrame to keep the original DataFrame unchanged
temp_df = df.copy()
%timeit
# axis=1 means apply this operation on all rows.
temp_df["total"] = temp_df.apply(print_col, axis=1)

# vectorization


In [11]:
# Copy the DataFrame to keep the original DataFrame unchanged
temp_df = df.copy()
%timeit
# Use the built-in adding operation
temp_df["total"] = temp_df["col_1"] + temp_df["col_3"]

# I/O


In [12]:
# Rebind df to a new frame of 1,000,000 rows x 50 columns (col_1 .. col_50)
# in which every cell holds the string "abc123".
df = pd.DataFrame(
    [
        # One row of 50 identical strings; the outer list repeats this same row
        ["abc123"]*50,
    ] * 1000000,
    columns=[f"col_{i}" for i in range(1, 51)]
)