[Reference](https://medium.com/swlh/6-ways-to-significantly-speed-up-pandas-with-a-couple-lines-of-code-part-1-2c2dfb0de2300)

In [1]:
import pandas as pd
import numpy as np
import numba

# create a table of 100,000 rows and 4 columns filled with random integers
# from 0 to 99 (randint's upper bound, 100, is exclusive); the generator is
# seeded so every run of the notebook benchmarks exactly the same data
df = pd.DataFrame(
    np.random.RandomState(0).randint(0, 100, size=(100000, 4)),
    columns=['a', 'b', 'c', 'd'],
)

# plain-Python helper used to build the new column
def multiply(x):
    """Return ``x`` multiplied by 5."""
    factor = 5
    return x * factor
    
# numba-compiled counterpart of multiply
@numba.vectorize
def multiply_numba(x):
    """Return ``x`` multiplied by 5 (scalar kernel wrapped by numba.vectorize)."""
    scaled = x * 5
    return scaled

In [2]:
# baseline: build the column by passing the plain-Python multiply to Series.apply
%timeit df['new_col'] = df['a'].apply(multiply)

10 loops, best of 3: 33.8 ms per loop


In [3]:
# same column computed with the Series arithmetic operator instead of apply
%timeit df['new_col'] = df['a'] * 5

The slowest run took 26.88 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 992 µs per loop


In [4]:
# same column computed by the numba-vectorized function on the column's numpy array
%timeit df['new_col'] = multiply_numba(df['a'].to_numpy())

The slowest run took 311.47 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 721 µs per loop


In [5]:
# mean of the squared values
def square_mean(row):
    """Square every element of ``row`` and return the mean of the squares."""
    return np.mean(np.power(row, 2))
# usage:
# df['new_col'] = df.apply(square_mean, axis=1)

# numba cannot work with pandas objects (DataFrame, Series, etc.),
# so we pass it a two-dimensional numpy array instead
@numba.njit
def square_mean_numba(arr):
    """Return, for each row of the 2-D array ``arr``, the mean of its squared values."""
    n_rows = arr.shape[0]
    squared = np.power(arr, 2)
    # one output slot per row, filled in the loop below
    out = np.empty(n_rows)
    for row_idx in range(n_rows):
        out[row_idx] = np.mean(squared[row_idx])
    return out
# usage:
# df['new_col'] = square_mean_numba(df.to_numpy())

In [6]:
import os
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
!pwd
os.chdir('gdrive/My Drive/Colab Notebooks/')
!pwd

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
/content
/content/gdrive/My Drive/Colab Notebooks


In [7]:
# read the headlines once, then stack ten copies end to end to enlarge the dataset
# (the copies keep their original index labels)
df = pd.concat([pd.read_csv('abcnews-date-text.csv', header=0)] * 10)
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [8]:
# calculate the average word length in the title
def mean_word_len(line):
    """Return the mean length of the whitespace-separated words in ``line``.

    Parameters
    ----------
    line : str
        Text to analyse; it is split with ``str.split()``.

    Returns
    -------
    float
        Average number of characters per word, or NaN when ``line`` contains
        no words (empty or whitespace-only), instead of raising
        ZeroDivisionError.
    """
    res = float('nan')
    # this cycle just complicates the task: the same value is recomputed six
    # times on purpose to make the function more expensive
    for _ in range(6):
        lengths = [len(word) for word in line.split()]
        if not lengths:
            # no words to average over; NaN marks the value as missing
            return float('nan')
        res = sum(lengths) / len(lengths)
    return res
def compute_avg_word(df):
    """Return the result of applying mean_word_len to each entry of df['headline_text']."""
    headlines = df['headline_text']
    return headlines.apply(mean_word_len)

In [9]:
from multiprocessing import Pool

# I have 4 cores
# n_cores is used both as the pool size and as the number of chunks
# apply_parallel splits the frame into
n_cores = 4
# one shared pool of worker processes, created when this cell runs
# NOTE(review): nothing visible in this notebook closes or joins the pool - confirm that is intended
pool = Pool(n_cores)
def apply_parallel(df, func):
    """Apply ``func`` to consecutive row chunks of ``df`` on the shared pool.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to process; it is cut by row position into ``n_cores`` nearly
        equal consecutive chunks.
    func : callable
        Called once per chunk through ``pool.map``; it must return a pandas
        object that ``pd.concat`` accepts.

    Returns
    -------
    The per-chunk results concatenated in chunk order.
    """
    # split the row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates
    row_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]
    # pool.map returns results in input order, so the concatenation follows df's row order
    return pd.concat(pool.map(func, df_split))

In [10]:
# compute the average word length of every headline via the worker pool and store it as a new column
df['new_col'] = apply_parallel(df, compute_avg_word)

In [12]:
# NOTE(review): the install is not version-pinned; the recorded output shows 1.4.8 was installed
!pip install pandarallel
from pandarallel import pandarallel
# pandarallel will determine how many cores you have, but you can specify it yourself
pandarallel.initialize()

Collecting pandarallel
  Downloading https://files.pythonhosted.org/packages/99/06/bd582106766c483d6da51c05b0cdd7cb61894bb843c7ecc4789032232327/pandarallel-1.4.8.tar.gz
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... done
  Created wheel for pandarallel: filename=pandarallel-1.4.8-cp36-none-any.whl size=16112 sha256=c139d59780915beb394aec49c98a68adae9015976fa541e782e22f0d6f97d30c
  Stored in directory: /root/.cache/pip/wheels/75/a2/85/b45be2e86d86e9ec5da6d05c4b994d18c81abe76e3f39415aa
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.4.8
INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [13]:
# same per-headline computation through parallel_apply; the result is displayed, not stored
df['headline_text'].parallel_apply(mean_word_len)

0          7.500000
1          4.875000
2          5.714286
3          3.555556
4          5.571429
             ...   
1186013    6.142857
1186014    5.111111
1186015    5.250000
1186016    3.400000
1186017    6.714286
Name: headline_text, Length: 11860180, dtype: float64