In [6]:
import numpy as np
import pandas as pd

Methods

1. [x] for loop
2. [x] df.iterrows()
3. [^] pd.apply()
4. [^] pd.map()
5. [O] np.select()
6. [O] np.where()
7. [O] np.vectorize()
8. [O] pd.shift()
9. [O] pd.str.contains() # re
10. [O] dask
11. [O] from multiprocessing import Pool -> pd.concat

In [59]:
def get_dataset(size:int) -> pd.DataFrame:
    """Generate a random demo frame with ``size`` rows.

    Columns, in order:
      * ``A``    - one of 'left' / 'middle' / 'right'
      * ``B``    - integer drawn from ``np.random.randint(1, 50)``
      * ``C``    - one of 'red' / 'yellow' / 'blue' / 'green'
      * ``D``    - float drawn from ``np.random.uniform(0, 1)``
      * ``date`` - a day picked from 2020-01-01 .. 2022-12-31

    All draws use NumPy's global random state, so seed it beforehand
    if a reproducible frame is needed.
    """
    positions = ['left', 'middle', 'right']
    colours = ['red', 'yellow', 'blue', 'green']
    calendar = pd.date_range('2020-01-01', '2022-12-31')

    frame = pd.DataFrame()
    # Columns are filled in a fixed order so that a seeded run always
    # consumes the random stream the same way.
    frame['A'] = np.random.choice(positions, size)
    frame['B'] = np.random.randint(1, 50, size)
    frame['C'] = np.random.choice(colours, size)
    frame['D'] = np.random.uniform(0, 1, size)
    frame['date'] = np.random.choice(calendar, size)
    return frame

In [8]:
# Small 100-row frame used by the micro-benchmarks in the cells below.
df = get_dataset(100)

In [18]:
# Vectorised if/else with np.where: 'yes' where column 'B' is above 30, else 'no'.
# `.values` hands np.where the underlying NumPy array instead of the Series.
%timeit df['age_gt_30'] = np.where(df['B'].values > 30, 'yes', 'no')

133 µs ± 2.56 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [19]:
# Inspect the first rows after adding 'age_gt_30'.
# NOTE(review): the saved output already contains 'symbolize', which is only
# created by a later cell - the cells were executed out of order.
df.head()

Unnamed: 0,A,B,C,D,age_gt_30,symbolize
0,middle,37,yellow,0.367555,yes,-
1,left,47,green,0.899406,yes,&
2,middle,32,red,0.161801,yes,-
3,right,3,yellow,0.783261,no,^
4,middle,10,yellow,0.999057,no,-


In [20]:
conditions = [
    df['A'] == 'middle',
    df['A'] == 'right',
    df['A'] == 'left'
]

choices = [
    ' - ',
    ' ^ ',
    ' & ',
]

%timeit df['symbolize'] = np.select(conditions, choices, 'na')

245 µs ± 39 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
# Inspect the first rows after adding the 'symbolize' column.
df.head()

Unnamed: 0,A,B,C,D,age_gt_30,symbolize
0,middle,37,yellow,0.367555,yes,-
1,left,47,green,0.899406,yes,&
2,middle,32,red,0.161801,yes,-
3,right,3,yellow,0.783261,no,^
4,middle,10,yellow,0.999057,no,-


In [23]:
# Time a vectorised membership test on column 'C'; the boolean result is discarded.
%timeit df['C'].isin(['red', 'blue'])

129 µs ± 1.32 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [29]:
# Profile the same membership test with %prun.
# The extension load below is left disabled; %prun runs without it here.
#%load_ext line_profiler
%prun df['C'].isin(['red', 'blue'])

 

In [33]:
pip install memory_profiler

Looking in indexes: http://mirrors.aliyun.com/pypi/simple/
Collecting memory_profiler
  Downloading http://mirrors.aliyun.com/pypi/packages/06/dd/7308a8ef1902db9d81c5bc226befe346a87ed8787caff00b8d91ed9f3b86/memory_profiler-0.60.0.tar.gz (38 kB)
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25ldone
[?25h  Created wheel for memory-profiler: filename=memory_profiler-0.60.0-py3-none-any.whl size=31276 sha256=29f97d625fec74a9bffa5a914b5418ebaaf041e8d39a2ae2ba9aaffb528985c2
  Stored in directory: /Users/zhangliang/Library/Caches/pip/wheels/01/1e/e3/5ac53cc49cb6183982d6a8665facdabe557cbf32f078b8150c
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.60.0
Note: you may need to restart the kernel to use updated packages.


In [36]:
%time
conditions = [
    df['C'].str.contains(r'.*?el', na=False),
    df['C'].str.contains(r'^blue|^re\w+', na=False)
]

choices = ['yell', 'magic']

df['ym'] = np.select(conditions, choices, default='na')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [38]:
# First rows, to check the new 'ym' column.
df.head()

Unnamed: 0,A,B,C,D,age_gt_30,symbolize,ym
0,middle,37,yellow,0.367555,yes,-,yell
1,left,47,green,0.899406,yes,&,na
2,middle,32,red,0.161801,yes,-,magic
3,right,3,yellow,0.783261,no,^,yell
4,middle,10,yellow,0.999057,no,-,yell


In [39]:
# Last rows, to check the new 'ym' column.
df.tail()

Unnamed: 0,A,B,C,D,age_gt_30,symbolize,ym
95,middle,8,red,0.458504,no,-,magic
96,middle,42,blue,0.070256,yes,-,magic
97,middle,4,red,0.735229,no,-,magic
98,left,20,green,0.261635,no,&,na
99,middle,38,red,0.534722,yes,-,magic


In [40]:
%time
mp = {
    'middle' : 'S',
    'left'   : 'M',
    'right'  : 'N'
}

df['dict_lUT'] = np.where(
    df['D'].values > 0.2,
    'hello',
    df['A'].map(mp)
)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


In [42]:
# A cell renders only its last expression, so a bare `df.head()` followed by
# `df.tail()` silently drops the head. display() shows both.
display(df.head(), df.tail())

Unnamed: 0,A,B,C,D,age_gt_30,symbolize,ym,dict_lUT
95,middle,8,red,0.458504,no,-,magic,hello
96,middle,42,blue,0.070256,yes,-,magic,S
97,middle,4,red,0.735229,no,-,magic,hello
98,left,20,green,0.261635,no,&,na,hello
99,middle,38,red,0.534722,yes,-,magic,hello


In [43]:
# pd[].values => np.dt.days

In [45]:
# ndarray type casting
# np.timedelta64(1, 'D')
# np.astype('timedelta64[D]')

In [46]:
# Cast the raw NumPy array of column 'D' to whole-day timedeltas.
# 'D' holds floats drawn from uniform(0, 1), so every value truncates to 0 days.
df['D'].values.astype('timedelta64[D]')

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='timedelta64[D]')

In [49]:
# Same cast through the pandas Series API, then read the day component via .dt.days.
# All zeros again, because 'D' only holds values between 0 and 1.
# NOTE(review): newer pandas releases may reject the 'D' resolution in astype -
# confirm this cast still runs on the target pandas version.
df['D'].astype('timedelta64[D]').dt.days

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
     ..
70    0
71    0
72    0
73    0
74    0
75    0
76    0
77    0
78    0
79    0
80    0
81    0
82    0
83    0
84    0
85    0
86    0
87    0
88    0
89    0
90    0
91    0
92    0
93    0
94    0
95    0
96    0
97    0
98    0
99    0
Name: D, Length: 100, dtype: int64

In [60]:
# One-million-row frame; uses whichever get_dataset definition is current in the kernel.
df2 = get_dataset(1_000_000)

In [61]:
# First rows of the large frame.
df2.head()

Unnamed: 0,A,B,C,D,date
0,middle,6,red,0.336495,2022-05-08
1,middle,28,blue,0.338123,2021-08-29
2,left,17,red,0.769266,2021-12-20
3,left,6,blue,0.5704,2021-08-26
4,middle,1,red,0.416746,2021-05-03


In [62]:
# Last rows of the large frame.
df2.tail()

Unnamed: 0,A,B,C,D,date
999995,middle,24,green,0.076154,2020-10-14
999996,middle,20,green,0.601369,2020-03-04
999997,left,1,blue,0.679519,2020-02-08
999998,left,27,red,0.145602,2020-04-22
999999,right,19,blue,0.760244,2020-11-17


In [63]:
def get_dataset(size:int) -> pd.DataFrame:
    """Generate a random demo frame with ``size`` rows for the dtype/storage section.

    NOTE: this redefines the ``get_dataset`` from an earlier cell with a
    different schema; once this cell runs, later calls produce these columns.

    Columns, in order:
      * ``size`` - one of 'big' / 'small' / 'medium'
      * ``age``  - integer drawn from ``np.random.randint(1, 50)``
      * ``team`` - one of 'red' / 'blue' / 'green' / 'yellow'
      * ``win``  - 'yes' or 'no'
      * ``date`` - a day picked from 2020-01-01 .. 2022-12-01
      * ``prob`` - float drawn from ``np.random.uniform(0, 1)``

    All draws use NumPy's global random state.
    """
    df = pd.DataFrame()
    df['size'] = np.random.choice(['big','small','medium'],size)
    df['age']  = np.random.randint(1,50,size)
    df['team'] = np.random.choice(['red','blue','green','yellow'],size)
    df['win']  = np.random.choice(['yes','no'],size)
    # Fixed: the pandas function is `date_range`; `pd.data_range` does not
    # exist and raised AttributeError on every call.
    dates = pd.date_range('2020-01-01','2022-12-01')
    df['date'] = np.random.choice(dates,size)
    df['prob'] = np.random.uniform(0,1,size)
    return df

In [64]:
def set_dtypes(df:pd.DataFrame) -> pd.DataFrame:
    """Convert the demo frame's columns to compact dtypes, in place.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``win`` is mapped from 'yes'/'no' to booleans, and ``prob`` becomes
    ``float16``.

    Args:
        df: frame with the columns ``size``, ``team``, ``age``, ``win``
            and ``prob``. It is modified in place.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    df['size'] = df['size'].astype('category')
    df['team'] = df['team'].astype('category')
    df['age']  = df['age'].astype('int16')
    # Only map 'yes'/'no' when the column is not boolean yet: booleans are not
    # keys of the mapping, so re-running the cell would turn 'win' into NaN.
    if not pd.api.types.is_bool_dtype(df['win']):
        df['win'] = df['win'].map({'yes':True,'no':False})
    df['prob'] = df['prob'].astype('float16')
    return df

In [65]:
# !ls -GFlash ...filename

# Parquet (long-term-storage)
```
!pip install pyarrow
!pip install fastparquet
```

# Feather (short-term-storage)
```
!pip install pyarrow  # pandas' to_feather / read_feather use pyarrow
```

In [66]:
# Shell escape: list the files in the notebook's working directory.
!ls

20221003_pd_speed_up_and_storage_effciency.ipynb
