In [1]:
# Confirm which pyarrow build is installed; later cells use it as the CSV engine and dtype backend.
import pyarrow

pyarrow.__version__

'11.0.0'

In [2]:
# Confirm which pandas release is installed; this notebook walks through pandas 2.0 features.
import pandas as pd

pd.__version__

'2.0.0'

# 2 速通pandas 2.0新版本干货内容

## 2.1 数据读取及运算性能提升

- 默认读取方式

In [3]:
%%timeit

# Baseline benchmark: no `engine` or `dtype_backend` is passed, so pandas uses its defaults.
# NOTE: names assigned inside a %%timeit body are not kept in the notebook namespace,
# so `df` here exists only for the duration of each timing run.
df = pd.read_csv('./fraudTrain.csv')

4.09 s ± 387 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- 使用`pyarrow`引擎

In [4]:
%%timeit

# Same read, but parsed with the pyarrow engine (result dtypes stay NumPy-based).
df = pd.read_csv('./fraudTrain.csv', engine='pyarrow')

1.16 s ± 38.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- 使用`pyarrow`引擎+`dtype_backend`

In [5]:
%%timeit

# pyarrow engine plus pyarrow dtype backend: columns are kept as Arrow-backed dtypes.
df = pd.read_csv('./fraudTrain.csv', engine='pyarrow', dtype_backend='pyarrow')

224 ms ± 8.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- 默认读取方式（重复测试）

In [6]:
%%timeit

# Repeat run of the default read benchmark (this cell reads the CSV; it does not write anything).
df = pd.read_csv('./fraudTrain.csv')

4.85 s ± 406 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- 使用`pyarrow`引擎

In [7]:
%%timeit

# Repeat run of the pyarrow-engine read benchmark.
df = pd.read_csv('./fraudTrain.csv', engine='pyarrow')

1.24 s ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- 使用`pyarrow`引擎+`dtype_backend`

In [8]:
%%timeit

# Repeat run of the pyarrow-engine + pyarrow-dtype-backend read benchmark.
df = pd.read_csv('./fraudTrain.csv', engine='pyarrow', dtype_backend='pyarrow')

228 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Load the CSV with the pyarrow parser and keep every column as an Arrow-backed dtype,
# then list the resulting column dtypes.
df_pyarrow = pd.read_csv(
    './fraudTrain.csv',
    engine='pyarrow',
    dtype_backend='pyarrow',
)
df_pyarrow.dtypes

                                int64[pyarrow]
trans_date_trans_time    timestamp[s][pyarrow]
cc_num                          int64[pyarrow]
merchant                       string[pyarrow]
category                       string[pyarrow]
amt                            double[pyarrow]
first                          string[pyarrow]
last                           string[pyarrow]
gender                         string[pyarrow]
street                         string[pyarrow]
city                           string[pyarrow]
state                          string[pyarrow]
zip                             int64[pyarrow]
lat                            double[pyarrow]
long                           double[pyarrow]
city_pop                        int64[pyarrow]
job                            string[pyarrow]
dob                       date32[day][pyarrow]
trans_num                      string[pyarrow]
unix_time                       int64[pyarrow]
merch_lat                      double[pyarrow]
merch_long                     double[pyarrow]
is_fraud                        int64[pyarrow]
dtype: object

In [10]:
# Passing only engine='pyarrow' does not change the default column dtypes:
# the parser is pyarrow, but the resulting frame is still NumPy-backed.
df_numpy = pd.read_csv(
    './fraudTrain.csv',
    engine='pyarrow',
)
df_numpy.dtypes

                                  int64
trans_date_trans_time    datetime64[ns]
cc_num                            int64
merchant                         object
category                         object
amt                             float64
first                            object
last                             object
gender                           object
street                           object
city                             object
state                            object
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                              object
dob                              object
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
dtype: object

In [11]:
# Peek at the first row of the Arrow-backed frame to see what the data looks like.
df_pyarrow.head(1)

Unnamed: 0,Unnamed: 1,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0


In [12]:
# Numeric aggregation on the NumPy-backed frame (cc_num is int64 there).
%timeit df_numpy['cc_num'].mean()

747 µs ± 41.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [13]:
# Same aggregation on the Arrow-backed frame (cc_num is int64[pyarrow] there).
%timeit df_pyarrow['cc_num'].mean()

635 µs ± 33.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
# String slicing (last five characters of each value) on an object-dtype column.
%timeit df_numpy['category'].str[-5:]

196 ms ± 5.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Same string slicing on a string[pyarrow] column.
%timeit df_pyarrow['category'].str[-5:]

28.8 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Prefix test on an object-dtype string column.
%timeit df_numpy['merchant'].str.startswith('fraud')

266 ms ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Same prefix test on a string[pyarrow] column.
%timeit df_pyarrow['merchant'].str.startswith('fraud')

5.5 ms ± 56.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 2.2 “修改时复制”机制

In [18]:
# Copy-on-Write is not enabled by default; inspect the current option value.
pd.options.mode.copy_on_write

False

In [19]:
# Build a tiny single-column frame to demonstrate behaviour with Copy-on-Write disabled.
df = pd.DataFrame(data={'field1': range(3)})
df

Unnamed: 0,field1
0,0
1,1
2,2


In [20]:
# Take the column out as a Series and write to it while Copy-on-Write is off.
field1 = df['field1']
# With CoW disabled this write is also visible in `df` (the next cell shows 999 in row 0).
field1.iloc[0] = 999
field1

0    999
1      1
2      2
Name: field1, dtype: int64

In [21]:
# Re-display the parent frame: the write made through `field1` has changed it.
df

Unnamed: 0,field1
0,999
1,1
2,2


In [22]:
# Turn Copy-on-Write on for the rest of the session.
pd.options.mode.copy_on_write = True

In [23]:
# Rebuild the same tiny frame, this time with Copy-on-Write enabled.
df = pd.DataFrame(data={'field1': range(3)})
df

Unnamed: 0,field1
0,0
1,1
2,2


In [24]:
# Take the column out as a Series and write to it while Copy-on-Write is on.
field1 = df['field1']
# With CoW enabled only `field1` changes; `df` keeps its original values (see the next cell).
field1.iloc[0] = 999
field1

0    999
1      1
2      2
Name: field1, dtype: int64

In [25]:
# Re-display the parent frame: it is unchanged by the write made through `field1`.
df

Unnamed: 0,field1
0,0
1,1
2,2
