# DataFrame 基本操作

In [43]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 40

## 选取多个 dataframe 列

In [44]:
df_report = pd.read_csv('data.csv', encoding='gb2312')
df_report.head()

Unnamed: 0,里程,测点位置,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,director_name
0,63316.4,A1,-3082,-3101.0,-18.7,3082,3116,34.19,文件名称1
1,63316.4,A2,-4000,-4003.0,-2.6,4000,4035,35.03,文件名称2
2,63316.4,A3,-3082,-3043.0,38.9,3082,3092,10.5,文件名称3
3,63318.2,A1,-3082,-3105.0,-23.2,3082,3120,38.11,文件名称4
4,63318.2,A2,-4000,-3998.0,2.3,4000,4039,38.46,文件名称5


In [45]:
df_report.columns

Index(['里程', '测点位置', '设计宽度（mm）左', '实测宽度（mm）左', '差值（mm）左', '设计宽度（mm）右',
       '实测宽度（mm）右', '差值（mm）右', 'director_name'],
      dtype='object')

In [46]:
# 用列表选取多个列
df_report_left = df_report[['里程', '测点位置', '设计宽度（mm）左', '实测宽度（mm）左', '差值（mm）左']]
df_report_left.head()

Unnamed: 0,里程,测点位置,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左
0,63316.4,A1,-3082,-3101.0,-18.7
1,63316.4,A2,-4000,-4003.0,-2.6
2,63316.4,A3,-3082,-3043.0,38.9
3,63318.2,A1,-3082,-3105.0,-23.2
4,63318.2,A2,-4000,-3998.0,2.3


In [47]:
# 选取单列
df_report[['设计宽度（mm）左']].head()

Unnamed: 0,设计宽度（mm）左
0,-3082
1,-4000
2,-3082
3,-3082
4,-4000


In [48]:
# 将列表赋值给一个变量，便于多选
col = ['里程', '测点位置', '设计宽度（mm）左', '实测宽度（mm）左', '差值（mm）左']
df_report_left = df_report[col]
df_report_left.head()

Unnamed: 0,里程,测点位置,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左
0,63316.4,A1,-3082,-3101.0,-18.7
1,63316.4,A2,-4000,-4003.0,-2.6
2,63316.4,A3,-3082,-3043.0,38.9
3,63318.2,A1,-3082,-3105.0,-23.2
4,63318.2,A2,-4000,-3998.0,2.3


In [49]:
df_report_left.dtypes.value_counts()

float64    3
object     1
int64      1
Name: count, dtype: int64

In [50]:
# 使用 select_dtypes() 选取整数列
df_report_left.select_dtypes(include=['int']).head()

Unnamed: 0,设计宽度（mm）左
0,-3082
1,-4000
2,-3082
3,-3082
4,-4000


In [51]:
# 选取所有的数值列
df_report_left.select_dtypes(include=['number']).head()

Unnamed: 0,里程,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左
0,63316.4,-3082,-3101.0,-18.7
1,63316.4,-4000,-4003.0,-2.6
2,63316.4,-3082,-3043.0,38.9
3,63318.2,-3082,-3105.0,-23.2
4,63318.2,-4000,-3998.0,2.3


In [52]:
# 通过 filter() 函数过滤选取多列
df_report_left.filter(like='左').head()  # 从 df_report_left 这个 DataFrame 里筛选出列名包含 左 的列

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左
0,-3082,-3101.0,-18.7
1,-4000,-4003.0,-2.6
2,-3082,-3043.0,38.9
3,-3082,-3105.0,-23.2
4,-4000,-3998.0,2.3


In [53]:
# 通过正则表达式选取多列
df_report_left.filter(regex='m').head()  # 筛选出列名中包含m的列

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左
0,-3082,-3101.0,-18.7
1,-4000,-4003.0,-2.6
2,-3082,-3043.0,38.9
3,-3082,-3105.0,-23.2
4,-4000,-3998.0,2.3


In [54]:
# filter()函数，传递列表到参数items，选取多列
df_report_left.filter(items=['设计宽度（mm）左','实测宽度（mm）左']).head()

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左
0,-3082,-3101.0
1,-4000,-4003.0
2,-3082,-3043.0
3,-3082,-3105.0
4,-4000,-3998.0


## 对列名进行排序

In [55]:
import pandas as pd
import numpy as np

df_report = pd.read_csv('data.csv', encoding='gb2312')
df_report.head()

Unnamed: 0,里程,测点位置,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,director_name
0,63316.4,A1,-3082,-3101.0,-18.7,3082,3116,34.19,文件名称1
1,63316.4,A2,-4000,-4003.0,-2.6,4000,4035,35.03,文件名称2
2,63316.4,A3,-3082,-3043.0,38.9,3082,3092,10.5,文件名称3
3,63318.2,A1,-3082,-3105.0,-23.2,3082,3120,38.11,文件名称4
4,63318.2,A2,-4000,-3998.0,2.3,4000,4039,38.46,文件名称5


In [56]:
df_report.columns

Index(['里程', '测点位置', '设计宽度（mm）左', '实测宽度（mm）左', '差值（mm）左', '设计宽度（mm）右',
       '实测宽度（mm）右', '差值（mm）右', 'director_name'],
      dtype='object')

In [57]:
# 将列索引按照指定的顺序排列
disc_core = ['里程', '测点位置', 'director_name']
disc_left = ['设计宽度（mm）左', '实测宽度（mm）左', '差值（mm）左']
disc_right = ['设计宽度（mm）右', '实测宽度（mm）右', '差值（mm）右']

new_col_order = disc_core + disc_left + disc_right

In [58]:
set(df_report.columns) == set(new_col_order)

True

In [59]:
df_report2 = df_report[new_col_order]
df_report2.head()

Unnamed: 0,里程,测点位置,director_name,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,63316.4,A1,文件名称1,-3082,-3101.0,-18.7,3082,3116,34.19
1,63316.4,A2,文件名称2,-4000,-4003.0,-2.6,4000,4035,35.03
2,63316.4,A3,文件名称3,-3082,-3043.0,38.9,3082,3092,10.5
3,63318.2,A1,文件名称4,-3082,-3105.0,-23.2,3082,3120,38.11
4,63318.2,A2,文件名称5,-4000,-3998.0,2.3,4000,4039,38.46


## 在整个 dataframe 上操作

In [60]:
pd.options.display.max_rows = 8

In [61]:
# 打印行数和列数
df_report.shape

(21, 9)

In [62]:
# 打印数据的个数
df_report.size

189

In [63]:
# 打印数据集的维度
df_report.ndim


2

In [64]:
# 打印该数据集的长度
len(df_report)

21

In [65]:
# 打印各列值的个数
df_report.count()

里程               21
测点位置             21
设计宽度（mm）左        21
实测宽度（mm）左        20
                 ..
设计宽度（mm）右        21
实测宽度（mm）右        21
差值（mm）右          21
director_name    21
Length: 9, dtype: int64

In [66]:
# 打印各列的最小值
df_report.min()

里程               63316.4
测点位置                  A1
设计宽度（mm）左          -4000
实测宽度（mm）左        -4003.0
                  ...   
设计宽度（mm）右           3082
实测宽度（mm）右           3065
差值（mm）右           -16.58
director_name      文件名称1
Length: 9, dtype: object

In [67]:
# 打印描述信息
df_report.describe()

Unnamed: 0,里程,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
count,21.0,21.0,20.0,21.0,21.0,21.0,21.0
mean,63321.8,-3388.0,-3398.0,6.714286,3388.0,3410.666667,22.73381
std,3.688902,443.436128,451.179157,22.55372,443.436128,454.102999,21.304702
min,63316.4,-4000.0,-4003.0,-23.2,3082.0,3065.0,-16.58
25%,63318.2,-4000.0,-3993.5,-14.3,3082.0,3082.0,-0.01
50%,63321.8,-3082.0,-3101.0,3.6,3082.0,3120.0,34.19
75%,63325.4,-3082.0,-3052.5,30.8,4000.0,4035.0,38.46
max,63327.2,-3082.0,-3043.0,38.9,4000.0,4042.0,43.49


In [68]:
# 使用 percentiles() 方法计算分位数
df_report.describe(percentiles=[.01, .3, .99])

Unnamed: 0,里程,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
count,21.000000,21.000000,20.000000,21.000000,21.000000,21.000000,21.000000
mean,63321.800000,-3388.000000,-3398.000000,6.714286,3388.000000,3410.666667,22.733810
std,3.688902,443.436128,451.179157,22.553720,443.436128,454.102999,21.304702
min,63316.400000,-4000.000000,-4003.000000,-23.200000,3082.000000,3065.000000,-16.580000
...,...,...,...,...,...,...,...
30%,63320.000000,-4000.000000,-3991.600000,-14.200000,3082.000000,3092.000000,10.500000
50%,63321.800000,-3082.000000,-3101.000000,3.600000,3082.000000,3120.000000,34.190000
99%,63327.200000,-3082.000000,-3043.000000,38.860000,4000.000000,4041.400000,43.200000
max,63327.200000,-3082.000000,-3043.000000,38.900000,4000.000000,4042.000000,43.490000


In [69]:
# 打印各列空值的个数
df_report.isnull().sum()

里程               0
测点位置             0
设计宽度（mm）左        0
实测宽度（mm）左        1
                ..
设计宽度（mm）右        0
实测宽度（mm）右        0
差值（mm）右          0
director_name    0
Length: 9, dtype: int64

In [70]:
# 设定skipna参数为False，没有缺失值的数值列才会计算结果
df_report.min(skipna=False)

里程               63316.4
测点位置                  A1
设计宽度（mm）左          -4000
实测宽度（mm）左            NaN
                  ...   
设计宽度（mm）右           3082
实测宽度（mm）右           3065
差值（mm）右           -16.58
director_name      文件名称1
Length: 9, dtype: object

## 串联 dataframe 方法

In [71]:
# 使用 isnull 方法将每个值转变为布尔值
df_report.isnull()

Unnamed: 0,里程,测点位置,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,director_name
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
17,False,False,False,False,False,False,False,False,False
18,False,False,False,False,False,False,False,False,False
19,False,False,False,False,False,False,False,False,False
20,False,False,False,False,False,False,False,False,False


In [72]:
# 使用 sum 统计布尔值，返回的是 series
df_report.isnull().sum().head()

里程           0
测点位置         0
设计宽度（mm）左    0
实测宽度（mm）左    1
差值（mm）左      0
dtype: int64

In [73]:
# 对这个 series 在使用 sum ，返回整个 dataframe 的缺失值个数，返回值是个标量
df_report.isnull().sum().sum()

np.int64(1)

In [74]:
# 判断整个 dataframe 有没有缺失值，方法是连着使用两个 any
df_report.isnull().any().any()

np.True_

In [75]:
# isnull 返回同样大小的 dataframe ，但所有的值变为布尔值
df_report.isnull().dtypes.value_counts()

bool    9
Name: count, dtype: int64

In [76]:
# df_report 数据集的对象数据包含缺失值。默认条件下，聚合方法 min、max、sum，默认会跳过缺失值
df_report[['实测宽度（mm）左']].sum()

实测宽度（mm）左   -67960.0
dtype: float64

In [77]:
df_report[['实测宽度（mm）左']].max()

实测宽度（mm）左   -3043.0
dtype: float64

In [78]:
df_report.select_dtypes(['float64'])

Unnamed: 0,里程,实测宽度（mm）左,差值（mm）左,差值（mm）右
0,63316.4,-3101.0,-18.7,34.19
1,63316.4,-4003.0,-2.6,35.03
2,63316.4,-3043.0,38.9,10.50
3,63318.2,-3105.0,-23.2,38.11
...,...,...,...,...
17,63325.4,-3053.0,28.9,-3.95
18,63327.2,-3100.0,-18.6,39.68
19,63327.2,-3995.0,5.0,39.42
20,63327.2,-3051.0,30.8,-6.33


In [79]:
df_report.select_dtypes(['float64']).fillna(np.nan).max()

里程           63327.20
实测宽度（mm）左    -3043.00
差值（mm）左         38.90
差值（mm）右         43.49
dtype: float64

In [80]:
df_report['director_name'][5] = None
df_report['实测宽度（mm）左'][10] = None

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_report['director_name'][5] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_report['director_name']

In [81]:
# 忽略上面 warning，可用下面这种赋值方法
'''
loc：基于标签（label）进行索引，这里的标签可以是行和列的名称。
iloc：基于整数位置（integer position）进行索引，索引从 0 开始计数。
'''

df_report.loc[5, 'director_name'] = None
df_report.loc[10, '实测宽度（mm）左'] = None



In [82]:
col_index1 = df_report.columns.get_loc('director_name')  # col_index1 = 8
col_index2 = df_report.columns.get_loc('实测宽度（mm）左')  # col_index2 = 3

df_report.iloc[5, col_index1] = None
df_report.iloc[10, col_index2] = None

In [83]:
col_index2

3

In [84]:
df_report.select_dtypes(['object']).max()

TypeError: '>=' not supported between instances of 'str' and 'float'

In [85]:
# 要让pandas强行返回每列的值，必须填入缺失值
df_report.select_dtypes(['object']).fillna('').max()

测点位置                A3
director_name    文件名称9
dtype: object

In [86]:
df_report.select_dtypes(['float64']).max()

里程           63327.20
实测宽度（mm）左    -3043.00
差值（mm）左         38.90
差值（mm）右         43.49
dtype: float64

In [87]:
df_report.select_dtypes(['float64']).fillna(np.nan).max()

里程           63327.20
实测宽度（mm）左    -3043.00
差值（mm）左         38.90
差值（mm）右         43.49
dtype: float64

## 在 dataframe 上使用运算符

In [88]:
# 数据集既有数字也有对象，不能直接相加
df_report + 5

TypeError: can only concatenate str (not "int") to str

In [89]:
df_report_num = df_report.filter(like='mm')
df_report_num

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-3082,-3101.0,-18.7,3082,3116,34.19
1,-4000,-4003.0,-2.6,4000,4035,35.03
2,-3082,-3043.0,38.9,3082,3092,10.50
3,-3082,-3105.0,-23.2,3082,3120,38.11
...,...,...,...,...,...,...
17,-3082,-3053.0,28.9,3082,3078,-3.95
18,-3082,-3100.0,-18.6,3082,3122,39.68
19,-4000,-3995.0,5.0,4000,4039,39.42
20,-3082,-3051.0,30.8,3082,3076,-6.33


In [90]:
# 筛选只有数字，可以相加
df_report_num + 5

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-3077,-3096.0,-13.7,3087,3121,39.19
1,-3995,-3998.0,2.4,4005,4040,40.03
2,-3077,-3038.0,43.9,3087,3097,15.50
3,-3077,-3100.0,-18.2,3087,3125,43.11
...,...,...,...,...,...,...
17,-3077,-3048.0,33.9,3087,3083,1.05
18,-3077,-3095.0,-13.6,3087,3127,44.68
19,-3995,-3990.0,10.0,4005,4044,44.42
20,-3077,-3046.0,35.8,3087,3081,-1.33


In [91]:
# 用底除计算百分比分数
df_report_test = df_report_num.head() / 100000
df_report_test = df_report_test // 0.01 / 100
df_report_test

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-0.04,-0.04,-0.01,0.03,0.03,0.0
1,-0.04,-0.05,-0.01,0.04,0.04,0.0
2,-0.04,-0.04,0.0,0.03,0.03,0.0
3,-0.04,-0.04,-0.01,0.03,0.03,0.0
4,-0.04,-0.04,0.0,0.04,0.04,0.0


In [92]:
# 保留两位小数
df_report_test_round = (df_report_test + 0.0001).round(2)
df_report_test_round

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-0.04,-0.04,-0.01,0.03,0.03,0.0
1,-0.04,-0.05,-0.01,0.04,0.04,0.0
2,-0.04,-0.04,0.0,0.03,0.03,0.0
3,-0.04,-0.04,-0.01,0.03,0.03,0.0
4,-0.04,-0.04,0.0,0.04,0.04,0.0


In [93]:
.045 + .005


0.049999999999999996

In [94]:
# 比较两个 DataFrame 对象是否完全相等
df_report_test.equals(df_report_test_round)

True

In [95]:
# dataframe 的通用函数也可以实现上述方法

df_report_num.add(5).div(100000).floordiv(.01).div(100)

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-0.04,-0.04,-0.01,0.03,0.03,0.00
1,-0.04,-0.04,0.00,0.04,0.04,0.00
2,-0.04,-0.04,0.00,0.03,0.03,0.00
3,-0.04,-0.04,-0.01,0.03,0.03,0.00
...,...,...,...,...,...,...
17,-0.04,-0.04,0.00,0.03,0.03,0.00
18,-0.04,-0.04,-0.01,0.03,0.03,0.00
19,-0.04,-0.04,0.00,0.04,0.04,0.00
20,-0.04,-0.04,0.00,0.03,0.03,-0.01


## 比较缺失值


In [96]:
import numpy as np
import pandas as pd

# pandas 用 np.nan 表示缺失值，这是一个不等于自身的特殊对象
np.nan == np.nan

False

In [97]:
# python 的 none 对象是等于自身的
None == None

True

In [98]:
# 所有和 哪np.nan 的比较都返回 False，除了不等于
5 > np.nan

False

In [99]:
np.nan > 5

False

In [100]:
5 != np.nan

True

In [101]:
df_report_num = df_report.filter(like='mm')
df_report_num.head()

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-3082,-3101.0,-18.7,3082,3116,34.19
1,-4000,-4003.0,-2.6,4000,4035,35.03
2,-3082,-3043.0,38.9,3082,3092,10.5
3,-3082,-3105.0,-23.2,3082,3120,38.11
4,-4000,-3998.0,2.3,4000,4039,38.46


In [102]:
# 两个值比较，返回布尔值 dataframe
df_report_num.head() == -18.7

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,False,False,True,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [103]:
# dataframe 和 dataframe 比较
df_report_cpmpare = df_report_num == df_report_num
df_report_cpmpare.head()

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,True,True,True,True,True,True
1,True,True,True,True,True,True
2,True,True,True,True,True,True
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [104]:
# 用 all 检查是否所有值都是 true，这是因为缺失值不互相等于
df_report_cpmpare.all()

设计宽度（mm）左     True
实测宽度（mm）左    False
差值（mm）左       True
设计宽度（mm）右     True
实测宽度（mm）右     True
差值（mm）右       True
dtype: bool

In [105]:
df_report_num['实测宽度（mm）左']

0    -3101.0
1    -4003.0
2    -3043.0
3    -3105.0
       ...  
17   -3053.0
18   -3100.0
19   -3995.0
20   -3051.0
Name: 实测宽度（mm）左, Length: 21, dtype: float64

In [106]:
# 不可以用 == 判断然后求和
(df_report_num == np.nan).sum()

设计宽度（mm）左    0
实测宽度（mm）左    0
差值（mm）左      0
设计宽度（mm）右    0
实测宽度（mm）右    0
差值（mm）右      0
dtype: int64

In [107]:
# 统计缺失值的最好方法是使用 isnull() 
df_report_num.isnull().sum()

设计宽度（mm）左    0
实测宽度（mm）左    2
差值（mm）左      0
设计宽度（mm）右    0
实测宽度（mm）右    0
差值（mm）右      0
dtype: int64

In [108]:
# 比较两个 dataframe 最直接的方法是 equals() 方法
from pandas.testing import assert_frame_equal
assert_frame_equal(df_report_num, df_report_num) # 相等不会返回任何值，不等会抛出 AssertionError 异常

In [109]:
assert_frame_equal(df_report_num, df_report)

AssertionError: DataFrame are different

DataFrame shape mismatch
[left]:  (21, 6)
[right]: (21, 9)

In [110]:
# eq() 方法类似于 == 和前面 equals() 不同
df_report_num.eq(-18.7).head()

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,False,False,True,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False


## 矩阵的转置

In [111]:
transposed_df_report_num  = df_report_num.T
transposed_df_report_num


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
设计宽度（mm）左,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0
实测宽度（mm）左,-3101.0,-4003.0,-3043.0,-3105.0,-3998.0,-3047.0,-3101.0,-3993.0,-3043.0,-3096.0,,,-3096.0,-3996.0,-3047.0,-3101.0,-4000.0,-3053.0,-3100.0,-3995.0,-3051.0
差值（mm）左,-18.7,-2.6,38.9,-23.2,2.3,34.7,-19.0,6.8,38.7,-14.3,9.2,37.4,-14.2,3.6,34.4,-18.9,-0.2,28.9,-18.6,5.0,30.8
设计宽度（mm）右,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0
实测宽度（mm）右,3116.0,4035.0,3092.0,3120.0,4039.0,3066.0,3120.0,4029.0,3076.0,3114.0,4035.0,3065.0,3114.0,4039.0,3082.0,3125.0,4042.0,3078.0,3122.0,4039.0,3076.0
差值（mm）右,34.19,35.03,10.5,38.11,38.46,-15.72,38.09,29.21,-6.12,32.0,35.28,-16.58,31.8,38.82,-0.01,43.49,42.04,-3.95,39.68,39.42,-6.33


In [112]:
transposed_df_report_num  = df_report_num.transpose()
transposed_df_report_num

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
设计宽度（mm）左,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0,-3082.0,-4000.0,-3082.0
实测宽度（mm）左,-3101.0,-4003.0,-3043.0,-3105.0,-3998.0,-3047.0,-3101.0,-3993.0,-3043.0,-3096.0,,,-3096.0,-3996.0,-3047.0,-3101.0,-4000.0,-3053.0,-3100.0,-3995.0,-3051.0
差值（mm）左,-18.7,-2.6,38.9,-23.2,2.3,34.7,-19.0,6.8,38.7,-14.3,9.2,37.4,-14.2,3.6,34.4,-18.9,-0.2,28.9,-18.6,5.0,30.8
设计宽度（mm）右,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0,3082.0,4000.0,3082.0
实测宽度（mm）右,3116.0,4035.0,3092.0,3120.0,4039.0,3066.0,3120.0,4029.0,3076.0,3114.0,4035.0,3065.0,3114.0,4039.0,3082.0,3125.0,4042.0,3078.0,3122.0,4039.0,3076.0
差值（mm）右,34.19,35.03,10.5,38.11,38.46,-15.72,38.09,29.21,-6.12,32.0,35.28,-16.58,31.8,38.82,-0.01,43.49,42.04,-3.95,39.68,39.42,-6.33


In [113]:
# 返回非缺失值的个数
df_report_num.count()

设计宽度（mm）左    21
实测宽度（mm）左    19
差值（mm）左      21
设计宽度（mm）右    21
实测宽度（mm）右    21
差值（mm）右      21
dtype: int64

In [114]:
# axis 默认设置为0
'''
axis=0 或 axis='index'：表示按列统计，即对每列中的非缺失值进行计数。这是 count 方法的默认值。
axis=1 或 axis='columns'：表示按行统计，即对每行中的非缺失值进行计数。
'''
df_report_num.count(axis=0)

设计宽度（mm）左    21
实测宽度（mm）左    19
差值（mm）左      21
设计宽度（mm）右    21
实测宽度（mm）右    21
差值（mm）右      21
dtype: int64

In [115]:
df_report_num.count(axis='columns').head()

0    6
1    6
2    6
3    6
4    6
dtype: int64

In [116]:
# 除了统计每行的非缺失值个数，也可以求和
df_report_num.sum(axis='columns').head()

0    30.49
1    64.43
2    98.40
3    29.91
4    81.76
dtype: float64

In [117]:
# 查看每列的中位数
df_report_num.median(axis='index')

设计宽度（mm）左   -3082.00
实测宽度（mm）左   -3101.00
差值（mm）左         3.60
设计宽度（mm）右    3082.00
实测宽度（mm）右    3120.00
差值（mm）右        34.19
dtype: float64

In [118]:
# 使用 cumsum() 累计求和
df_report_num.cumsum(axis='index').head()

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
0,-3082,-3101.0,-18.7,3082,3116,34.19
1,-7082,-7104.0,-21.3,7082,7151,69.22
2,-10164,-10147.0,17.6,10164,10243,79.72
3,-13246,-13252.0,-5.6,13246,13363,117.83
4,-17246,-17250.0,-3.3,17246,17402,156.29


In [119]:
# 降序排列
df_report_num.sort_values(by='差值（mm）左', ascending=False).head()  # False 降序 True 升序

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右
2,-3082,-3043.0,38.9,3082,3092,10.5
8,-3082,-3043.0,38.7,3082,3076,-6.12
11,-3082,,37.4,3082,3065,-16.58
5,-3082,-3047.0,34.7,3082,3066,-15.72
14,-3082,-3047.0,34.4,3082,3082,-0.01


In [120]:
df_report_num['new_col'] = np.nan

new_index = len(df_report_num)
df_report_num.loc[new_index] = np.nan

df_report_num

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_report_num['new_col'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_report_num.loc[new_index] = np.nan


Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,new_col
0,-3082.0,-3101.0,-18.7,3082.0,3116.0,34.19,
1,-4000.0,-4003.0,-2.6,4000.0,4035.0,35.03,
2,-3082.0,-3043.0,38.9,3082.0,3092.0,10.50,
3,-3082.0,-3105.0,-23.2,3082.0,3120.0,38.11,
...,...,...,...,...,...,...,...
18,-3082.0,-3100.0,-18.6,3082.0,3122.0,39.68,
19,-4000.0,-3995.0,5.0,4000.0,4039.0,39.42,
20,-3082.0,-3051.0,30.8,3082.0,3076.0,-6.33,
21,,,,,,,


In [121]:
# 如果所有行都是缺失值，则将其去除
df_report_num.dropna(how='all', inplace=True)  # inplace=True 表示直接在原 DataFrame df_report_num 上进行修改
df_report_num

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_report_num.dropna(how='all', inplace=True)  # inplace=True 表示直接在原 DataFrame df_report_num 上进行修改


Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,new_col
0,-3082.0,-3101.0,-18.7,3082.0,3116.0,34.19,
1,-4000.0,-4003.0,-2.6,4000.0,4035.0,35.03,
2,-3082.0,-3043.0,38.9,3082.0,3092.0,10.50,
3,-3082.0,-3105.0,-23.2,3082.0,3120.0,38.11,
...,...,...,...,...,...,...,...
17,-3082.0,-3053.0,28.9,3082.0,3078.0,-3.95,
18,-3082.0,-3100.0,-18.6,3082.0,3122.0,39.68,
19,-4000.0,-3995.0,5.0,4000.0,4039.0,39.42,
20,-3082.0,-3051.0,30.8,3082.0,3076.0,-6.33,


In [122]:
df_report_num.isnull().sum()

设计宽度（mm）左     0
实测宽度（mm）左     2
差值（mm）左       0
设计宽度（mm）右     0
实测宽度（mm）右     0
差值（mm）右       0
new_col      21
dtype: int64

In [123]:
# 用大于或者等于的方法ge(), 将dataframe 变为 bool 矩阵
df_report_num.ge(38.7).head()  # 找到大于等于38.7的数据

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,new_col
0,False,False,False,True,True,False,False
1,False,False,False,True,True,False,False
2,False,False,True,True,True,False,False
3,False,False,False,True,True,False,False
4,False,False,False,True,True,False,False


In [124]:
# 对所有 true 值求和
df_report_num_387 = df_report_num.ge(38.7).sum(axis='columns')
df_report_num_387.head()

0    2
1    2
2    3
3    2
4    2
dtype: int64

In [125]:
df_report_num_387.value_counts()

2    14
3     7
Name: count, dtype: int64

In [126]:
df_report_num

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,new_col
0,-3082.0,-3101.0,-18.7,3082.0,3116.0,34.19,
1,-4000.0,-4003.0,-2.6,4000.0,4035.0,35.03,
2,-3082.0,-3043.0,38.9,3082.0,3092.0,10.50,
3,-3082.0,-3105.0,-23.2,3082.0,3120.0,38.11,
...,...,...,...,...,...,...,...
17,-3082.0,-3053.0,28.9,3082.0,3078.0,-3.95,
18,-3082.0,-3100.0,-18.6,3082.0,3122.0,39.68,
19,-4000.0,-3995.0,5.0,4000.0,4039.0,39.42,
20,-3082.0,-3051.0,30.8,3082.0,3076.0,-6.33,


In [127]:
# 使用 value_counts() 查看分布情况
df_report_num.value_counts()  # 若 DataFrame 里所有行都包含 NaN 值，value_counts() 就会返回空值

Series([], Name: count, dtype: int64)

In [128]:
df_report_num.value_counts(dropna=False)  # # 使用 value_counts 方法统计每行出现的频次，包含 NaN 行

设计宽度（mm）左  实测宽度（mm）左  差值（mm）左  设计宽度（mm）右  实测宽度（mm）右  差值（mm）右  new_col
-4000.0    -4003.0    -2.6     4000.0     4035.0      35.03   NaN        1
           -4000.0    -0.2     4000.0     4042.0      42.04   NaN        1
           -3998.0     2.3     4000.0     4039.0      38.46   NaN        1
           -3996.0     3.6     4000.0     4039.0      38.82   NaN        1
                                                                        ..
-3082.0    -3047.0     34.7    3082.0     3066.0     -15.72   NaN        1
           -3043.0     38.7    3082.0     3076.0     -6.12    NaN        1
                       38.9    3082.0     3092.0      10.50   NaN        1
            NaN        37.4    3082.0     3065.0     -16.58   NaN        1
Name: count, Length: 21, dtype: int64

In [129]:
df_report_num_387.sort_values(ascending=False).head(8)

2     3
19    3
15    3
16    3
8     3
18    3
13    3
1     2
dtype: int64

In [130]:
df_report_num.loc[[2, 19, 15, 16]]

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,new_col
2,-3082.0,-3043.0,38.9,3082.0,3092.0,10.5,
19,-4000.0,-3995.0,5.0,4000.0,4039.0,39.42,
15,-3082.0,-3101.0,-18.9,3082.0,3125.0,43.49,
16,-4000.0,-4000.0,-0.2,4000.0,4042.0,42.04,


In [131]:
df_report_num

Unnamed: 0,设计宽度（mm）左,实测宽度（mm）左,差值（mm）左,设计宽度（mm）右,实测宽度（mm）右,差值（mm）右,new_col
0,-3082.0,-3101.0,-18.7,3082.0,3116.0,34.19,
1,-4000.0,-4003.0,-2.6,4000.0,4035.0,35.03,
2,-3082.0,-3043.0,38.9,3082.0,3092.0,10.50,
3,-3082.0,-3105.0,-23.2,3082.0,3120.0,38.11,
...,...,...,...,...,...,...,...
17,-3082.0,-3053.0,28.9,3082.0,3078.0,-3.95,
18,-3082.0,-3100.0,-18.6,3082.0,3122.0,39.68,
19,-4000.0,-3995.0,5.0,4000.0,4039.0,39.42,
20,-3082.0,-3051.0,30.8,3082.0,3076.0,-6.33,


In [132]:
df_report_num.max(axis=1).sort_values(ascending=False).head()

16    4042.0
13    4039.0
19    4039.0
4     4039.0
1     4035.0
dtype: float64

In [133]:
df_report_num.loc[16]


设计宽度（mm）左   -4000.00
实测宽度（mm）左   -4000.00
差值（mm）左        -0.20
设计宽度（mm）右    4000.00
实测宽度（mm）右    4042.00
差值（mm）右        42.04
new_col          NaN
Name: 16, dtype: float64