# PySpark 缺失与排序

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

## 缺失

### 缺失查找

- `isnan`：筛选出值为 NaN（Not a Number，非数字）的数据
- `isnull`：筛选出值为空（null）的数据

In [2]:
# Reuse the active SparkSession when the kernel already provides one; otherwise
# create it, so the notebook also runs from a fresh plain-Python kernel.
spark = SparkSession.builder.getOrCreate()

# Sample data: one row is missing `name`, another is missing `age`.
df = spark.createDataFrame(
    [
        ('Alice', 5, 80),
        (None, 6, 76),
        ('Bob', None, 60),
    ],
    schema=['name', 'age', 'height'],
)
df.show()

+-----+----+------+
| name| age|height|
+-----+----+------+
|Alice|   5|    80|
| null|   6|    76|
|  Bob|null|    60|
+-----+----+------+



### 删除含有缺失值的行

In [3]:
# Drop every row that contains a null in any column; `thresh` and `subset`
# are left unset, so all columns are considered.
df.na.drop(
    how='any',
    thresh=None,
    subset=None,
).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
+-----+---+------+



### 将缺失值填充成特定的值

In [4]:
# Fill nulls with the integer 1. Only columns whose type matches the fill value
# are affected, so the string column `name` keeps its null (see the output).
df.na.fill(1).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
| null|  6|    76|
|  Bob|  1|    60|
+-----+---+------+



### 按字段填充成特定值

In [5]:
# Per-column fill values: the dict maps column name -> replacement for nulls.
# Columns not listed (here `height`) are left untouched.
df.fillna({'name': 'unknown', 'age': 50}).show()

+-------+---+------+
|   name|age|height|
+-------+---+------+
|  Alice|  5|    80|
|unknown|  6|    76|
|    Bob| 50|    60|
+-------+---+------+



### 统计缺失值

In [6]:
# Count the nulls in each column.
# `when` is given the column *name* as a plain string, so it yields that
# (non-null) literal on rows where the column is null and null elsewhere;
# `count` skips nulls and therefore tallies exactly the null rows.
df_agg = df.agg(*(
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
))
df_agg.show()

+----+---+------+
|name|age|height|
+----+---+------+
|   1|  1|     0|
+----+---+------+



### 统计缺失率

In [7]:
# Missing rate per column = 1 - (non-null count / total row count).
# `count(column)` ignores nulls, while `count('*')` counts every row.
df.agg(*(
    (F.lit(1) - F.count(name) / F.count('*')).alias(name + 'missing')
    for name in df.columns
)).show()

+-------------------+-------------------+-------------+
|        namemissing|         agemissing|heightmissing|
+-------------------+-------------------+-------------+
|0.33333333333333337|0.33333333333333337|          0.0|
+-------------------+-------------------+-------------+



## 排序

### 升序排列

In [8]:
# Ascending sort on `name` (`sort` is an alias of `orderBy`);
# as the output shows, the null name is placed first.
df.sort("name", ascending=True).show()

+-----+----+------+
| name| age|height|
+-----+----+------+
| null|   6|    76|
|Alice|   5|    80|
|  Bob|null|    60|
+-----+----+------+



### `age`降序，`name`升序排列

In [9]:
# Multi-column sort: one `ascending` flag per column, in the same order.
# Here `age` is sorted descending (False) and `name` ascending (True).
df.sort(
    'age', 'name',
    ascending=[False, True],
).show()

+-----+----+------+
| name| age|height|
+-----+----+------+
| null|   6|    76|
|Alice|   5|    80|
|  Bob|null|    60|
+-----+----+------+



In [10]:
# Sort using a Column expression: ascending by `age`
# (the null age comes first in the output).
df.sort(df.age.asc()).show()

+-----+----+------+
| name| age|height|
+-----+----+------+
|  Bob|null|    60|
|Alice|   5|    80|
| null|   6|    76|
+-----+----+------+



In [11]:
# Sort helpers from pyspark.sql.functions, each applied to `age`
# (`sort` is an alias of `orderBy`).
df.sort(F.asc('age')).show()               # ascending (nulls come first in the output)
df.sort(F.desc('age')).show()              # descending (nulls come last in the output)
df.sort(F.asc_nulls_last('age')).show()    # ascending, nulls placed last
df.sort(F.desc_nulls_last('age')).show()   # descending, nulls placed last
df.sort(F.asc_nulls_first('age')).show()   # ascending, nulls placed first
df.sort(F.desc_nulls_first('age')).show()  # descending, nulls placed first

+-----+----+------+
| name| age|height|
+-----+----+------+
|  Bob|null|    60|
|Alice|   5|    80|
| null|   6|    76|
+-----+----+------+

+-----+----+------+
| name| age|height|
+-----+----+------+
| null|   6|    76|
|Alice|   5|    80|
|  Bob|null|    60|
+-----+----+------+

+-----+----+------+
| name| age|height|
+-----+----+------+
|Alice|   5|    80|
| null|   6|    76|
|  Bob|null|    60|
+-----+----+------+

+-----+----+------+
| name| age|height|
+-----+----+------+
| null|   6|    76|
|Alice|   5|    80|
|  Bob|null|    60|
+-----+----+------+

+-----+----+------+
| name| age|height|
+-----+----+------+
|  Bob|null|    60|
|Alice|   5|    80|
| null|   6|    76|
+-----+----+------+

+-----+----+------+
| name| age|height|
+-----+----+------+
|  Bob|null|    60|
| null|   6|    76|
|Alice|   5|    80|
+-----+----+------+



----